In [1]:
import pandas as pd 
import functools
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np 

In [2]:
# impute with zero
def simple_imputation(input_function):
    """Decorator that replaces every missing value in the wrapped function's result with 0.

    The wrapped function is expected to return a pandas DataFrame. The
    per-column count of missing values is printed before and after the
    imputation so the effect is visible in the cell output.

    The frame produced by ``input_function`` is NOT modified: a new,
    zero-filled frame is returned instead, so a frame that the wrapped
    function shares or caches keeps its original missing values.

    Parameters
    ----------
    input_function : callable
        Function returning a pandas DataFrame.

    Returns
    -------
    callable
        Wrapper with the same signature that returns the zero-filled frame.
    """
    @functools.wraps(input_function)
    def simple_imputation_wrapper(*args, **kwargs):
        return_value = input_function(*args, **kwargs)
        print("--------------Before Imputation--------------")
        print(return_value.isnull().sum(axis = 0))
        # Non-inplace fillna returns a new frame, leaving the source untouched.
        imputed = return_value.fillna(0)
        print("--------------After Imputation--------------")
        print(imputed.isnull().sum(axis = 0))
        return imputed
    return simple_imputation_wrapper

In [3]:
@simple_imputation
def read_data():
    """Load the semicolon-delimited ``wines_data.csv`` file as a DataFrame."""
    return pd.read_csv("wines_data.csv", sep=";")

In [4]:
 # Run the decorated loader; the before/after missing-value counts are printed
 # and, as the cell's last expression, the returned frame is displayed.
 read_data()

--------------Before Imputation--------------
country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64
--------------After Imputation--------------
country             0
designation         0
points              0
price               0
province            0
region_1            0
region_2            0
variety             0
winery              0
last_year_points    0
dtype: int64


Unnamed: 0,country,designation,points,price,province,region_1,region_2,variety,winery,last_year_points
0,US,Martha's Vineyard,96.0,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,94
1,Spain,Carodorum Selección Especial Reserva,96.0,110.0,Northern Spain,Toro,0,Tinta de Toro,Bodega Carmen Rodríguez,92
2,US,Special Selected Late Harvest,96.0,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,100
3,US,Reserve,96.0,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,94
4,France,La Brûlade,95.0,66.0,Provence,Bandol,0,Provence red blend,Domaine de la Bégude,94
...,...,...,...,...,...,...,...,...,...,...
144032,Italy,0,91.0,20.0,Southern Italy,Fiano di Avellino,0,White Blend,Feudi di San Gregorio,84
144033,France,Cuvée Prestige,91.0,27.0,Champagne,Champagne,0,Champagne Blend,H.Germain,83
144034,Italy,Terre di Dora,91.0,20.0,Southern Italy,Fiano di Avellino,0,White Blend,Terredora,97
144035,France,Grand Brut Rosé,90.0,52.0,Champagne,Champagne,0,Champagne Blend,Gosset,89


In [5]:
# Keep the zero-filled frame and show how many missing values remain per column.
df = read_data()
df.isna().sum()

--------------Before Imputation--------------
country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64
--------------After Imputation--------------
country             0
designation         0
points              0
price               0
province            0
region_1            0
region_2            0
variety             0
winery              0
last_year_points    0
dtype: int64


country             0
designation         0
points              0
price               0
province            0
region_1            0
region_2            0
variety             0
winery              0
last_year_points    0
dtype: int64

In [6]:
def meanmode_imputation(input_function):
    """Decorator that imputes missing values with the column mean or mode.

    The wrapped function is expected to return a pandas DataFrame. For each
    column of the result:

    * floating-point columns (any float width) are filled with the column mean;
    * ``category`` columns are filled with the most frequent category;
    * columns of any other dtype are left unchanged.

    A categorical column that contains no observed value has no mode and is
    left as-is instead of raising. The per-column count of missing values is
    printed before and after the imputation.

    The frame produced by ``input_function`` is not modified; the imputation
    is applied to a copy, which is returned.

    Parameters
    ----------
    input_function : callable
        Function returning a pandas DataFrame.

    Returns
    -------
    callable
        Wrapper with the same signature that returns the imputed frame.
    """
    @functools.wraps(input_function)
    def meanmode_imputation_wrapper(*args, **kwargs):
        return_value = input_function(*args, **kwargs)
        print("--------------Before Mean/Mode Imputation--------------")
        print(return_value.isnull().sum(axis = 0))
        imputed = return_value.copy()
        for col in imputed.columns:
            column = imputed[col]
            # Assign the filled column back explicitly: calling
            # ``frame[col].fillna(..., inplace=True)`` operates on a temporary
            # and does not update the frame under pandas Copy-on-Write.
            if pd.api.types.is_float_dtype(column.dtype):
                imputed[col] = column.fillna(column.mean())
            elif column.dtype.name == 'category':
                modes = column.mode()
                # mode() is empty when every value is missing; nothing to fill with.
                if not modes.empty:
                    imputed[col] = column.fillna(modes.iloc[0])
        print("--------------After Mean/Mode Imputation--------------")
        print(imputed.isnull().sum(axis = 0))
        return imputed
    return meanmode_imputation_wrapper

In [7]:
@meanmode_imputation
def read_data(data_type_dict):
    """Load ``wines_data.csv`` and cast every column to its requested dtype.

    ``data_type_dict`` maps each column name of the file to a dtype; a
    column missing from the mapping raises ``KeyError``.
    """
    wines = pd.read_csv("wines_data.csv", sep=";")
    return wines.astype({name: data_type_dict[name] for name in wines.columns})

In [8]:
 data_type_dict = {'country':'category', 'designation':'category',
'points':'float', 'price':'float', 'province':'category', 'region_1':'category',
 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}
df = read_data(data_type_dict)

--------------Before Mean/Mode Imputation--------------
country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64
--------------After Mean/Mode Imputation--------------
country             0
designation         0
points              0
price               0
province            0
region_1            0
region_2            0
variety             0
winery              0
last_year_points    0
dtype: int64


In [9]:
def iterative_imputation(input_function):
    """Decorator that imputes numeric columns iteratively and the rest by mode.

    The wrapped function is expected to return a pandas DataFrame containing
    the columns ``price``, ``points`` and ``last_year_points``. Those three
    columns are imputed together with scikit-learn's ``IterativeImputer``
    (``max_iter=10``, ``random_state=0``) and rounded to the nearest integer
    value. Every other column is filled with its most frequent value; a
    column with no observed value has no mode and is left as-is.

    The returned frame keeps the index of the input frame and lists the
    non-numeric columns first, followed by ``price``, ``points`` and
    ``last_year_points``. The per-column count of missing values is printed
    before and after the imputation.

    Parameters
    ----------
    input_function : callable
        Function returning a pandas DataFrame.

    Returns
    -------
    callable
        Wrapper with the same signature that returns the imputed frame.
    """
    numeric_columns = ['price', 'points', 'last_year_points']

    @functools.wraps(input_function)
    def iterative_imputation_wrapper(*args, **kwargs):
        return_value = input_function(*args, **kwargs)
        print("--------------Before Bayesian Ridge Regression Imputation--------------")
        print(return_value.isnull().sum(axis = 0))
        return_num = return_value[numeric_columns]
        return_cat = return_value.drop(columns=numeric_columns)

        imp_bayesian = IterativeImputer(max_iter=10, random_state=0)
        # Reuse the input index: with a default RangeIndex the axis=1 concat
        # below would misalign rows (and introduce NaNs) for any frame whose
        # index is not exactly 0..n-1.
        return_num = pd.DataFrame(
            np.round(imp_bayesian.fit_transform(np.array(return_num))),
            columns=numeric_columns,
            index=return_value.index,
        )

        # Build filled columns explicitly instead of a chained in-place
        # fillna, which does not update the frame under pandas Copy-on-Write.
        filled_parts = []
        for col in return_cat.columns:
            column = return_cat[col]
            modes = column.mode()
            # mode() is empty when every value is missing; nothing to fill with.
            filled_parts.append(column if modes.empty else column.fillna(modes.iloc[0]))

        return_value = pd.concat(filled_parts + [return_num], axis=1)
        print("--------------After Bayesian Ridge Regression Imputation--------------")
        print(return_value.isnull().sum(axis = 0))
        return return_value
    return iterative_imputation_wrapper

In [10]:
@iterative_imputation
def read_data(data_type_dict):
    """Read ``wines_data.csv`` and convert each column to the dtype given for it.

    Every column present in the file must have an entry in
    ``data_type_dict``; otherwise a ``KeyError`` is raised.
    """
    raw = pd.read_csv("wines_data.csv", sep=";")
    dtype_by_column = {column: data_type_dict[column] for column in raw.columns}
    return raw.astype(dtype_by_column)

In [11]:
 data_type_dict = {'country':'category', 'designation':'category',
'points':'float', 'price':'float', 'province':'category', 'region_1':'category',
 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}
df = read_data(data_type_dict)

--------------Before Bayesian Ridge Regression Imputation--------------
country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64
--------------After Bayesian Ridge Regression Imputation--------------
country             0
designation         0
province            0
region_1            0
region_2            0
variety             0
winery              0
price               0
points              0
last_year_points    0
dtype: int64


In [12]:
 data_type_dict = {'country':'category', 'designation':'category',
'points':'float', 'price':'float', 'province':'category', 'region_1':'category',
 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}
df_original = pd.read_csv("wines_data.csv", sep = ";")
df_imp = read_data(data_type_dict)


--------------Before Bayesian Ridge Regression Imputation--------------
country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64
--------------After Bayesian Ridge Regression Imputation--------------
country             0
designation         0
province            0
region_1            0
region_2            0
variety             0
winery              0
price               0
points              0
last_year_points    0
dtype: int64


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e827d9a5-6b96-4401-8c9c-6ec5b955d54f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>