In [28]:
from numpy import isnan, mean, std
from pandas import read_csv
from sklearn.impute import SimpleImputer

In [19]:
# Read the horse-colic data and report the percentage of missing values in
# each column. The file has no header row (so columns are labelled 0..n-1)
# and uses '?' to mark a missing entry.
dataframe = read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv', header=None, na_values='?')

n_rows = dataframe.shape[0]
for i in range(dataframe.shape[1]):
    # Select the column as a Series (dataframe[i], not dataframe[[i]]) so the
    # null count is a plain scalar. The one-column DataFrame form produced a
    # one-element Series, and calling float() on such a Series is deprecated
    # in recent pandas releases.
    n_miss = dataframe[i].isnull().sum()
    perc = n_miss / n_rows * 100
    # NOTE(review): the label says "Row", but i indexes a *column*; the text is
    # left unchanged so the printed report keeps its existing format.
    print(f'Row {i}: {float(round(perc, 2))}%')
    

Row 0: 0.33%
Row 1: 0.0%
Row 2: 0.0%
Row 3: 20.0%
Row 4: 8.0%
Row 5: 19.33%
Row 6: 18.67%
Row 7: 23.0%
Row 8: 15.67%
Row 9: 10.67%
Row 10: 18.33%
Row 11: 14.67%
Row 12: 18.67%
Row 13: 34.67%
Row 14: 35.33%
Row 15: 82.33%
Row 16: 34.0%
Row 17: 39.33%
Row 18: 9.67%
Row 19: 11.0%
Row 20: 55.0%
Row 21: 66.0%
Row 22: 0.33%
Row 23: 0.0%
Row 24: 0.0%
Row 25: 0.0%
Row 26: 0.0%
Row 27: 0.0%


In [31]:
# Next: SimpleImputer with the 'mean' strategy applied to the feature columns.
dataframe = read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv',
    header=None,
    na_values='?',
)

# Column 23 is split off as y; every other column goes into X.
data = dataframe.values
ix = [col for col in range(data.shape[1]) if col != 23]
X = data[:, ix]
y = data[:, 23]

# Fit the imputer on X and transform that same array, then confirm that no
# NaN entries remain in the result.
imputer = SimpleImputer(strategy='mean')
X_trans = imputer.fit(X).transform(X)
print(f'Missing: {isnan(X_trans).sum()}')

Missing: 0


In [35]:
# Evaluate a random forest classifier under each SimpleImputer strategy and
# print the mean and standard deviation of the cross-validated accuracy.
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

# read in data: column 23 is split off as y, all other columns form X
dataframe = read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv', header=None, na_values='?')
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]

# set strategies; `results` keeps the raw per-fold scores for each strategy,
# in the same order as `strategies`
results = list()
strategies = ['mean', 'median', 'most_frequent', 'constant']

# The splitter is seeded and does not depend on the strategy, so build it once
# and reuse it: every strategy is then scored on the same folds.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# iterate
for s in strategies:
    # Imputer and model are wrapped in one pipeline so cross_val_score refits
    # the imputer on the training portion of every fold. The forest is seeded
    # so that the printed scores are reproducible from run to run.
    pipeline = Pipeline(steps=[('i', SimpleImputer(strategy=s)), ('m', RandomForestClassifier(random_state=1))])

    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    results.append(scores)
    print(f'{s}: {mean(scores)}, {std(scores)}')

# It is important to point out that nothing is being predicted here yet; any
# dataset later used for prediction must be transformed (imputed) in the same
# fashion as the data the model was evaluated on.

mean: 0.8577777777777779, 0.0537024265493092
median: 0.8644444444444443, 0.0589622184952071
most_frequent: 0.8744444444444445, 0.058171733834259665
constant: 0.8755555555555556, 0.05230489909306586
