In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

In [None]:
# Location of the Pima Indians diabetes CSV (relative to the notebook).
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'

# Read the raw table and display it as the cell output.
data = pd.read_csv(DATA_PATH)
data

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Separate the predictors from the 'Outcome' target for both partitions.
X_train, y_train = train.drop(columns='Outcome'), train['Outcome']
X_test, y_test = test.drop(columns='Outcome'), test['Outcome']

# How do the features interact with each other?

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column in the training partition.
scatter_matrix(frame=train, figsize=(20, 16))
# Write the figure to disk before showing it.
plt.savefig('interactions.png')
plt.show()

In [None]:
# Insulin against Age drawn as point markers; the trailing ';' hides the repr.
train.plot(
    x='Insulin',
    y='Age',
    style='o',
);

# Create a Custom Imputer 
As we can see, some numerical feature interactions reveal weird streaks at 0. In most cases, these are probably just missing values, so we should use an imputer to get rid of them. Therefore, I will use a simple RandomForestRegressor to replace the zeros with meaningful values.

(TODO: does a better evaluation lead to (a drastically) better performance of the final model? I will have to test it)

Let's create one pipeline for categorical variables, one pipeline for regular numerical variables and one pipeline for numerical columns with these weird zero-streaks.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin


class ZeroImputer(TransformerMixin, BaseEstimator):
    """
    A Transformer for sklearn Pipelines.
    Replaces zeroes with predicted values.
    Each Column is treated individually.

    For every column in ``col_names`` a RandomForestRegressor is trained on
    the rows where that column is non-zero, using all remaining columns of
    ``X`` as predictors. ``transform`` then overwrites the zero entries of
    the column with the regressor's predictions.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    so the imputer can be cloned like any other estimator (e.g. inside
    ``GridSearchCV``).

    Parameters
    ----------
    col_names : list of str
        Columns of the input DataFrame in which 0 marks a missing value.

    Attributes
    ----------
    models_fitted : list
        One entry per name in ``col_names``, in the same order: the fitted
        regressor, or ``None`` when the column had no non-zero rows during
        ``fit`` (such a column is left untouched by ``transform``).
    """
    def __init__(self, col_names):
        self.col_names = col_names
        # Placeholder entries until fit() is called (kept for compatibility).
        self.models_fitted = [0]*len(col_names)

    def fit(self, X, y=None):
        """
        Train one regressor per column listed in ``col_names``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``col_names``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # Rebuild the list here so it always matches the current col_names,
        # even if they were changed through set_params() after construction.
        models = []
        for col in self.col_names:
            observed = X[X[col] != 0]
            if observed.empty:
                # Nothing to learn from: every value in this column is zero.
                models.append(None)
                continue
            rfr = RandomForestRegressor(random_state=42)
            models.append(rfr.fit(observed.drop(columns=[col]), observed[col]))
        self.models_fitted = models
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the zeros in ``col_names`` replaced.

        Columns are processed in ``col_names`` order on the working copy, so
        the predictors used for a later column already contain the values
        imputed for earlier columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must have the same columns that were passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X`` is not modified.
        """
        # Work on a copy so the caller's DataFrame is never mutated.
        X = X.copy()
        for model, col in zip(self.models_fitted, self.col_names):
            is_zero = X[col] == 0
            if model is None or not is_zero.any():
                continue
            preds = model.predict(X.loc[is_zero].drop(columns=[col]))
            # Predictions are floats; cast first so that writing them into an
            # integer column does not rely on implicit dtype upcasting.
            X[col] = X[col].astype(float)
            X.loc[is_zero, col] = preds
        return X

In [None]:
# Column treated as categorical.
cat_cols = ['Pregnancies']

# Numerical columns where a value of 0 is treated as missing.
num_cols_zeros = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Remaining numerical columns.
num_cols = ['DiabetesPedigreeFunction', 'Age']

# Every predictor column, in the order of the training frame.
all_cols = X_train.columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 


# Imputation-only version of the preprocessing: one sub-pipeline per column group.
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median'))])

num_zeros_pipeline = Pipeline([('imputer', ZeroImputer(col_names=num_cols_zeros))])
                         
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))])

preprocess_pipeline = ColumnTransformer([('cat', cat_pipeline, cat_cols),
                                         ('num_zero', num_zeros_pipeline, num_cols_zeros),
                                        ('num', num_pipeline, num_cols)])


# This preprocessing-pipeline 
# will be integrated in the full pipeline later on.
# just for the scatter matrix:
# ColumnTransformer concatenates its outputs in the order the transformers are
# listed, not in the order of the input frame, so the labels are built from the
# same column groups instead of being copied from X_train.
preprocessed_cols = list(cat_cols) + list(num_cols_zeros) + list(num_cols)
preprocessed = pd.DataFrame(preprocess_pipeline.fit_transform(X_train),
                            columns=preprocessed_cols)
scatter_matrix(preprocessed, figsize=(20,16));

We got rid of the weird zero-streaks =). But we should still perform some further transformations on our data.
Let's expand our preprocessing pipeline!

# Create a custom Pipeline

In [None]:
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

# Regular numerical columns: median imputation, then standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Numerical columns with zero-streaks: model-based imputation, then standardisation.
num_zeros_pipeline = Pipeline(steps=[
    ('imputer', ZeroImputer(col_names=num_cols_zeros)),
    ('scaler', StandardScaler()),
])

# Categorical column: most-frequent imputation, then binning with dense one-hot output.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('binning', KBinsDiscretizer(encode='onehot-dense')),
])

# Route each column group through its own sub-pipeline.
preprocess_pipeline = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, cat_cols),
    ('num_zero', num_zeros_pipeline, num_cols_zeros),
    ('num', num_pipeline, num_cols),
])

# train or load the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 

# Candidate values explored by the grid search.
n_estimators_range = np.arange(5, 400, 25)
max_features_range = np.arange(1, 5)
n_bins_range = np.arange(2, 4)

rfc = RandomForestClassifier(random_state=42)

# Preprocessing followed by the forest classifier.
pipe_base = Pipeline(steps=[('preprocessing', preprocess_pipeline),
                            ('forest', rfc)])

# Keys address nested steps with the '<step>__<param>' convention.
pipe_base_params = [
    {
        'forest__n_estimators': n_estimators_range,
        'forest__max_features': max_features_range,
        'preprocessing__cat__binning__n_bins': n_bins_range,
    }
]

# 5-fold cross-validated search scored by F1; keep the best pipeline found.
gscv = GridSearchCV(estimator=pipe_base, param_grid=pipe_base_params,
                    scoring='f1', cv=5)
gscv.fit(X_train, y_train)
pipe_base = gscv.best_estimator_

# To skip training, load a previously dumped baseline pipeline instead.
# The snippet below is kept as an inert string; turn it into code to use it.
'''with open('pipe_base.pkl',mode='rb') as load_model:
    pipe_base = joblib.load(load_model)'''

# Predictions of the selected pipeline on the held-out test set.
pipe_base_preds = pipe_base.predict(X_test)

# Let's evaluate the results

In [None]:
from sklearn.metrics import f1_score
# The signature is f1_score(y_true, y_pred): ground truth goes first.
f1_score(y_test, pipe_base_preds)

In [None]:
from sklearn.metrics import accuracy_score 
# The signature is accuracy_score(y_true, y_pred): ground truth goes first.
accuracy_score(y_test, pipe_base_preds)

# dump the model
Run this code if you created a model and you want to store it:

In [None]:
# Serialise the selected pipeline to 'pipe_base.pkl' in the working directory.
with open('pipe_base.pkl', 'wb') as model_file:
    joblib.dump(pipe_base, model_file)