<center>
<h2> Classification Pipeline </h2>

</center>

## Import Libraries

In [None]:
try:
    import mlens
except ImportError:
    !pip install mlens
    import mlens

In [None]:
import os
import time
import pandas as pd
import pandas_profiling
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (15.0, 10.0)

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook

In [None]:
# seeding
SEED = 7
np.random.seed(SEED)

start = time.time()

In [None]:
def execution_time(start):
    """Print the wall-clock time elapsed since ``start``.

    Parameters
    ----------
    start : float
        Reference timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - start
    # Peel off whole hours first, then split what is left into minutes/seconds.
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    template = "Execution Time:  {:0>2} hours: {:0>2} minutes: {:05.2f} seconds"
    print(template.format(int(hours), int(minutes), seconds))

## Data Preprocessing and Visualization

In [None]:
# Reading the data
df = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
df_name = df.columns

print('Shape of the dataframe: ', df.shape)

### Descriptive Statistics

In [None]:
df.head()

In [None]:
df.info();

In [None]:
df.describe()

In [None]:
# Basic stats
def basic_stats(df):
    """Summarise every column of ``df``.

    Returns a DataFrame indexed by column name with three columns:
    the number of missing values, the number of distinct values and the dtype.
    """
    return pd.DataFrame({
        'Missing value': df.isnull().sum(),
        'N unique value': df.nunique(),
        'dtype': df.dtypes,
    })

basic_stats(df)

In [None]:
# plot missing values similar to missingno package

def plot_missing_values(df):
    """Draw a heatmap of the missing-value mask of ``df``.

    The mask is transposed, so each heatmap row is a column of ``df`` and each
    heatmap column is a record.
    """
    missing_mask = df.isnull().T
    sns.heatmap(missing_mask, cbar=False)
    
plot_missing_values(df)

In [None]:
# missing data replace with mode
def replace_missing_value(df):
    """Impute missing values in place, column by column, with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Columns without missing values are left untouched,
        as are columns that are entirely missing (they have no mode).

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Nothing observed in this column, so there is no value to impute with.
            continue
        # Assign the filled column back rather than calling fillna(inplace=True)
        # on a column selection: the chained in-place form is deprecated and
        # silently does nothing under pandas copy-on-write.
        df[col] = df[col].fillna(modes.iloc[0])

# replace_missing_value(df)

In [None]:
# complete data profiling using pandas_profiling package
df.profile_report()

### Data Visualization

In [None]:
# Univariate graphs to see the distribution
df.hist();

In [None]:
# Correlation Matrix
def correlation_matrix(df):
    """Plot the pairwise correlations of ``df`` as an annotated heatmap.

    Only the strictly lower triangle is shown; the diagonal and the upper
    triangle are masked out because they carry no extra information.
    """
    corr = df.corr()

    # True marks a hidden cell: the diagonal and everything above it.
    hidden = np.triu(np.ones(corr.shape, dtype=bool))

    sns.heatmap(corr, cmap='coolwarm', annot=True, mask=hidden);

correlation_matrix(df);

In [None]:
# pairplot
sns.pairplot(df, hue="Outcome", palette="husl", markers=["o", "s"], diag_kind='hist');

In [None]:
# Dependent Variable Distribution
def dv_distribution(df, dv):
    """Print and plot the class balance of the dependent variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the dependent variable.
    dv : str
        Name of the dependent-variable column.
    """
    counts = df[dv].value_counts()
    print(counts)
    plt.pie(counts.values, labels=counts.keys(), startangle=90, autopct='%.1f%%')
    plt.title('Dependent Variable Distribution');

dv_distribution(df, 'Outcome')

In [None]:
# Outliers Visualization
def plot_outliers(df):
    """Draw one box plot per column of ``df``, side by side, to expose outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; every column gets its own subplot.
    """
    columns = df.columns
    if len(columns) == 0:
        # plt.subplots cannot create a grid with zero columns.
        return

    # squeeze=False always returns a 2-D array of axes, so a single-column
    # frame works the same way as a multi-column one.
    fig, axs = plt.subplots(1, len(columns), figsize=(20, 10), squeeze=False)

    for ax, col in zip(axs[0], columns):
        ax.set_title(col)
        ax.boxplot(df[col])
    fig.suptitle('Outliers');

plot_outliers(df)

### Data Preprocessing

In [None]:
# Creating Dependent and Independent variables.
# Columns are picked by position: the first eight columns of the CSV become the
# features and the ninth the target (assumed to be 'Outcome' — TODO confirm the
# column order of the input file).
X =  df[df_name[0:8]]
Y = df[df_name[8]]

# Dummy Variables
# X = pd.get_dummies(X, drop_first=True)
# X_cv = pd.get_dummies(X_cv, drop_first=True)

In [None]:
# Spliting data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =train_test_split(X, Y, test_size=0.25, random_state=0, stratify=df['Outcome'])

In [None]:
# convert to category
def category_type(df, max_unique=104):
    """Convert low-cardinality columns of ``df`` to the ``category`` dtype in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    max_unique : int, default 104
        A column is converted when it has at most this many distinct values.
        The default preserves the threshold that used to be hard-coded.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    for col in df.columns:
        if df[col].nunique() <= max_unique:
            df[col] = df[col].astype('category')

#category_type(train)
#category_type(test)

## Baseline: Models Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Spot-Check Algorithms (Classification)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Spot-Check Ensemble Models (Classification)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier

def getBaselineModels():
    """Return the spot-check models as a list of ``(short_name, estimator)`` pairs.

    Every estimator uses its library defaults, except the SVM which has
    probability estimates enabled.
    """
    # Simple (non-ensemble) classifiers.
    simple_models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('NB', GaussianNB()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('SVM', SVC(probability=True)),
    ]

    # Tree-based ensembles.
    ensemble_models = [
        ('AB', AdaBoostClassifier()),
        ('GBM', GradientBoostingClassifier()),
        ('ET', ExtraTreesClassifier()),
        ('RF', RandomForestClassifier()),
    ]

    return simple_models + ensemble_models

def baselineModelsEval(X_train, y_train, models):
    """Cross-validate every model and collect its per-fold accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``cross_val_score``.
    models : iterable of (str, estimator)
        Named estimators (or pipelines) to evaluate.

    Returns
    -------
    dict
        Maps each model name to the array of 10 fold accuracies.
    """
    # Test options and evaluation metric
    num_folds = 10
    scoring = make_scorer(accuracy_score)

    # random_state only has an effect when shuffling is on; recent scikit-learn
    # raises a ValueError if it is set while shuffle=False. The splitter is the
    # same for every model, so it is created once.
    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)

    # evaluate each model in turn
    results = {}
    for name, model in tqdm_notebook(models):
        results[name] = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)

    return results

In [None]:
def scoreDataFrame(results):
    """Build a two-column table (``Model``, ``Score``) from cross-validation results.

    ``results`` maps a model name to its array of fold scores; ``Score`` is the
    mean of that array rounded to four decimals.
    """
    model_names = list(results.keys())
    mean_scores = [round(fold_scores.mean(), 4) for fold_scores in results.values()]

    return pd.DataFrame({'Model': model_names, 'Score': mean_scores})

def plotScores(results):
    """Compare models with one box plot of fold scores per model.

    ``results`` maps a model name to its array of fold scores; the names are
    used as x-axis tick labels.
    """
    model_names = list(results.keys())
    fold_scores = list(results.values())

    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    # The subplot just added is the current axes, so pyplot draws onto it.
    plt.boxplot(fold_scores)
    ax.set_xticklabels(model_names);

In [None]:
models = getBaselineModels()
results = baselineModelsEval(X_train, y_train, models)
plotScores(results)
baselineScore = scoreDataFrame(results)
baselineScore

## Feature Engineering

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

def getScaledModel(nameOfScaler):
    """Return the spot-check models, each wrapped in a scaling pipeline.

    Parameters
    ----------
    nameOfScaler : {'standard', 'minmax'}
        Which scaler to put in front of every estimator.

    Returns
    -------
    list of (str, Pipeline)
        Pipeline names are ``nameOfScaler`` followed by the model's short name.

    Raises
    ------
    ValueError
        If ``nameOfScaler`` is not one of the supported names.
    """
    scaler_classes = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    if nameOfScaler not in scaler_classes:
        raise ValueError(
            "Unknown scaler {!r}; expected one of {}".format(nameOfScaler, sorted(scaler_classes))
        )
    make_scaler = scaler_classes[nameOfScaler]

    # (short name, estimator class) in the order they are reported.
    estimator_classes = [
        ('LR', LogisticRegression),
        ('LDA', LinearDiscriminantAnalysis),
        ('KNN', KNeighborsClassifier),
        ('CART', DecisionTreeClassifier),
        ('NB', GaussianNB),
        ('SVM', SVC),
        ('AB', AdaBoostClassifier),
        ('GBM', GradientBoostingClassifier),
        ('RF', RandomForestClassifier),
        ('ET', ExtraTreesClassifier),
    ]

    # Each pipeline gets its own scaler instance so that fitting one pipeline
    # directly can never change the state seen by another.
    pipelines = [
        (nameOfScaler + label, Pipeline([('Scaler', make_scaler()), (label, estimator_cls())]))
        for label, estimator_cls in estimator_classes
    ]

    return pipelines

In [None]:
#standard scaler
models = getScaledModel('standard')
results = baselineModelsEval(X_train, y_train, models)
plotScores(results)
scaledScoreStandard = scoreDataFrame(results)
compareModels = pd.concat([baselineScore, scaledScoreStandard], axis=1)
compareModels

In [None]:
#minmax scaler
models = getScaledModel('minmax')
results = baselineModelsEval(X_train, y_train, models)
plotScores(results)
scaledScoreMinMax = scoreDataFrame(results)
compareModels = pd.concat([baselineScore, scaledScoreStandard, scaledScoreMinMax], axis=1)
compareModels

### Removing Outliers

In [None]:
df_t = df.copy()
df_t_name = df_t.columns

In [None]:
def outliers(df_out, drop=False):
    """Report, and optionally remove, outliers using Tukey's 1.5 * IQR rule.

    Columns are processed one after another. When ``drop`` is true the rows
    flagged for one column are removed before the next column is examined, so
    later quartiles are computed on the already reduced data.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Numeric data to inspect. The caller's frame is never modified.
    drop : bool, default False
        If true, remove the flagged rows and reset the index.

    Returns
    -------
    pandas.DataFrame
        The input frame when ``drop`` is false, otherwise the reduced frame
        with a fresh 0..n-1 index.
    """
    for nameOfFeature in df_out.columns:
        valueOfFeature = df_out[nameOfFeature]

        # Q1 / Q3: 25th and 75th percentiles of the feature
        Q1, Q3 = np.percentile(valueOfFeature, [25., 75.])

        # Outlier step: 1.5 times the interquartile range
        step = (Q3 - Q1) * 1.5

        # Positional boolean mask, computed once and reused below.
        is_outlier = (~((valueOfFeature >= Q1 - step) & (valueOfFeature <= Q3 + step))).to_numpy()
        feature_outliers = valueOfFeature.to_numpy()[is_outlier]

        print("\n" + "\u0332".join(nameOfFeature) + ": \n")
        print ("Number of outliers (inc duplicates): {} and outliers: {}".format(int(is_outlier.sum()), feature_outliers))

        if drop:
            # Filter by position, not by index label, so frames whose index is
            # not 0..n-1 are handled correctly as well.
            df_out = df_out[~is_outlier].reset_index(drop = True)
            print("New dataset with removed outliers has shape ({}, {})".format(*df_out.shape))

    return df_out

In [None]:
# without drop
_ = outliers(df_t)

In [None]:
# with drop
df_clean = outliers(df_t, drop=True)

In [None]:
print('df shape: {}, new df shape: {}, we lost {} rows, {}% of our data'.format(df.shape[0], df_clean.shape[0], df.shape[0]-df_clean.shape[0],
                                                        (df.shape[0]-df_clean.shape[0])/df.shape[0]*100))

#### Model evaluation on Cleaned Data

In [None]:
df_clean_name = df_clean.columns

X_c =  df_clean[df_clean_name[0:8]]
Y_c = df_clean[df_clean_name[8]]

X_train_c, X_test_c, y_train_c, y_test_c =train_test_split(X_c, Y_c, test_size=0.25, random_state=0, stratify=df_clean['Outcome'])

In [None]:
models = getScaledModel('minmax')
results = baselineModelsEval(X_train_c, y_train_c, models)
plotScores(results)
scaledScoreMinMax_c = scoreDataFrame(results)
compareModels = pd.concat([baselineScore, scaledScoreStandard, scaledScoreMinMax, scaledScoreMinMax_c], axis=1)
compareModels

## Model Interpretability (for Feature Importance/Selection)

### ExtraTreesClassifier's feature importance

In [None]:
clf = ExtraTreesClassifier(n_estimators=250, random_state=SEED)
clf.fit(X_train_c, y_train_c);

In [None]:
 def featureImportance(X, y):
     """Fit an ExtraTreesClassifier and plot the relative importance of each feature.

     Parameters
     ----------
     X : pandas.DataFrame or array-like
         Training features. Column names are used as bar labels when present;
         otherwise the column positions are used.
     y : array-like
         Training labels.
     """
     clf = ExtraTreesClassifier(n_estimators=250, random_state=SEED)
     clf.fit(X, y)

     # Label the bars from the data that was actually fitted, not from the
     # global frame, so any subset of features is labelled correctly.
     if hasattr(X, 'columns'):
         feature_names = np.asarray(X.columns)
     else:
         feature_names = np.arange(np.shape(X)[1]).astype(str)

     # make importances relative to max importance
     feature_importance = clf.feature_importances_
     feature_importance = 100.0 * (feature_importance / feature_importance.max())
     sorted_idx = np.argsort(feature_importance)
     pos = np.arange(sorted_idx.shape[0]) + .5

     plt.figure(figsize=(5,5))
     plt.barh(pos, feature_importance[sorted_idx], align='center')
     plt.yticks(pos, feature_names[sorted_idx])
     plt.xlabel('Relative Importance')
     plt.title('Variable Importance');

featureImportance(X_train_c, y_train_c)

### LIME (Local Interpretable Model-Agnostic Explanation)

In [None]:
import lime
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# initialization of LIME explainer
# class_names must follow the classifier's class order. The target is 0/1 with
# 1 meaning diabetic, so index 0 is 'Not Diabetic' and index 1 is 'Diabetic'.
explainer = LimeTabularExplainer(X_train_c.values,
                                 mode='classification',
                                 feature_names=list(X_train_c.columns),
                                 class_names=['Not Diabetic', 'Diabetic'])

> **LIME - ExtraTreesClassifier**

In [None]:
exp = explainer.explain_instance(X_test_c.values[0],
                                 clf.predict_proba,
                                 num_features=X_train_c.shape[1])

exp.show_in_notebook(show_table = True)

### ELI5 (Explain Like I'm 5)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

> Global Model Interpretation

In [None]:
eli5.show_weights(clf, feature_names=list(X_train_c.columns), top=None)

> Local Model Interpretation

In [None]:
eli5.show_prediction(clf, X_test_c.values[0], feature_names=list(X_train_c.columns), top=None)

> Permutation Importance from Testing Data

In [None]:
exp = PermutationImportance(clf, random_state=0).fit(X_test_c, y_test_c)

eli5.show_weights(exp, feature_names=list(X_train_c.columns), top=None)

### SHAP (SHapley Additive exPlanations)

In [None]:
import shap
from shap import TreeExplainer, KernelExplainer, LinearExplainer
shap.initjs()

> Local Model Interpretation

In [None]:
# A background dataset is supplied, so use the interventional perturbation
# scheme. This is the current name of the option that older SHAP releases
# called feature_dependence='independent' (the original call misspelled it).
explainer = TreeExplainer(clf, X_train_c, feature_perturbation='interventional')
shap_values = explainer.shap_values(X_test_c.values)
# Index 1 selects the explanation for the positive class (Outcome == 1).
shap.force_plot(explainer.expected_value[1],
                shap_values[1],
                X_test_c.values,
                feature_names=X_train_c.columns)

> Global Model Interpretation

In [None]:
ssplot = shap.summary_plot(shap_values, X_test_c.values, feature_names=X_train_c.columns)

> Partial Dependency Plot

In [None]:
# Let's check plot for 'Glucose'

#shap.dependence_plot('Glucose', shap_values, X_test_c)

***`'Glucose','BMI','Age','DiabetesPedigreeFunction' columns have most effect on the data.`***

#### Model evaluation on Feature Selection

In [None]:
df_feature_imp = df_clean[['Glucose','BMI','Age','DiabetesPedigreeFunction','Outcome']]
df_feature_imp_name = df_feature_imp.columns

In [None]:
X =  df_feature_imp[df_feature_imp_name[0:df_feature_imp.shape[1]-1]]
Y = df_feature_imp[df_feature_imp_name[df_feature_imp.shape[1]-1]]

X_train_im, X_test_im, y_train_im, y_test_im =train_test_split(X, Y, test_size=0.1, random_state=0,
                                                   stratify=df_feature_imp['Outcome'])

In [None]:
models = getScaledModel('minmax')
results = baselineModelsEval(X_train_im, y_train_im,models)
plotScores(results)
scaledScoreMinMax_im = scoreDataFrame(results)
compareModels = pd.concat([baselineScore,
                           scaledScoreStandard,
                           scaledScoreMinMax,
                           scaledScoreMinMax_c,
                           scaledScoreMinMax_im], axis=1)
compareModels

## Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint

In [None]:
# Selected features followed by the target column.
df_imp_scaled_name = ['Glucose','BMI','Age','DiabetesPedigreeFunction','Outcome']

X = df_clean[df_imp_scaled_name[:4]].values
# The target stays 0./1.; it is cast to float because later cells compare it
# against float predictions.
Y = df_clean['Outcome'].values.astype(float)

X_train_sc, X_test_sc, y_train_sc, y_test_sc = train_test_split(X, Y, test_size=0.1, random_state=0,
                                                   stratify=Y)

# Fit the scaler on the training rows only and apply it to both parts, so no
# information from the held-out rows leaks into training.
feature_scaler = MinMaxScaler().fit(X_train_sc)
X_train_sc = feature_scaler.transform(X_train_sc)
X_test_sc = feature_scaler.transform(X_test_sc)

### Grid Search / Random Search

In [None]:
class RandomSearch(object):
    """Thin wrapper that tunes one estimator with ``RandomizedSearchCV``.

    Parameters
    ----------
    X_train, y_train
        Training data used for the cross-validated search.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    hyperparameters : dict
        Search space: parameter name -> list of values or scipy distribution.
    """

    def __init__(self,X_train,y_train,model,hyperparameters):

        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        self.hyperparameters = hyperparameters

    def RandomSearch(self):
        """Run the search and return ``(fitted_search, best_params)``.

        Uses 10-fold cross-validation and up to 100 sampled candidates, and
        prints the best score together with the parameters that achieved it.
        """
        cv = 10
        # The `iid` argument is intentionally not passed: it was removed from
        # scikit-learn and raises a TypeError on current releases.
        clf = RandomizedSearchCV(self.model,
                                 self.hyperparameters,
                                 random_state=1,
                                 n_iter=100,
                                 cv=cv,
                                 verbose=0,
                                 n_jobs=-1,
                                 )
        # Fit randomized search
        best_model = clf.fit(self.X_train, self.y_train)
        message = (best_model.best_score_, best_model.best_params_)
        print("Best: %f using %s" % (message))

        return best_model,best_model.best_params_

    def BestModelPridict(self,X_test):
        """Run the search, then predict ``X_test`` with the best estimator found."""
        best_model,_ = self.RandomSearch()
        pred = best_model.predict(X_test)
        return pred

In [None]:
class GridSearch(object):
    """Thin wrapper that tunes one estimator with an exhaustive ``GridSearchCV``.

    Parameters
    ----------
    X_train, y_train
        Training data used for the cross-validated search.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    hyperparameters : dict
        Parameter grid passed straight to ``GridSearchCV``.
    """

    def __init__(self,X_train,y_train,model,hyperparameters):
        self.X_train, self.y_train = X_train, y_train
        self.model = model
        self.hyperparameters = hyperparameters

    def GridSearch(self):
        """Run the 10-fold grid search and return ``(fitted_search, best_params)``.

        Prints the best score together with the parameters that achieved it.
        """
        search = GridSearchCV(self.model,
                              self.hyperparameters,
                              cv=10,
                              verbose=0,
                              n_jobs=-1)
        # fit() returns the search object itself
        fitted_search = search.fit(self.X_train, self.y_train)
        print("Best: %f using %s" % (fitted_search.best_score_, fitted_search.best_params_))

        return fitted_search, fitted_search.best_params_

    def BestModelPridict(self,X_test):
        """Run the search, then predict ``X_test`` with the best estimator found."""
        fitted_search, _ = self.GridSearch()
        return fitted_search.predict(X_test)

In [None]:
# Search space per estimator: lists are sampled uniformly, scipy distributions
# are sampled from directly.
# LogisticRegression uses the liblinear solver because the default solver does
# not support the 'l1' penalty that is part of the search space.
models = {LogisticRegression(solver='liblinear'): dict(C=uniform(loc=0, scale=4), penalty = ['l1', 'l2']),
          KNeighborsClassifier(): dict(n_neighbors=[i for i in range(1, 21)]),
          SVC(): dict(C=[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0], kernel=['linear', 'poly', 'rbf', 'sigmoid']),
          DecisionTreeClassifier(): dict(max_depth=[3,None], max_features=randint(1, 4), min_samples_leaf=randint(1, 4), criterion=["gini", "entropy"]),
          AdaBoostClassifier(): dict(learning_rate=[.01,.05,.1,.5,1], n_estimators=[50,100,150,200,250,300]),
          GradientBoostingClassifier(): dict(learning_rate=[.01,.05,.1,.5,1], n_estimators=[50,100,150,200,250,300]),
          RandomForestClassifier(): dict(n_estimators=[50,100,150,200,250,300], max_depth=[5,8,15,25,30], min_samples_split=[2,5,10,15,100], min_samples_leaf = [1,2,5,10]),
          ExtraTreesClassifier(): dict(n_estimators=[50,100,150,200,250,300], min_samples_split=[2,5,10,15,100], min_samples_leaf = [1,2,5,10])}

In [None]:
# Tune every estimator in turn; each search prints its own best score/params.
for model, hyperparameters in tqdm_notebook(models.items()):
    # Joining the characters with U+0332 renders the class name underlined.
    underlined_name = "\u0332".join(type(model).__name__)
    print(underlined_name)
    search = RandomSearch(X_train_sc, y_train_sc, model, hyperparameters)
    search.BestModelPridict(X_test_sc)
    print("\n")

## Ensemble Methods

### VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier

# best params from parameter tuning step

# The l1 penalty needs a solver that supports it; the default solver does not.
param = {'C': 0.7678243129497218, 'penalty': 'l1', 'solver': 'liblinear'}
model1 = LogisticRegression(**param)

param = {'n_neighbors': 15}
model2 = KNeighborsClassifier(**param)

param = {'kernel': 'linear', 'C': 1.7}
model3 = SVC(**param)

param = {'criterion': 'gini', 'max_depth': 3, 'max_features': 2, 'min_samples_leaf': 3}
model4 = DecisionTreeClassifier(**param)

param = {'learning_rate': 0.05, 'n_estimators': 150}
model5 = AdaBoostClassifier(**param)

param = {'learning_rate': 0.01, 'n_estimators': 100}
model6 = GradientBoostingClassifier(**param)

model7 = GaussianNB()

param = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 25}
model8 = RandomForestClassifier(**param)

param = {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 10}
model9 = ExtraTreesClassifier(**param)


# create the sub models
estimators = [('LR',model1), ('KNN',model2), ('SVC',model3),
              ('DT',model4), ('ADa',model5), ('GB',model6),
              ('NB',model7), ('RF',model8),  ('ET',model9)]

In [None]:
# create the ensemble model
# shuffle=True is required for random_state to take effect; recent scikit-learn
# raises a ValueError when random_state is set on an unshuffled splitter.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

ensemble = VotingClassifier(estimators)
# Mean 10-fold cross-validated accuracy on the training split.
results = cross_val_score(ensemble, X_train_sc,y_train_sc, cv=kfold)
vc_result = results.mean()
print('Accuracy on train: ',vc_result)

# Refit on the whole training split and score the held-out rows.
ensemble_model = ensemble.fit(X_train_sc,y_train_sc)
pred = ensemble_model.predict(X_test_sc)
print('Accuracy on test:' , (y_test_sc == pred).mean())

### Prediction and Error Correlation

In [None]:
def get_models():
    """Return freshly constructed, tuned base learners keyed by short name.

    Every call builds new, unfitted estimators, so callers can fit them
    without affecting each other. All of them expose ``predict_proba``
    (the SVM has probability estimates enabled for that purpose).
    """
    models = {
        # The l1 penalty needs a solver that supports it; the default does not.
        'LR': LogisticRegression(C=0.7678243129497218, penalty='l1', solver='liblinear'),
        'KNN': KNeighborsClassifier(n_neighbors=15),
        'SVC': SVC(kernel='linear', C=1.7, probability=True),
        'DT': DecisionTreeClassifier(criterion='gini', max_depth=3, max_features=2, min_samples_leaf=3),
        'ADa': AdaBoostClassifier(learning_rate=0.05, n_estimators=150),
        'GB': GradientBoostingClassifier(learning_rate=0.01, n_estimators=100),
        'NB': GaussianNB(),
        'RF': RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=5, max_depth=25),
        'ET': ExtraTreesClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10),
    }

    return models

In [None]:
def train_predict(model_list,xtrain, xtest, ytrain, ytest):
    """Fit every model on the training set and collect positive-class probabilities.

    Parameters
    ----------
    model_list : dict
        Maps a model name to an unfitted estimator exposing ``predict_proba``.
    xtrain, ytrain
        Training features and labels.
    xtest
        Features to predict.
    ytest
        Test labels; only their length is used, to size the result.

    Returns
    -------
    pandas.DataFrame
        One row per test sample and one column per model, holding the
        predicted probability of class 1.
    """
    P = pd.DataFrame(np.zeros((ytest.shape[0], len(model_list))), columns=list(model_list))

    print("Fitting models.")
    # Iterate over the models that were passed in, not over a global.
    for name, m in tqdm_notebook(model_list.items()):
        m.fit(xtrain, ytrain)
        P[name] = m.predict_proba(xtest)[:, 1]

    return P

In [None]:
models = get_models()
P = train_predict(models, X_train_sc, X_test_sc, y_train_sc, y_test_sc)

In [None]:
correlation_matrix(P)

Looking at error correlations on a class-prediction basis, things look a bit more promising:

In [None]:
correlation_matrix(P.apply(lambda predic: 1*(predic >= 0.5) - y_test_sc))

### Stacking

In [None]:
base_learners = get_models()

meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=6,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.001, 
    random_state=SEED
)

In [None]:
from mlens.ensemble import SuperLearner

# create the super learner
def get_super_learner():
    """Assemble the stacked ensemble: the base learners feeding the meta learner.

    Reads the module-level ``base_learners`` and ``meta_learner``. Both layers
    pass class probabilities rather than hard labels.
    """
    super_learner = SuperLearner(scorer=accuracy_score, folds=10, random_state=SEED, verbose=True)
    # first layer: all base models
    first_layer = list(base_learners.values())
    super_learner.add(first_layer, proba=True)
    # final layer: the meta model
    super_learner.add_meta(meta_learner, proba=True)

    return super_learner

In [None]:
ensemble = get_super_learner()

# Train the ensemble
ensemble.fit(X_train_sc, y_train_sc)

# Predict the test set
p_ensemble = ensemble.predict_proba(X_test_sc)

In [None]:
# Turn the positive-class probability into a hard 0./1. label (cut-off 0.5).
pp = [1. if p > 0.5 else 0. for p in p_ensemble[:, 1]]

In [None]:
SL_result = (y_test_sc == pp).mean()
print("Super Learner Accuracy score: %.8f" % SL_result)

## Save Models

In [0]:
import joblib

# Output a pickle file for the model
joblib.dump(ensemble, 'super_learner.pkl') 
 
# Load the pickle file
clf_load = joblib.load('super_learner.pkl')

# Check that the loaded model is the same as the original
clf_load.scorer(y_test_sc, pp) == ensemble.scorer(y_test_sc, pp)

## Gradient Boosting

In [None]:
# The first 500 cleaned rows are used for training, the rest are held out.
# Explicit copies keep these frames independent of df_clean, so nothing below
# modifies a slice of it.
train_df = df_clean[:500].copy()
test_df = df_clean[500:].copy()

y_test = test_df['Outcome']
# Drop the target without inplace=True so re-running the cell is safe.
test_df = test_df.drop(columns=['Outcome'])

features = test_df.columns
categoricals = []

In [None]:
train_df.shape, test_df.shape

In [None]:
class Base_Model(object):
    """Template for a model trained with stratified K-fold cross-validation.

    Constructing an instance immediately runs the whole procedure: it builds
    the folds, trains one model per fold and stores

    * ``y_pred`` - test-set predictions averaged over the folds,
    * ``score``  - accuracy of the out-of-fold predictions on the training set,
    * ``model``  - the model trained on the last fold.

    Subclasses must implement ``train_model``, ``get_params`` and
    ``convert_dataset``; ``convert_x`` may be overridden when the model needs
    its prediction input in a special container.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data, including the target column.
    test_df : pandas.DataFrame
        Data to predict; must contain every column in ``features``.
    target : str
        Name of the target column in ``train_df``.
    features : sequence of str
        Names of the feature columns.
    categoricals : list of str, optional
        Names of categorical features; defaults to an empty list.
    n_splits : int, default 10
        Number of cross-validation folds.
    verbose : bool, default True
        Whether to print the overall out-of-fold accuracy.
    """

    def __init__(self, train_df, test_df, target, features, categoricals=None, n_splits=10, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.target = target
        self.features = features
        # None instead of a mutable default, so instances never share one list.
        self.categoricals = [] if categoricals is None else categoricals
        self.n_splits = n_splits
        self.verbose = verbose
        self.cv = self.get_cv()
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()

    def train_model(self, train_set, val_set):
        """Train and return one model from the converted train/validation sets."""
        raise NotImplementedError

    def get_cv(self):
        """Return an iterator of ``(train_idx, val_idx)`` positional index arrays."""
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])

    def get_params(self):
        """Return the model's hyper-parameters."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into whatever ``train_model`` expects."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert features for ``model.predict``; the default is a no-op."""
        return x

    def fit(self):
        """Run the cross-validation loop.

        Returns
        -------
        tuple
            ``(y_pred, score, model)`` as described in the class docstring.
        """
        # Size the buffers from this instance's own data, not from globals.
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        train_x = self.train_df[self.features]
        train_y = self.train_df[self.target]
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            # The fold indices are positions, so select with .iloc throughout;
            # label-based selection only works for a 0..n-1 index.
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            # Out-of-fold predictions are thresholded at 0.5 into hard labels.
            val_labels = np.where(model.predict(conv_x_val) > 0.5, 1, 0)
            oof_pred[val_idx] = val_labels.reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            # Test predictions are the plain average of the fold models' outputs.
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            print('Partial score of fold {} is: {}'.format(fold, accuracy_score(y_val, oof_pred[val_idx])))
        loss_score = accuracy_score(train_y, oof_pred)
        if self.verbose:
            print('Our oof Accuracy is: ', loss_score)
        return y_pred, loss_score, model

### LightGBM

In [None]:
import lightgbm as lgb

class Lgb_Model(Base_Model):
    """LightGBM binary classifier driven by the ``Base_Model`` cross-validation loop."""

    def get_params(self):
        """Return the LightGBM training parameters."""
        return dict(
            n_estimators=10,
            boosting_type='gbdt',
            objective='binary',
            metric='auc',
        )

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the training and validation folds in ``lgb.Dataset`` objects."""
        train_set, val_set = (
            lgb.Dataset(fold_x, fold_y, categorical_feature=self.categoricals)
            for fold_x, fold_y in ((x_train, y_train), (x_val, y_val))
        )
        return train_set, val_set

    def train_model(self, train_set, val_set):
        """Train a booster, evaluating on both the training and validation sets."""
        # Evaluation log period: every 100 rounds when verbose, otherwise silent.
        if self.verbose:
            log_period = 100
        else:
            log_period = 0
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=log_period)

In [None]:
lgb_model = Lgb_Model(train_df, test_df, 'Outcome', features, categoricals)

### XGBoost

In [None]:
import xgboost as xgb

class Xgb_Model(Base_Model):
    """XGBoost binary classifier driven by the ``Base_Model`` cross-validation loop."""

    def get_params(self):
        """Return the XGBoost training parameters."""
        return dict([
            ('colsample_bytree', 0.8),
            ('learning_rate', 0.01),
            ('max_depth', 10),
            ('subsample', 1),
            ('objective', 'binary:hinge'),
            ('eval_metric', 'auc'),
            ('min_child_weight', 3),
            ('gamma', 0.25),
            ('n_estimators', 10),
        ])

    def convert_x(self, x):
        """Wrap prediction features in a ``DMatrix``, as the booster requires."""
        return xgb.DMatrix(x)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the training and validation folds in ``DMatrix`` objects."""
        return xgb.DMatrix(x_train, y_train), xgb.DMatrix(x_val, y_val)

    def train_model(self, train_set, val_set):
        """Train for up to 5000 rounds, stopping after 100 rounds without improvement."""
        # Evaluation logging is off regardless of self.verbose
        # (the original expression yielded 0 in both branches).
        watchlist = [(train_set, 'train'), (val_set, 'val')]
        return xgb.train(self.params, train_set,
                         num_boost_round=5000, evals=watchlist,
                         verbose_eval=0, early_stopping_rounds=100)

In [None]:
xgb_model = Xgb_Model(train_df, test_df, 'Outcome', features, categoricals)

### CatBoost

In [None]:
from catboost import CatBoostClassifier

class Catb_Model(Base_Model):
    """CatBoost binary classifier driven by the ``Base_Model`` cross-validation loop."""

    def get_params(self):
        """Return the keyword arguments for ``CatBoostClassifier``."""
        return dict(
            loss_function='Logloss',
            task_type="CPU",
            iterations=10,
            od_type="Iter",
            depth=10,
            colsample_bylevel=0.5,
            early_stopping_rounds=300,
            random_seed=42,
            use_best_model=True,
        )

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle each fold as a ``{'X': features, 'y': labels}`` dict."""
        return {'X': x_train, 'y': y_train}, {'X': x_val, 'y': y_val}

    def train_model(self, train_set, val_set):
        """Fit a classifier on the training fold, using the validation fold as eval set."""
        # Training output is off regardless of self.verbose
        # (the original expression yielded 0 in both branches).
        classifier = CatBoostClassifier(**self.params)
        classifier.fit(train_set['X'], train_set['y'],
                       eval_set=(val_set['X'], val_set['y']),
                       verbose=0,
                       cat_features=self.categoricals)
        return classifier

In [None]:
catb_model = Catb_Model(train_df, test_df, 'Outcome', features, categoricals)

## FeedForward Neural Network

In [None]:
import tensorflow as tf

class Nn_Model(Base_Model):
    """Feed-forward Keras classifier driven by the ``Base_Model`` cross-validation loop.

    The constructor is inherited unchanged from ``Base_Model``.
    """

    def get_params(self):
        """The network has no external parameter dict."""
        return None

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle each fold as a ``{'X': features, 'y': labels}`` dict."""
        return {'X': x_train, 'y': y_train}, {'X': x_val, 'y': y_val}

    def train_model(self, train_set, val_set):
        """Build, train and return the network for one fold.

        Four Dense/LayerNormalization/Dropout stages of 200, 100, 50 and 25
        units feed a single sigmoid output. Weights are checkpointed to
        'nn_model.w8' and reloaded from that file after training.
        """
        n_inputs = train_set['X'].shape[1]

        layers = [tf.keras.layers.Input(shape=(n_inputs,))]
        for units in (200, 100, 50, 25):
            layers.extend([
                tf.keras.layers.Dense(units, activation='relu'),
                tf.keras.layers.LayerNormalization(),
                tf.keras.layers.Dropout(0.3),
            ])
        layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))
        model = tf.keras.models.Sequential(layers)

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), loss='binary_crossentropy', metrics=['accuracy'])

        callbacks = [
            tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True, verbose=0),
            tf.keras.callbacks.EarlyStopping(patience=20),
        ]
        model.fit(train_set['X'],
                  train_set['y'],
                  validation_data=(val_set['X'], val_set['y']),
                  epochs=100,
                  verbose=0,
                  callbacks=callbacks)
        # Restore the checkpointed (best) weights before returning.
        model.load_weights('nn_model.w8')
        return model

In [None]:
nn_model = Nn_Model(train_df, test_df, 'Outcome', features, categoricals)

In [None]:
from random import choice

class Cnn_Model(Base_Model):
    """Convolutional Keras classifier driven by the ``Base_Model`` cross-validation loop.

    The flat feature vector is turned into a 2-D "image" by stacking
    ``n_feats_repeat`` random permutations of the features; a single
    convolution followed by dense layers produces the class probability.
    """

    def __init__(self, train_df, test_df, target, features, categoricals=[], n_splits=5, verbose=True):
        # The permutation mask must exist before the base constructor runs,
        # because that constructor immediately trains the fold models.
        self.create_feat_2d(features)
        super().__init__(train_df, test_df, target, features, categoricals, n_splits, verbose)

    def create_feat_2d(self, features, n_feats_repeat=50):
        """Build the ``(n_feats_repeat, n_feats)`` mask of feature permutations.

        Each row is a random permutation of the feature positions. NumPy's
        generator is used so the mask follows ``np.random.seed``.
        """
        self.n_feats = len(features)
        self.n_feats_repeat = n_feats_repeat
        self.mask = np.zeros((self.n_feats_repeat, self.n_feats), dtype=np.int32)
        for i in range(self.n_feats_repeat):
            self.mask[i] = np.random.permutation(self.n_feats)
        self.mask = tf.convert_to_tensor(self.mask)
        print(self.mask.shape)

    def train_model(self, train_set, val_set):
        """Build, train and return the network for one fold.

        Weights are checkpointed to 'cnn_model.w8' and reloaded from that
        file after training.
        """
        # The kernel can never be larger than the input "image"; with fewer
        # than 50 features a fixed 50x50 kernel would not fit.
        kernel_size = (min(50, self.n_feats_repeat), min(50, self.n_feats))

        inp = tf.keras.layers.Input(shape=(self.n_feats,))
        x = tf.keras.layers.Lambda(lambda x: tf.gather(x, self.mask, axis=1))(inp)
        x = tf.keras.layers.Reshape((self.n_feats_repeat, self.n_feats, 1))(x)
        x = tf.keras.layers.Conv2D(18, kernel_size, strides=50, activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(100, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(50, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        # Sigmoid output: the loss below expects probabilities, and the
        # cross-validation loop thresholds predictions at 0.5.
        out = tf.keras.layers.Dense(1, activation='sigmoid')(x)

        model = tf.keras.Model(inp, out)

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
        # summary() prints by itself and returns None.
        model.summary()
        save_best = tf.keras.callbacks.ModelCheckpoint('cnn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'],
                  train_set['y'],
                  validation_data=(val_set['X'], val_set['y']),
                  epochs=100,
                  callbacks=[save_best, early_stop])
        # Restore the checkpointed (best) weights before returning.
        model.load_weights('cnn_model.w8')
        return model

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle each fold as a ``{'X': features, 'y': labels}`` dict."""
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

    def get_params(self):
        """The network has no external parameter dict."""
        return None

In [None]:
print("XGBoost Score: ", xgb_model.score)
print("LightGBM Score: ", lgb_model.score)
print("CatBoost Score: ", catb_model.score)
print("Neural Network Score: ", nn_model.score)
print("Voting Classifier Score: ", vc_result)
print("Super Learner Score: ", SL_result)

In [None]:
compareModels

In [None]:
#joblib.dump(xgb_model, 'xgb.pkl')
#joblib.dump(lgb_model, 'lgb.pkl')
#joblib.dump(catb_model, 'catb.pkl')

In [None]:
execution_time(start)