# Objective: Classify whether a person has diabetes using different features

In [None]:
import numpy as np # linear algebra
import pandas as pd 
from sklearn import preprocessing
import matplotlib.pyplot as plt 
import seaborn as sns
import missingno
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics, preprocessing

#Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

#preprocessing
from sklearn.impute import KNNImputer

## Exploratory Data Analysis

References:
1. https://github.com/dformoso/sklearn-classification/blob/master/Data%20Science%20Workbook%20-%20Census%20Income%20Dataset.ipynb
2. https://github.com/mrdbourke/your-first-kaggle-submission/blob/master/kaggle-titanic-dataset-example-submission-workflow.ipynb
Associated video: https://www.youtube.com/watch?v=f1y9wDDxWnA&feature=youtu.be

In [None]:
df = pd.read_csv('../input/diabetes-dataset/diabetes2.csv')
df.dtypes

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

This shows that there are no null values in the dataset.

In [None]:
df.describe()

In [None]:
missingno.matrix(df) #nice way of visualizing missing values

# **Feature: Pregnancies**

In [None]:
# Overlay the 'Pregnancies' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'Pregnancies'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

Distplot shows both the histogram and the KDE together. From the graph, the number of pregnancies seems to distinguish the two classes to some extent.

In [None]:
df['Pregnancies'].value_counts()
#df['Pregnancies'].unique()

In [None]:
sns.boxplot(x = df['Pregnancies'])

# Feature: Glucose

In [None]:
# Overlay the 'Glucose' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'Glucose'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
min(df['Glucose']) 

In [None]:
df[df['Glucose'] == 0]

Normal Glucose should be between 60 to 140, 0 seems absurd

In [None]:
df[df['Glucose'].lt(60)]

In [None]:
df['Glucose'].value_counts() #not significant

In [None]:
sns.boxplot(x = df['Glucose'])

Since only five rows have Glucose = 0, they can be removed.

# Feature: BloodPressure

In [None]:
# Overlay the 'BloodPressure' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'BloodPressure'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
min(df['BloodPressure']) #Again, seems absurd

In [None]:
# Number of rows whose recorded blood pressure is 0, then their share of all rows.
zero_bp_rows = df.loc[df['BloodPressure'] == 0].shape[0]
print(zero_bp_rows)
print(zero_bp_rows/df.shape[0])

In [None]:
df[df['BloodPressure'].lt(40)]

In [None]:
df[df['BloodPressure'].lt(40)].shape

In [None]:
df[df['BloodPressure'].gt(120)]

In [None]:
sns.boxplot(x = df['BloodPressure'])

Since 35 rows have Blood pressure = 0, it would not be wise to drop them 

# Feature: SkinThickness

In [None]:
# Overlay the 'SkinThickness' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'SkinThickness'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
sns.boxplot(x = df['SkinThickness'])

In [None]:
df[df['SkinThickness'] == 0].shape

In [None]:
df[df['SkinThickness'].lt(2)]

We can't drop these 227 rows either.

# Feature: Insulin

In [None]:
# Overlay the 'Insulin' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'Insulin'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
sns.boxplot(x = df['Insulin'])

In [None]:
df[df['Insulin'].lt(16)]

In [None]:
df[df['Insulin'] == 0]

In [None]:
df[df['Insulin'] == 0].shape

# Feature: BMI

In [None]:
# Overlay the 'BMI' distribution of diabetic (Outcome == 1) and
# non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'BMI'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
sns.boxplot(x = df['BMI'])

In [None]:
df[df.BMI == 0]

In [None]:
df[df.BMI == 0].shape

# Feature: DiabetesPedigreeFunction

In [None]:
# Overlay the 'DiabetesPedigreeFunction' distribution of diabetic (Outcome == 1)
# and non-diabetic (Outcome == 0) patients on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'DiabetesPedigreeFunction'], color=colour, rug=True)
plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
sns.boxplot(x = df['DiabetesPedigreeFunction'])

# Feature: Age

In [None]:
# Overlay the 'Age' distribution of diabetic (Outcome == 1) patients,
# non-diabetic (Outcome == 0) patients and the whole dataset on one figure.
plt.figure()
for outcome, colour in ((1, "darkturquoise"), (0, "lightcoral")):
    ax = sns.distplot(df.loc[df.Outcome == outcome, 'Age'], color=colour, rug=True)
sns.distplot(df['Age'], color="green", rug=True)
plt.legend(['Diabetes', 'No Diabetes', 'all'])

In [None]:
sns.boxplot(x = df['Age'])

We saw that there are many missing values in the columns, but they have been recorded as zero. Let's mark those zeros as NaN and visualize the gaps.

In [None]:
# Columns in which a stored 0 is treated as "value not recorded".
zero_means_missing = ['Insulin', 'SkinThickness', 'BloodPressure', 'BMI', 'Glucose']


def _zero_to_nan(value):
    """Return NaN for a zero entry and the entry itself otherwise."""
    return np.nan if value == 0 else value


# Work on a deep copy so the zero-filled `df` stays untouched.
df_with_na = df.copy(deep=True)
for column in zero_means_missing:
    df_with_na[column] = df[column].map(_zero_to_nan)

missingno.matrix(df_with_na)  # visualise where the missing values are

In [None]:
sns.heatmap(df.corr(), annot = True)

In [None]:
sns.heatmap(df_with_na.corr(), annot = True)

It can be seen that there is a high correlation between BMI and SkinThickness. Since SkinThickness has many missing values, maybe we can drop it.

# Bivariate Analysis

In [None]:
sns.pairplot(df, hue='Outcome')

No pair of features separates the two classes very well.

# Feature importance analysis

We saw that several columns have many missing values. Let's see how we can handle them.

In [None]:
# Baseline feature ranking: random forest fitted on the raw, zero-filled data.
features = df.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

We know BMI has 11 zero values. Let's see if the feature importance increases if we remove those values

In [None]:
# Feature ranking after discarding the rows whose BMI is recorded as 0.
df_no_BMI0 = df[df.BMI != 0]
features = df_no_BMI0.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df_no_BMI0['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))


Remains more or less the same. Let's check it with Blood pressure (35 zero values)

In [None]:
# Feature ranking after discarding the rows whose BloodPressure is recorded as 0.
df_no_BP0 = df[df.BloodPressure != 0]
features = df_no_BP0.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df_no_BP0['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

Let's look at the insulin data (370 missing)

In [None]:
# Feature ranking after discarding the rows whose Insulin is recorded as 0.
df_no_IN0 = df[df.Insulin != 0]
features = df_no_IN0.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df_no_IN0['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

Dropping the 0 values shows a significant increase in the feature importance of Insulin. This shows we need to handle them somehow.

In [None]:
# Feature ranking after discarding the rows whose SkinThickness is recorded as 0.
df_no_SK0 = df[df.SkinThickness != 0]
features = df_no_SK0.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df_no_SK0['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

Changes the importances of other features, but skin thickness feature importance remains low

In [None]:
# Feature ranking on the complete cases only (every row containing a NaN is dropped).
df_no_missing = df_with_na.dropna()
print(df_no_missing.shape)
features = df_no_missing.drop('Outcome', axis=1)

# Seed the forest so the importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, df_no_missing['Outcome'])

print(features.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

So we see after removing the missing values, the feature importance list looks like this. Glucose, insulin and age seem to be the three most important features.

## Data preprocessing:

We saw missing values in the following columns:
1. Glucose: 5
2. BMI: 11
3. Insulin: 370
4. SkinThickness: 227
5. Blood Pressure: 35

Steps:
1. Remove the missing values of Glucose and BMI (both very important features) [DATASET 1]
2. Drop SkinThickness because the feature importance is low [DATASET 2]
3. Create a train test split (test split should not contain any missing value)
4. Use data imputation 

References:
1. General overview of popular methods: https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
> > 
2. Imputation using KNN: https://datascienceplus.com/knnimputer-for-missing-value-imputation-in-python-using-scikit-learn/
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html



## DATASET 1

In [None]:
# Both frames are split with the same test fraction and seed.
split_options = dict(test_size=0.20, random_state=42)

# Zero-filled frame: only the test features and test labels of this split are kept.
_, X_test, _, y_test = train_test_split(df.drop('Outcome', axis=1), df['Outcome'], **split_options)

# NaN-marked frame: supplies the training features/labels and the NaN-marked test features.
X, X_test_with_na, y, _ = train_test_split(df_with_na.drop('Outcome', axis=1), df_with_na['Outcome'], **split_options)

In [None]:
# DATASET 1: keep only the training rows without any NaN, then standardise.
df_train = pd.concat([X, y], axis=1)
df_train_without_missing = df_train.dropna()

X_wo_missing_train = df_train_without_missing.drop(columns=['Outcome'])
y_wo_missing_train = df_train_without_missing['Outcome']

# The scaler is fitted on the complete training rows only and then reused
# for the training features and both versions of the test features.
scaler = preprocessing.StandardScaler()
scaler.fit(X_wo_missing_train)

X_wo_missing_train = scaler.transform(X_wo_missing_train)
X_wo_missing_test = scaler.transform(X_test)                    # zero-filled test features
X_wo_missing_test_with_na = scaler.transform(X_test_with_na)    # NaN-marked test features

# Training-set size before and after dropping incomplete rows.
print(X.shape)
print(df_train_without_missing.shape)

## DATASET 2

In [None]:
# DATASET 2: fill the NaN entries of the training features with KNN imputation.
imputer = KNNImputer(n_neighbors=5)
X_knn_imp = imputer.fit_transform(X)
df_knn_imp = pd.DataFrame(X_knn_imp, columns=X.columns)  # DATASET 2

# Feature ranking on the imputed data; the forest is seeded so the
# importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(df_knn_imp, y)

print(df_knn_imp.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=df_knn_imp.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

# Standardise with statistics taken from the imputed training data.
scaler = preprocessing.StandardScaler().fit(df_knn_imp)
X_knn_imp_train = scaler.transform(df_knn_imp)
# NOTE(review): X_test comes from the zero-filled frame and is not passed
# through `imputer`, so missing test values stay encoded as 0 while the
# training values are imputed -- confirm this mismatch is intended.
X_knn_imp_test = scaler.transform(X_test)

## DATASET 3

In [None]:
# IterativeImputer is experimental; the enabling import must come first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# DATASET 3: fill the NaN entries of the training features with iterative imputation.
imp = IterativeImputer(max_iter=10, random_state=42)
imp.fit(X)
X_iter_imp = imp.transform(X)
X_iter_imp = pd.DataFrame(X_iter_imp, columns=X.columns)

# Feature ranking on the imputed data; the forest is seeded so the
# importances are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_iter_imp, y)

print(X_iter_imp.columns)
print(clf.feature_importances_)

# No separate plt.figure() here: DataFrame.plot creates its own figure.
importance = pd.DataFrame(clf.feature_importances_, index=X_iter_imp.columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance)/2))

# Standardise with statistics taken from the imputed training data.
scaler = preprocessing.StandardScaler().fit(X_iter_imp)
X_iter_imp_train = scaler.transform(X_iter_imp)
# NOTE(review): X_test comes from the zero-filled frame and is not passed
# through `imp`, so missing test values stay encoded as 0 while the
# training values are imputed -- confirm this mismatch is intended.
X_iter_imp_test = scaler.transform(X_test)

# Model training: 

Steps: 
1. Choose dataset
2. Scale train and test data
3. Do gridsearch (if applicable) for best hyperparameters

References: 
1. Scaling: https://scikit-learn.org/stable/modules/preprocessing.html
2. Cross validation: 
https://stats.stackexchange.com/questions/411290/how-to-use-a-cross-validated-model-for-prediction
https://scikit-learn.org/stable/modules/cross_validation.html
3. Grid Search:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
4. Randomized Search vs Grid Search: 
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html
    https://stackoverflow.com/questions/57426633/what-is-randomsearchcv-and-gridsearchcv
5. ROC Curves:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

6. Logistic regression:
    a. Documentation : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    b. Which solver to use: 
    https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451
    https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions/52388406#52388406

## Utils:

In [None]:
# 
def fit_ml_algo(algo, X_train, y_train, X_test, y_test, cv):
    """Fit a classifier, print an evaluation report and return its predictions.

    Parameters
    ----------
    algo : estimator
        Classifier exposing ``fit``, ``predict``, ``predict_proba`` and
        ``score``. It is fitted on the training data by this call.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for the test report.
    cv : value forwarded as ``cv`` to ``cross_val_predict`` and
        ``cross_val_score`` on the training data.

    Returns
    -------
    tuple
        ``(train_prediction, test_prediction, test_probs)``: the
        cross-validated predictions for the training set, the predictions for
        the test set, and column 1 of ``predict_proba`` for the test set.
    """
    model = algo.fit(X_train, y_train)
    test_prediction = model.predict(X_test)
    test_probs = model.predict_proba(X_test)[:, 1]
    train_accuracy = model.score(X_train, y_train) * 100
    test_accuracy = model.score(X_test, y_test) * 100

    # Honour the caller's `cv` instead of a hard-coded fold count.
    train_prediction = model_selection.cross_val_predict(algo, X_train, y_train, cv=cv, n_jobs=-1)
    acc_cv = metrics.accuracy_score(y_train, train_prediction) * 100
    # Score the estimator under evaluation (not a default LogisticRegression)
    # so the reported mean/std belongs to the same model as everything else.
    model_scores = model_selection.cross_val_score(algo, X_train, y_train, cv=cv, n_jobs=-1)

    print("Cross Validation accuracy: (%0.2f) %0.4f (+/- %0.4f)" % (acc_cv, model_scores.mean(), model_scores.std() * 2))
    print('Model Test Accuracy: %0.2f   Model Train Accuracy: %0.2f'%(test_accuracy, train_accuracy))
    print(metrics.classification_report(y_test, test_prediction))
    print("Confusion matrix")
    print(metrics.confusion_matrix(y_test, test_prediction))

    return train_prediction, test_prediction, test_probs
    
    
# Plot the ROC curve (FPR vs. TPR over all thresholds) together with its AUC
def plot_roc_curve(y_test, preds):
    """Draw the ROC curve of `preds` against the true labels `y_test`.

    The area under the curve is shown in the legend and the chance diagonal
    is drawn as a dashed red line. The figure is shown immediately.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, preds)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area_under_curve)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal

    # A small margin keeps the curve from lying on the plot border.
    axis_limits = [-0.01, 1.01]
    plt.xlim(axis_limits)
    plt.ylim(axis_limits)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    
    
# Print a summary of the best-ranked grid-search candidates
def report(results, n_top = 5):
    """Print rank, mean/std validation score and parameters of the top candidates.

    `results` is indexed like ``GridSearchCV.cv_results_``: it must provide
    'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'.
    Ranks 1..n_top are printed in order; candidates sharing a rank are all
    printed.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for position in tied_positions:
            mean_score = results['mean_test_score'][position]
            score_std = results['std_test_score'][position]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, score_std))
            print("Parameters: {0}".format(results['params'][position]))
            print("")

## Dataset 1: data without missing values

In [None]:
# Grid-search C, penalty and solver for logistic regression on Dataset 1.
c = [0.01, 0.1, 1, 5, 10]
param_grid = [
    {'C': c, 'penalty': ['l2'], 'solver': ['liblinear', 'newton-cg', 'saga', 'lbfgs']},
    {'C': c, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
]

lg_grid = model_selection.GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1)
# Dataset 1 is stored as X_wo_missing_train / y_wo_missing_train by the
# DATASET 1 preprocessing cell; X_train_wo_msng / y_train_wo_msng are not
# defined anywhere in the notebook.
lg_grid.fit(X_wo_missing_train, y_wo_missing_train)

report(lg_grid.cv_results_)

In [None]:
# Evaluate the chosen logistic regression on Dataset 1. The arguments are the
# arrays produced by the DATASET 1 preprocessing cell (the scaled complete-case
# training data and the scaled zero-filled test features).
train_prediction_1, test_prediction_1, test_probs_1 = fit_ml_algo(
    LogisticRegression(C = 0.1, penalty = 'l1', solver = 'liblinear'),
    X_wo_missing_train, y_wo_missing_train, X_wo_missing_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs_1)

## Dataset 2: 

In [None]:
# Grid-search C, penalty and solver for logistic regression on Dataset 2 (KNN-imputed).
c = [0.01, 0.1, 1, 5, 10]
l2_grid = {'C': c, 'penalty': ['l2'], 'solver': ['liblinear', 'newton-cg','saga','lbfgs']}
l1_grid = {'C': c, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}
param_grid = [l2_grid, l1_grid]

lg_grid = model_selection.GridSearchCV(LogisticRegression(), param_grid, n_jobs = -1)
lg_grid.fit(X_knn_imp_train, y)

# Show the best-ranked parameter combinations.
report(lg_grid.cv_results_)

In [None]:
train_prediction_2, test_prediction_2, test_probs_2 = fit_ml_algo(LogisticRegression(C = 0.1, penalty = 'l1', solver = 'liblinear'), X_knn_imp_train, y, X_knn_imp_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs_2)

## Dataset 3

In [None]:
# Grid-search C, penalty and solver for logistic regression on Dataset 3 (iteratively imputed).
c = [0.01, 0.1, 1, 5, 10]
l2_grid = {'C': c, 'penalty': ['l2'], 'solver': ['liblinear', 'newton-cg','saga','lbfgs']}
l1_grid = {'C': c, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}
param_grid = [l2_grid, l1_grid]

lg_grid = model_selection.GridSearchCV(LogisticRegression(), param_grid, n_jobs = -1)
lg_grid.fit(X_iter_imp_train, y)

# Show the best-ranked parameter combinations.
report(lg_grid.cv_results_)

In [None]:
train_prediction_3, test_prediction_3, test_probs_3 = fit_ml_algo(LogisticRegression(C = 0.1, penalty = 'l1', solver = 'saga'), X_iter_imp_train, y, X_iter_imp_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs_3)

## SVM

1. Documentation:
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

2. ROC for SVC:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

In [None]:
#gridsearch params
c = [0.01, 0.1, 1, 5, 10]
param_grid = [{'C': c, 'kernel': ['linear', 'poly','rbf','sigmoid']}]

### SVM: Dataset 1

In [None]:
# Grid-search the SVM on Dataset 1. The data is stored as
# X_wo_missing_train / y_wo_missing_train by the DATASET 1 preprocessing cell;
# X_train_wo_msng / y_train_wo_msng are not defined anywhere in the notebook.
svc_grid = model_selection.GridSearchCV(SVC(), param_grid, n_jobs = -1)
svc_grid.fit(X_wo_missing_train, y_wo_missing_train)
report(svc_grid.cv_results_)

In [None]:
# Evaluate the chosen SVM on Dataset 1. The arguments are the arrays produced
# by the DATASET 1 preprocessing cell (the scaled complete-case training data
# and the scaled zero-filled test features).
train_prediction, test_prediction, test_probs = fit_ml_algo(
    SVC(C =5, kernel = 'linear', probability = True),
    X_wo_missing_train, y_wo_missing_train, X_wo_missing_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs)

### SVM: Dataset 2

In [None]:
svc_grid = model_selection.GridSearchCV(SVC(), param_grid, n_jobs = -1)
svc_grid.fit(X_knn_imp_train, y)
report(svc_grid.cv_results_)

In [None]:
train_prediction, test_prediction, test_probs = fit_ml_algo(SVC(C = 0.01, kernel = 'linear', probability = True), X_knn_imp_train, y, X_knn_imp_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs)

### SVM: Dataset 3

In [None]:
svc_grid = model_selection.GridSearchCV(SVC(), param_grid, n_jobs = -1)
svc_grid.fit(X_iter_imp_train, y)
report(svc_grid.cv_results_)

In [None]:
train_prediction, test_prediction, test_probs = fit_ml_algo(SVC(C = 0.01, kernel = 'linear', probability = True), X_iter_imp_train, y, X_iter_imp_test, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs)

## XGBoost

1. Documentation: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

2. Hyperparameter grid search in XGBoost: https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

3. 

In [None]:
# Hyperparameter grid for the XGBoost search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# Base classifier whose hyperparameters are searched.
estimator = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

# 10-fold grid search scored by ROC AUC.
search_settings = dict(scoring='roc_auc', n_jobs=10, cv=10, verbose=True)
xg_grid = model_selection.GridSearchCV(estimator=estimator, param_grid=params, **search_settings)

### XGBoost: Dataset 1

In [None]:
xg_grid.fit(X_wo_missing_train, y_wo_missing_train)

In [None]:
best_estimator = xg_grid.best_estimator_
report(xg_grid.cv_results_)

In [None]:
train_prediction, test_prediction, test_probs = fit_ml_algo(best_estimator, X_wo_missing_train, y_wo_missing_train, X_wo_missing_test_with_na, y_test, cv = 10)

In [None]:
plot_roc_curve(y_test, test_probs)

### XGBoost: Dataset 2

In [None]:
xg_grid.fit(X_knn_imp_train, y)

In [None]:
best_estimator = xg_grid.best_estimator_
report(xg_grid.cv_results_)

In [None]:
train_prediction, test_prediction, test_probs = fit_ml_algo(best_estimator, X_knn_imp_train, y, X_knn_imp_test, y_test, cv = 10)

### XGBoost: Dataset 3

In [None]:
xg_grid.fit(X_iter_imp_train, y)

In [None]:
best_estimator = xg_grid.best_estimator_
report(xg_grid.cv_results_)

In [None]:
train_prediction, test_prediction, test_probs = fit_ml_algo(best_estimator, X_iter_imp_train, y, X_iter_imp_test, y_test, cv = 10)