- The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Location of the diabetes CSV and the raw frame read from it.
path = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first three rows.
df.head(n=3)

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
df.info()

### What insights can we get out of this table?
- The minimum values of plasma glucose concentration, diastolic blood pressure, triceps skinfold thickness, 2-hour serum insulin and body mass index are all zero!
- I don't know about Glucose, Insulin or SkinThickness, but I do know that BloodPressure and BMI cannot be zero
- So it's better to replace those 0's with NaNs

In [None]:
# Descriptive statistics for each numeric column.
df.describe()

### Missing values
- Insulin and SkinThickness have the highest percentages of missing values

In [None]:
# Zeros in these columns are treated as missing values and recoded to NaN.
# `np.nan` is used because the `np.NAN` alias was removed in NumPy 2.0, and a
# column-wise `replace` avoids writing NaN element-by-element into integer columns.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

# Fraction of missing values per column.
df.isna().sum() / len(df)

### Regression Imputation
#### Random Imputation

In [None]:
# Columns that contain missing values and therefore need imputation.
missing_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

def random_imputation(df, feature):
    """Fill the gaps of ``feature + '_imp'`` with random draws from observed values.

    Rows where ``df[feature]`` is null receive, in the ``feature + '_imp'``
    column, a value sampled with replacement from the non-null entries of
    ``df[feature]``. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``feature``.
    feature : str
        Name of the column whose missing entries are imputed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    is_missing = df[feature].isnull()
    donors = df.loc[~is_missing, feature]

    # One draw per missing row; uses NumPy's global random state.
    draws = np.random.choice(donors, is_missing.sum(), replace = True)
    df.loc[is_missing, feature + '_imp'] = draws

    return df

# Seed each `<feature>_imp` column with the original values, then let
# random_imputation fill the rows where the original value is missing.
for feature in missing_columns:
    df = random_imputation(df.assign(**{feature + '_imp': df[feature]}), feature)

#### Stochastic Regression Imputation

In [None]:
from sklearn import linear_model

# One "Ran<feature>" column per imputed variable; filled inside the loop below.
random_data = pd.DataFrame(columns = ["Ran" + name for name in missing_columns])

for feature in missing_columns:
    imputed_name = feature + '_imp'
    target_name = "Ran" + feature
    observed = df[feature].notnull()

    # Start from the randomly imputed column; regression draws overwrite it below.
    random_data[target_name] = df[imputed_name]

    # Predictors: every column except the raw incomplete features and the current target.
    # NOTE(review): this includes 'Outcome' (and the other *_imp columns) -- confirm
    # that using the label as an imputation predictor is intended.
    parameters = list(set(df.columns) - set(missing_columns) - {imputed_name})

    model = linear_model.LinearRegression()
    model.fit(X = df[parameters], y = df[imputed_name])

    # Spread of the residuals on rows where the feature was actually observed.
    predict = model.predict(df[parameters])
    residuals = predict[observed] - df.loc[observed, imputed_name]
    std_error = residuals.std()

    # Add Gaussian noise around each prediction.
    random_predict = np.random.normal(loc = predict,
                                      scale = std_error,
                                      size = df[feature].shape[0])

    # Only originally-missing rows with a positive draw are overwritten;
    # the rest keep the randomly imputed value.
    fill_mask = df[feature].isnull() & (random_predict > 0)
    random_data.loc[fill_mask, target_name] = random_predict[fill_mask]


We can observe from the plots below that we have introduced some degree of variability into the variables while retaining their native distributions.

In [None]:
sns.set()
# One row per imputed variable: overlaid histograms on the left, box plots on the right.
# The row count follows `missing_columns`; squeeze=False keeps `axes` two-dimensional.
fig, axes = plt.subplots(nrows = len(missing_columns), ncols = 2, squeeze = False)
fig.set_size_inches(16, 10)

for index, variable in enumerate(missing_columns):
    # Observed values (default colour) against the imputed column (red).
    sns.histplot(df[variable].dropna(), kde = False, ax = axes[index, 0])
    sns.histplot(random_data["Ran" + variable], kde = False, ax = axes[index, 0], color = 'red')
    axes[index, 0].set(xlabel = variable + " / " + variable + '_imp')

    sns.boxplot(data = pd.concat([df[variable], random_data["Ran" + variable]], axis = 1),
                ax = axes[index, 1])

# The layout only needs to be computed once, after every panel has been drawn.
plt.tight_layout()

### Our new dataset

In [None]:
# Imputed features first, then the complete features, with the Outcome label last.
imputed_df = pd.concat(
    [random_data, df[['Pregnancies', 'Age', 'DiabetesPedigreeFunction', 'Outcome']]],
    axis=1,
)
cols = imputed_df.columns
# Replace the five leading "Ran<feature>" names with the plain feature names.
imputed_df.columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'] + cols[5:].tolist()
imputed_df.head(3)

#### Variance among features
- Insulin feature has a really high variance

In [None]:
# Variance of every column except the last one, drawn as a bar chart.
feature_variance = imputed_df[imputed_df.columns[:-1]].var()
feature_variance.plot(kind='bar', figsize=(16,4), colormap='Set2',
                      xlabel='Features', ylabel='Variance', title='Variance among features');

### EDA

### Univariate analysis
#### Pregnancies feature

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(14,5))

# Number of rows per Pregnancies value, split by Outcome.
g = sns.countplot(data=imputed_df, x='Pregnancies', hue='Outcome',
                  palette=sns.color_palette(), ax=axes)
g.set_title('Pregnancies by Outcome');

#### Box plots 
- It seems Insulin and Glucose have the most effect on the outcome

In [None]:
# Metrics to plot, arranged as three rows of two to match the 3x2 subplot grid used below.
list_of_metrics = [['Glucose','BloodPressure'], ['SkinThickness', 'Insulin'], ['BMI', 'Age']]

def box_plot_func(axes, metric):
    """Draw a horizontal box plot of ``metric`` from ``imputed_df``, grouped by ``Outcome``.

    Parameters
    ----------
    axes : matplotlib.axes.Axes
        Axes handed to ``DataFrame.boxplot`` for drawing.
    metric : str
        Name of the ``imputed_df`` column to plot.
    """
    bp_dict = imputed_df.boxplot(column=f'{metric}', by='Outcome', fontsize=12, ax=axes,
                                 vert=False, return_type='both', patch_artist=True)
    # One face colour per Outcome group.
    colors = ['b', 'r']
    ax = axes
    # `.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
    for _, (ax, row) in bp_dict.items():
        ax.set_xlabel('')
        for box, color in zip(row['boxes'], colors):
            box.set_facecolor(color)

    ax.set_title("Boxplot of " + f"{metric}")
    ax.set_xlabel(f"{metric}")
    # With by='Outcome' and vert=False the y axis carries the Outcome groups.
    ax.set_ylabel('Outcome')
    # Remove the automatic "grouped by" figure title.
    plt.suptitle("")

    
fig, axes = plt.subplots(3,2, figsize=(18,20))
# Pair each row of axes with its row of metrics, then each panel with its metric.
for axes_row, metric_row in zip(axes, list_of_metrics):
    for panel, metric in zip(axes_row, metric_row):
        box_plot_func(panel, metric)

### Pearson correlation for binary categorical variable

In [None]:
f, ax = plt.subplots(1, figsize=(18,8))
# Annotated heatmap of the pairwise correlations between all columns.
correlations = imputed_df.corr()
sns.heatmap(correlations, annot=True, ax=ax);

### Distribution of our features

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(18,6))
# Walk the 2x4 grid in row-major order, one histogram (with KDE) for each of the first eight columns.
for panel, column in zip(axes.ravel(), imputed_df.columns):
    sns.histplot(imputed_df[column], ax=panel, color='blue', kde=True)

### Proportions of 1's and 0's
- The dataset is imbalanced: the two Outcome classes are not equally represented

In [None]:
# Pie chart of the number of rows in each Outcome class, labelled with percentages.
imputed_df.groupby('Outcome')['Outcome'].size().plot(kind='pie', autopct='%1.1f%%');

### Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label.
X, y = imputed_df.drop('Outcome', axis=1), imputed_df['Outcome']

# Stratify on the label so both parts keep the class proportions, and fix
# random_state so the split (and every score computed from it) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

print('Train', x_train.shape, y_train.shape)
print('Test', x_test.shape, y_test.shape)

### Feature selection
- f_classif (ANOVA F-test)
- Chi2
- Mutual Information

In [None]:
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest, SelectPercentile

class UnivariateFeatureSelction:
    """Thin wrapper around ``SelectPercentile`` with a scoring function chosen by name.

    Parameters
    ----------
    n_features : float
        Fraction of features to keep (e.g. ``0.1`` keeps the top 10%); it is
        converted to a whole-number percentile.
    scoring : str
        One of ``'f_classif'``, ``'chi2'`` or ``'mutual_info_classif'``.

    Raises
    ------
    KeyError
        If ``scoring`` is not one of the supported names.
    """

    def __init__(self, n_features, scoring):
        valid_scoring = {
            'f_classif': f_classif,
            'chi2': chi2,
            'mutual_info_classif': mutual_info_classif
        }

        self.selection = SelectPercentile(
            valid_scoring[scoring], percentile=self._to_percentile(n_features))

    @staticmethod
    def _to_percentile(n_features):
        """Convert a fraction to a whole-number percentile.

        Rounds instead of truncating, because products such as ``0.29 * 100``
        evaluate to ``28.999999999999996`` and ``int()`` alone would give 28.
        """
        return int(round(n_features * 100))

    def fit(self, X, y):
        """Fit the underlying selector on ``X``/``y`` and return the fitted selector."""
        return self.selection.fit(X, y)

    def transform(self, X):
        """Return ``X`` reduced to the features chosen by the fitted selector."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the selector on ``X``/``y`` and return the reduced ``X``."""
        return self.selection.fit_transform(X, y)

# Keep the top 10% of features ranked by the chi-squared statistic.
ufs = UnivariateFeatureSelction(scoring="chi2", n_features=0.1)

# Fit on the training part only, then apply the same selection to the test part.
x_train_transformed = ufs.fit_transform(x_train, y_train)
x_test_transformed = ufs.transform(x_test)

### Model

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score, KFold

# Baseline: standardise the selected features, then fit a logistic regression.
clf = make_pipeline(StandardScaler(), LogisticRegression())

# ROC-AUC over 10 cross-validation folds of the training data.
roc_auc_score = cross_val_score(clf, x_train_transformed, y_train, scoring='roc_auc', cv=10)
roc_auc_score_min, roc_auc_score_max, roc_auc_score_mean = (
    roc_auc_score.min(),
    roc_auc_score.max(),
    roc_auc_score.mean(),
)
print(f'Min score: {roc_auc_score_min}, Max score: {roc_auc_score_max}, Mean: {roc_auc_score_mean}')

### Comparison of all models

In [None]:
# (display name, pipeline) pairs to compare. Only the logistic regression and
# SVM pipelines include a StandardScaler step.
clfs = [
    ("RandomForestClassifier",
     Pipeline([("RandomForest", RandomForestClassifier())])),
    ("GradientBoostingClassifier",
     Pipeline([("GradientBoosting", GradientBoostingClassifier())])),
    ("DecisionTreeClassifier",
     Pipeline([("DART", DecisionTreeClassifier())])),
    ("LogisticRegression",
     Pipeline([("Scaler", StandardScaler()),
               ("LogisticRegression", LogisticRegression())])),
    ("SVM",
     Pipeline([("Scaler", StandardScaler()),
               ("SVM", SVC())])),
]

### Metrics
# roc_auc
scoring = 'roc_auc'
n_folds = 10

results, names = [], []

for name, model in clfs:
    # An integer `cv` makes scikit-learn use stratified folds for classifiers,
    # which keeps the class ratio of the imbalanced target in every fold and
    # matches the splitting used for the logistic-regression baseline (cv=10).
    cv_results = cross_val_score(model, x_train_transformed, y_train,
                                 cv=n_folds, scoring=scoring, n_jobs=-1)
    names.append(name)
    results.append(cv_results)
    
# Box plot of the per-fold scores, one box per algorithm.
fig, ax = plt.subplots(figsize=(16,6))
fig.suptitle('Classification Algorithm Comparison', fontsize=14)
sns.boxplot(data=results, ax=ax)
# Label each box with its algorithm name, tilted for readability.
ax.set_xticklabels(names, rotation=45)
ax.set_xlabel("Algorithm", fontsize=14)
ax.set_ylabel("ROC-AUC of Models", fontsize=14)
plt.show()