In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
df

In [None]:
df.isnull().sum() # Counts NaN entries per column. NOTE: this only detects NaN; zeros used as placeholders for missing readings (examined in the histograms below) are not counted here

In [None]:
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Per-feature histograms, split by outcome class.
# Every column except the last is plotted (the last column is used as the target later on).
feature_cols = df.columns[:-1]
fig = plt.figure(figsize=(8,20))

for index, col in enumerate(feature_cols):
    # One row per plotted feature, so the grid has no unused slot at the bottom.
    ax = fig.add_subplot(len(feature_cols), 1, index+1)
    # Use the same value range for both classes so the 50 bins line up and
    # the bar heights of the two groups are directly comparable.
    value_range = (df[col].min(), df[col].max())
    # Semi-transparent bars keep the first histogram visible where they overlap.
    ax.hist(df.loc[df['Outcome']==0, col], color='Green', bins=50,
            range=value_range, alpha=0.6, label='Non-Diabetic')
    ax.hist(df.loc[df['Outcome']==1, col], color='Red', bins=50,
            range=value_range, alpha=0.6, label='Diabetic')
    ax.legend()

    ax.set_title('Distribution of '+ col)


plt.tight_layout()
plt.show()

The histograms show that the zero values in BMI, SkinThickness, BloodPressure and Glucose are clearly invalid readings rather than genuine measurements, so they are treated as outliers and need to be corrected. Insulin, however, can legitimately be zero, since the subject might not have been administered insulin, so it is left unchanged.

In [None]:
# Second-smallest distinct Glucose value, i.e. the candidate replacement for the smallest one.
sorted(df['Glucose'].unique())[1]

In [None]:
# Replace zero placeholders and low-end outliers with a lower percentile of the valid values.
def outlier_correction(df,feature,percentile):
    """Raise zeros and low-end values of ``df[feature]`` to a percentile floor, in place.

    The floor is the ``percentile`` quantile of the non-zero, non-NaN values.
    Computing it over the whole column would let the zero placeholders drag the
    quantile down to 0 whenever they make up more than ``percentile`` of the
    rows, turning the correction into a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    feature : str
        Name of the column to correct.
    percentile : float
        Quantile in [0, 1] used as the replacement value and lower bound.

    Returns
    -------
    None
        ``df`` is modified in place. An integer column is converted to float
        when a replacement is made, because the quantile is generally not an
        integer.
    """
    values = df[feature]
    valid = values[values != 0].dropna()
    if valid.empty:
        # Nothing but zeros/NaN: there is no meaningful floor to substitute.
        return

    floor = valid.quantile(percentile)
    needs_fix = (values == 0) | (values < floor)
    if not needs_fix.any():
        return

    if values.dtype.kind in 'iu':
        # Upcast explicitly; recent pandas versions refuse to upcast implicitly
        # when a float is written into an integer column.
        df[feature] = values.astype(float)
    df.loc[needs_fix, feature] = floor

    

In [None]:
# Alternative: replace the smallest value (zero, in our case) with the next-smallest distinct value.
def outlier_minima(df,feature):
    """Overwrite every occurrence of the column minimum with the second-smallest value, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    feature : str
        Name of the column to correct.

    Returns
    -------
    None
        ``df`` is modified in place. If the column has fewer than two distinct
        non-NaN values there is no "next smallest" value, so it is left as is.

    Notes
    -----
    The function always replaces the current minimum, whatever it is. Calling
    it a second time on the same column therefore overwrites genuine values,
    so it should be run only once per column.
    """
    distinct = sorted(df[feature].dropna().unique())
    if len(distinct) < 2:
        # A constant (or empty) column has no second-smallest value.
        return
    lowest, runner_up = distinct[0], distinct[1]
    df.loc[df[feature] == lowest, feature] = runner_up

In [None]:
# Apply the minimum-replacement correction to each feature whose zero values were judged invalid.
for zero_coded_feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI'):
    outlier_minima(df, zero_coded_feature)

In [None]:
df.describe()

In [None]:
# Distribution after outlier correction
# Every column except the last is plotted (the last column is used as the target later on).
feature_cols = df.columns[:-1]
fig = plt.figure(figsize=(8,20))

for index, col in enumerate(feature_cols):
    # One row per plotted feature, so the grid has no unused slot at the bottom.
    ax = fig.add_subplot(len(feature_cols), 1, index+1)
    # Use the same value range for both classes so the 50 bins line up and
    # the bar heights of the two groups are directly comparable.
    value_range = (df[col].min(), df[col].max())
    # Semi-transparent bars keep the first histogram visible where they overlap.
    ax.hist(df.loc[df['Outcome']==0, col], color='Green', bins=50,
            range=value_range, alpha=0.6, label='Non-Diabetic')
    ax.hist(df.loc[df['Outcome']==1, col], color='Red', bins=50,
            range=value_range, alpha=0.6, label='Diabetic')
    ax.legend()

    ax.set_title('Distribution of '+ col)


plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df,hue='Outcome')

The classes are clearly not linearly separable, so it would be better to try ensemble techniques.

### Machine learning phase

In [None]:
# Split features and target by position: X is every column but the last, y is the last column.
# Both remain pandas objects (DataFrame / Series); they are not converted to NumPy arrays.
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [None]:
np.unique(y,return_counts=True) # Imbalanced Data, Better to use stratification

### Leave-one-out cross-validation with several algorithms, after normalization

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics
from sklearn.utils import shuffle
import lightgbm
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

# Shuffle the rows. This permutes the pandas index labels along with the rows,
# so everything below must select rows by POSITION (.iloc), never by label.
X, y = shuffle(X, y, random_state=20)

# Model under evaluation; swap in one of the commented alternatives to compare.
# (XGBClassifier would additionally need `from xgboost import XGBClassifier`.)
#clf= XGBClassifier()
clf=RandomForestClassifier(class_weight='balanced', random_state=20)
#clf=lightgbm.LGBMClassifier(is_unbalance=True)
#clf = KNeighborsClassifier(n_neighbors=10, weights = 'distance')

# Leave-one-out: train on all rows but one, predict the held-out row.
predicted = []
actual = []
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    X_train = X.iloc[train_index,:]
    X_test = X.iloc[test_index,:]
    # loo.split yields positional indices. Plain y[train_index] would look the
    # integers up as index LABELS, which no longer match positions after the
    # shuffle and would pair each feature row with another row's target.
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Normalize with statistics from the training fold only (no leakage from the test row).
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)
    X_train_norm = (X_train - X_train_mean)/X_train_std
    X_test_norm = (X_test - X_train_mean)/X_train_std

    # Train
    clf.fit(X_train_norm, y_train)

    # Test: collect in lists (cheaper than re-allocating arrays with np.append each iteration).
    predicted.extend(clf.predict(X_test_norm))
    actual.extend(y_test)

y_pred = np.asarray(predicted)
y_true = np.asarray(actual)

# Metrics (per-class values are ordered as: Non-Diabetic, Diabetic)
print("Labels: Non-Diabetic, Diabetic")
print("Confusion matrix")
print(metrics.confusion_matrix(y_true, y_pred))
print("Precision")
print(metrics.precision_score(y_true, y_pred, average=None))
print("Recall")
print(metrics.recall_score(y_true, y_pred, average=None))
print("F1 score")
print(metrics.f1_score(y_true, y_pred, average=None))
# Weighted F1 across both classes, then overall accuracy.
print(metrics.f1_score(y_true, y_pred, average='weighted'))
print(metrics.accuracy_score(y_true,y_pred))

|Algorithm | Accuracy |  F1 Score |
|----------|----------|-----------|
| KNN-10   |  0.58    |   0.54    |
| LGB      |  0.53    |   0.529   |
| RF       |  0.63    |   0.56    |

### Random forest was the most effective: 5-fold cross-validation, no normalisation

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

# Random forest with a heavier weight on the diabetic class (label 1).
# A fixed random_state makes the scores reproducible across re-runs.
clf = RandomForestClassifier(max_depth=8,class_weight={0:0.28,1:0.72},random_state=20)

# One 5-fold run that reports both metrics, so accuracy and weighted F1 come
# from the same fitted models rather than two independent runs.
cv_results = cross_validate(clf, X, y, cv=5, scoring=['accuracy','f1_weighted'])
scores = cv_results['test_accuracy']   # per-fold accuracy (same metric as the default classifier score)
f1 = cv_results['test_f1_weighted']    # per-fold weighted F1
scores

In [None]:
f1

In [None]:
scores.mean()

In [None]:
f1.mean()

Although the results are not great, giving a higher weight to the diabetic state reduces false negatives, and the accuracy and F1 score are decent.

In [None]:
from scipy import stats
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
import sklearn

# Shuffle data
X, y = shuffle(X, y, random_state=20)

# Base estimator; a fixed random_state makes the search reproducible.
clf1=RandomForestClassifier(class_weight='balanced', random_state=20)

# Candidate hyper-parameter values to sample from.
# min_samples_split must be an integer >= 2 (or a fraction), so 1 is not a
# valid candidate: with error_score=0 below, such fits would fail silently,
# be scored as 0 and waste search iterations.
param_dist = {'n_estimators': [100,150,200,300],
              'min_samples_split':[2,4],
              'max_depth': [3, 5, 6, 8, 9],
              'min_samples_leaf':[1,2,3]
             }

# Weighted F1 is used as the selection metric.
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')

# 20 random parameter combinations, each evaluated with 5-fold cross-validation.
clfcv = RandomizedSearchCV(clf1, 
                         param_distributions = param_dist,
                         cv = 5,  
                         n_iter = 20, 
                         scoring = scorer, 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1,random_state=20)

In [None]:
clfcv.fit(X,y)

In [None]:
clfcv.best_score_

The weighted F1 score obtained after tuning the model with randomized-search cross-validation.

In [None]:
clfcv.best_params_

In [None]:
clfcv.best_estimator_

After randomized-search CV we get an F1 score of 77.18%.
Thus, although the predictions are still not up to the mark, tuning the model with randomized-search cross-validation yields a weighted F1 score of 77.18%. Also, since this is an imbalanced data set, it is better to rely on the F1 score than on accuracy; the leave-one-out cross-validation step already showed that there are a lot of false negatives.