In [None]:
#Importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.svm import *

In [None]:
# Load the dataset from CSV and preview the first few rows
df = pd.read_csv('../input/diabetescsv/diabetes.csv')
df.head()

In [None]:
# Check the number of rows and columns in the dataset, shown as a (rows, columns) tuple
df.shape

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

# Basic statistics for the numerical columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns.
# The `count` row is the number of non-null values per column.
df.describe()

The above dataframe summarizes the key statistics of the dataset. I did some background research to gain domain knowledge, mainly to identify outliers or inconsistent values. According to that research, the SkinThickness column has outliers: its maximum value is 99, while average triceps skinfold values among Indian women range from 18 to 22.

Below are the counts of 0 values for Glucose, BP and BMI. As seen, Glucose and BMI have 5 and 11 zero values respectively; apart from these values, their distributions in the graphs are roughly Gaussian. We can later replace these zeros so that both distributions look normal. Blood Pressure has 35 zero values; these are also replaced in the preprocessing step further below.


In [None]:
# Number of rows for each distinct Glucose value; a row labelled 0 counts the zero readings
df.groupby('Glucose').size()

In [None]:
# Number of rows for each distinct BloodPressure value; a row labelled 0 counts the zero readings
df.groupby('BloodPressure').size()

In [None]:
# Number of rows for each distinct BMI value; a row labelled 0 counts the zero readings
df.groupby('BMI').size()

In [None]:
# Draw one histogram + KDE panel per feature on a 2 x 4 grid.
sns.set_style('darkgrid')
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']
n_rows = 2
n_cols = 4

fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.5, n_rows*3.5))

# axs.flat walks the grid row by row, which matches the order of `cols`.
for ax, col in zip(axs.flat, cols):
    # distplot is deprecated in seaborn; histplot with a density-scaled
    # histogram and a KDE overlay is its documented replacement.
    sns.histplot(df[col], kde=True, stat='density', ax=ax)

# Adjust spacing once, after every panel has been drawn.
fig.tight_layout()


Checking the distributions of all column data to identify the distribution type and outliers if any.

The graphs for Glucose, Blood Pressure and BMI show 0 values. Most of the distributions do not follow a Gaussian distribution and show signs of positive skewness and kurtosis.

Now I will further check for outliers and quartile distributions using box plots. I already have an idea of the outliers, as mentioned above.


In [None]:
# Draw one horizontal box plot per feature on a 2 x 4 grid.
sns.set_style('darkgrid')
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']
n_rows = 2
n_cols = 4

fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.5, n_rows*3.5))

# axs.flat walks the grid row by row, which matches the order of `cols`.
for ax, col in zip(axs.flat, cols):
    # Pass the series by keyword: the meaning of the first positional argument
    # of boxplot differs between seaborn versions, while x= always gives a
    # horizontal box for the column.
    sns.boxplot(x=df[col], ax=ax)

# Adjust spacing once, after every panel has been drawn.
fig.tight_layout()

The box plots for Glucose and BMI clearly show that the majority of the data lies between 100-140 for Glucose and 25-35 for BMI. BMI also has a large number of outliers above the upper quartile. I will treat the 0 values in the Glucose and BMI columns since there are only a few.

Before I do that, let's also check the outcome across features using a pairplot.


In [None]:
# Pairwise scatter plots of all features, coloured by Outcome, with KDE curves on the diagonal.
# pairplot creates and sizes its own figure (via `height`), so no separate
# plt.figure() call is made; one would only add an empty extra figure.
sns.pairplot(df, height = 3, hue = 'Outcome', diag_kind = 'kde')
plt.show()

The above pairplots show how the outcome varies with the different features. To get the exact correlation with or dependency on a particular feature, I will check the correlation matrix below. It shows Glucose, BMI and Pregnancies to be the features most correlated with the outcome. I will also visualize the same using a heatmap.

In [None]:

# Pairwise correlation matrix of the columns (pandas' default method is Pearson)
df_correlation = df.corr()
df_correlation

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(
    df_correlation,
    annot=True,
    annot_kws={'size': 12},
    ax=ax,
)

The correlation heatmap shows the four features most correlated with the outcome: Age, BMI, Glucose and Pregnancies. However, Pregnancies and Age are highly correlated with each other, which will result in collinearity. Since no particular feature shows a high correlation with the outcome, I will not perform feature selection and will continue with the existing features to run the prediction models.

Before we do that, some important data processing is needed. I am replacing the 0 values in Glucose, BMI, BloodPressure, Insulin and SkinThickness with mean values. The mean values in the code below are set separately for people with diabetes and people without, since these features are important contributors to diabetes.

In [None]:
# Replacement values for zero readings, per column:
# (value used when Outcome == 0, value used when Outcome == 1).
# NOTE(review): the fill value depends on the target and is applied to the whole
# frame before the train/test split, so label information reaches the features —
# confirm this is intended.
zero_fill = {
    'Glucose': (110, 141),
    'BloodPressure': (68, 71),
    'SkinThickness': (20, 22),
    'Insulin': (69, 100),
    'BMI': (30, 37),
}

for column, fill_by_outcome in zero_fill.items():
    # Locate the zero readings once, before either outcome group is overwritten.
    zero_rows = df[column] == 0
    for outcome, fill_value in enumerate(fill_by_outcome):
        df.loc[zero_rows & (df['Outcome'] == outcome), column] = fill_value
df.head()

Now, separating outcome which is the dependent variable and rest of the features which are independent

In [None]:
# Predictor columns; 'Outcome' is the target.
feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
X = df[feature_columns]
y = df['Outcome']
for caption, part in (("Shape of X :", X), ("Shape of y :", y)):
    print(caption, part.shape)

Splitting the dataset into train and test sets

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

Standardizing the data using scikit-learn's StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data only ...
X_train = sc.fit_transform(X_train)
# ... and reuse those same statistics for the test data. Refitting on X_test
# would scale the two sets differently and leak test-set statistics.
X_test = sc.transform(X_test)

In the cell below, I define a function that trains each candidate model and prints its training accuracy.


In [None]:
def models(X_train, y_train):
    """Fit nine classifiers on the training data and print each one's training accuracy.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to every estimator's ``fit`` and ``score``.
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    tuple
        The fitted estimators, ordered as: logistic regression, k-nearest
        neighbours, linear SVC, RBF SVC, Gaussian naive Bayes, decision tree,
        random forest, ridge classifier, XGBoost. The bracketed number in each
        printed line is the estimator's position in this tuple.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression, RidgeClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    import xgboost as xgb

    logistic = LogisticRegression(random_state=0)
    ridge = RidgeClassifier()
    neighbours = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    linear_svc = SVC(kernel='linear', random_state=0)
    rbf_svc = SVC(kernel='rbf', random_state=0)
    naive_bayes = GaussianNB()
    decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    random_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    xgboost_clf = xgb.XGBClassifier(random_state=0)

    # Train every estimator before any score is printed.
    fit_sequence = (
        logistic, ridge, neighbours, linear_svc, rbf_svc,
        naive_bayes, decision_tree, random_forest, xgboost_clf,
    )
    for estimator in fit_sequence:
        estimator.fit(X_train, y_train)

    # Report lines, listed in the same order as the returned tuple.
    report = (
        ('[0]Logistic Regression Training Accuracy', logistic),
        ('[1]K Nearest Neighbors Regression Training Accuracy', neighbours),
        ('[2]SVC Linear Regression Training Accuracy', linear_svc),
        ('[3]SVC RBF Regression Training Accuracy', rbf_svc),
        ('[4]Gaussian Regression Training Accuracy', naive_bayes),
        ('[5]Decision Tree Regression Training Accuracy', decision_tree),
        ('[6]Random Forest Regression Training Accuracy', random_forest),
        ('[7]Ridge Classifier Training Accuracy', ridge),
        ('[8]XGB Classifier Training Accuracy', xgboost_clf),
    )
    for caption, estimator in report:
        print(caption, estimator.score(X_train, y_train))

    return tuple(estimator for _, estimator in report)

In [None]:
# Train all candidate models; the result is a tuple of fitted estimators.
# NOTE(review): this rebinds the name `models` from the function to the tuple it
# returns, so re-running this cell without first re-running the definition cell
# raises TypeError ('tuple' object is not callable).
models = models(X_train, y_train)

Now I am going to use the confusion matrix to check the testing accuracy of all the above models.

In [None]:
from sklearn.metrics import confusion_matrix

# Print the confusion matrix and the test accuracy of every fitted model.
for i, model in enumerate(models):
    # Predict once and reuse the matrix for both the printout and the accuracy.
    cm = confusion_matrix(y_test, model.predict(X_test))

    # For binary labels, ravel() yields the cells in the order TN, FP, FN, TP.
    # Unpacking them in any other order swaps the counts and gives a wrong accuracy.
    TN, FP, FN, TP = cm.ravel()
    test_score = (TP + TN)/(TP + TN + FN + FP)
    print(cm)
    print('Model[{}] Testing Accuracy = "{}"'.format(i, test_score))
    print()

As clearly seen above, no particular model is performing well on test data. All are weak learners. I will further use voting classifier to combine weak learners to give better results

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier

# Three base learners combined by majority ("hard") vote.
log = LogisticRegression(random_state=0)
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
rc = RidgeClassifier()

base_estimators = [('log', log), ('tree', tree), ('rc', rc)]
vt = VotingClassifier(
    estimators=base_estimators,
    voting='hard',
    flatten_transform=True,
)
vt.fit(X_train, y_train)
# Accuracy of the ensemble on the held-out test set
vt.score(X_test, y_test)

The voting classifier did improve the test accuracy significantly. However, we can improve this further. I am using a Gradient Boosting Classifier to check whether the results can be improved further.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# fit() returns the estimator itself, so construction and training can be chained.
gb_clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
# Accuracy on the held-out test set
gb_score = gb_clf.score(X_test, y_test)
gb_score

The Gradient Boosting Classifier further improves the test data accuracy. For additional improvements, I am also checking the XGB Classifier, which again enhances our test results.

In [None]:
# Fit an XGBoost classifier with a fixed seed and otherwise default hyperparameters
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=0)
xgb_model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the XGBoost model
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test,y_pred)

Training a CatBoost model and printing its test accuracy and performance report

In [None]:
!pip install catboost
from catboost import CatBoostClassifier, Pool
cat_model=CatBoostClassifier()
cat_model.fit(X_train, y_train)
y_pred=model.predict(X_test)
score=accuracy_score(y_pred,y_test)
print("Test score is ",score)

In [None]:
# Class predictions of the CatBoost model on the test features, used by the classification report below
pred = cat_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the predictions stored in `pred`
print(classification_report(y_test,pred))

In [None]:
from sklearn.metrics import roc_curve,auc
# A ROC curve is traced by sweeping a threshold over continuous scores, so use
# the predicted probability of the positive class (second column) rather than
# hard 0/1 labels, which would give a single operating point.
positive_proba = cat_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)
auc_vt = auc(fpr, tpr)
# Diagonal reference line
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='CatBoost Classifier (auc = %0.3f)'% auc_vt )
# fpr is plotted on the x axis and tpr on the y axis.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('CatBoost ROC curve')
plt.legend()
plt.show()

From all the analysis, processing and model selection, it is clear that CatBoost model performs the best on the given dataset
