![](http://res.cloudinary.com/grohealth/image/upload/v1581692228/DCUK/Content/iStock-9217203841.jpg)

In [None]:
import pandas as pd
import numpy as np
# Read the diabetes CSV into a DataFrame and preview its first rows.
DATA_PATH = "../input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

`df.info()` reports no null values.
The shape is (768, 9).

In [None]:
#Plot count of outcome variable

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.countplot(x = "Outcome", data =df)

In [None]:
# Bar chart of the value counts of the Pregnancies column

pregnancies_fig, pregnancies_ax = plt.subplots()
sns.countplot(data=df, x="Pregnancies", ax=pregnancies_ax)

In [None]:
# Heatmap of the pairwise correlations between all columns
corr_matrix = df.corr()
# A (12, 6) canvas leaves enough room that the annotated numbers do not overlap.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 6))
# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, ax=heatmap_ax)

#### Age and pregnancies have a fairly strong correlation of about 0.54
#### Outcome and glucose have a moderate correlation of 0.47
#### Insulin and skin thickness have a moderate correlation of 0.44

In [None]:
# Count the rows in each Outcome class to check how balanced the target is.
df.Outcome.value_counts()

The classes are imbalanced: there are only 268 samples for class 1 (diabetes positive), whereas class 0 has 500 samples. The classes need to be balanced, otherwise the predictions will be biased towards class 0.

In [None]:
# Balance the two Outcome classes by oversampling the minority class (1).
#
# NOTE(review): this resampling runs before the train/test split further down,
# so copies of the same minority row can land in both the training and the
# test set, which inflates the reported test accuracy. Ideally split first and
# resample only the training portion.
from sklearn.utils import resample, shuffle

zero = df[df['Outcome'] == 0]  # rows of class 0 (the larger class)
one = df[df['Outcome'] == 1]   # rows of class 1 (the smaller class)

# Draw class-1 rows with replacement until there are as many as class-0 rows.
# The target size is taken from the data instead of a hard-coded 500, and the
# fixed seed makes the resampled set identical on every run.
df_minority_upsampled = resample(one, replace=True, n_samples=len(zero), random_state=42)

# Recombine both classes into a single frame.
df = pd.concat([zero, df_minority_upsampled])

# Shuffle so that the rows are in no particular order (the concat leaves all
# class-0 rows first); seeded for reproducibility.
df = shuffle(df, random_state=42)

In [None]:
# Rank every column by the absolute value of its correlation with Outcome
outcome_corr = df.corr()['Outcome']
outcome_corr.abs().sort_values(ascending=False)

In [None]:
# Separate the target column from the predictor columns
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every predictor column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so statistics of the future test rows influence the transformation.
sc = StandardScaler()
features = X.columns
scaled_values = sc.fit_transform(X[features])
X = pd.DataFrame(scaled_values, columns=features, index=X.index)

In [None]:
#all imports

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [None]:
# Hold out 10% of the rows as a test set.
# - stratify=y keeps the class ratio the same in the training and test parts,
#   which matters for such a small test split.
# - random_state fixes the partition so every score computed from it is
#   reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [None]:
# Base estimators. Every stochastic model gets a fixed seed so that the
# hyper-parameter searches below select the same configuration on every run.
lr = LogisticRegression(random_state=42)

knn = KNeighborsClassifier()

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(random_state=42)

cb = CatBoostClassifier(loss_function='Logloss', verbose=0, random_seed=42)

# KNN: try every neighbourhood size from 2 to 49 with 5-fold cross validation.
para_knn = {'n_neighbors': np.arange(2, 50)}
grid_knn = GridSearchCV(knn, param_grid=para_knn, cv=5, n_jobs=-1)


# Decision tree search space.
# criterion: "gini" for the Gini impurity, "entropy" for the information gain.
# min_samples_leaf: minimum number of samples required at a leaf node; larger
# values have the effect of smoothing the model.
para_dt = {'criterion': ['gini', 'entropy'],
           'max_depth': np.arange(1, 50),
           'min_samples_leaf': [1, 2, 4, 5, 10, 20, 30, 40, 80, 100]}
# 2 * 49 * 10 = 980 candidates, each fitted 5 times, so run the fits in
# parallel (n_jobs=-1) rather than serially.
grid_dt = GridSearchCV(dt, param_grid=para_dt, cv=5, n_jobs=-1)

# Random forest search space.
# n_estimators: the number of trees in the forest.
params_rf = {'n_estimators': [100, 350, 500], 'min_samples_leaf': [2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=params_rf, cv=5, n_jobs=-1)


# CatBoost: randomly sample 5 of the 30 possible combinations, scored by
# ROC AUC with 3-fold cross validation. random_state fixes which combinations
# are sampled.
params_cb = {'learning_rate': [0.03, 0.1], 'depth': [4, 6, 10], 'l2_leaf_reg': [1, 3, 5, 7, 9]}
rs_cb = RandomizedSearchCV(cb, param_distributions=params_cb, n_iter=5,
                           scoring='roc_auc', n_jobs=4, cv=3, random_state=42)

In [None]:
# Run every hyper-parameter search on the training split, then report the
# winning parameter set of each one.
searches = [
    ("KNN", grid_knn),
    ("Decision Tree", grid_dt),
    ("Random Forest", grid_rf),
    ("CatBoost", rs_cb),
]

for search_label, search in searches:
    search.fit(X_train, y_train)

for search_label, search in searches:
    print(f"Best parameters for {search_label}:", search.best_params_)

In [None]:
# Rebuild each model from the hyper-parameters the searches actually selected.
# Hard-coding values copied from an earlier run would silently go stale
# whenever the data, the split or the search results change.
dt = DecisionTreeClassifier(**grid_dt.best_params_, random_state=42)
knn = KNeighborsClassifier(**grid_knn.best_params_)
rf = RandomForestClassifier(**grid_rf.best_params_, random_state=42)
cb = CatBoostClassifier(**rs_cb.best_params_, loss_function='Logloss', verbose=0)

In [None]:
# Fit every model on the training split and report its accuracy on the
# held-out test split.
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn),
               ('Decision Tree', dt), ('Random Forest', rf), ('CatBoost', cb)]

for classifier_name, classifier in classifiers:

    # Fit the classifier to the training set
    classifier.fit(X_train, y_train)

    # Predict the test set and score the predictions
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Report with three decimals: with a single decimal ('{:.1f}') different
    # models collapse to the same rounded value and cannot be ranked.
    print('{:s} : {:.3f}'.format(classifier_name, accuracy))

## CatBoost is the best-performing model

In [None]:
from sklearn.metrics import classification_report

# Classification report for the CatBoost model's predictions on the test split
y_pred_cb = cb.predict(X_test)
cb_report = classification_report(y_true=y_test, y_pred=y_pred_cb)
print(cb_report)

In [None]:
# XGBoost classifier with hand-picked hyper-parameters, fitted on the
# training split
from xgboost import XGBClassifier

xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.06,
    'max_depth': 29,
    'max_leaves': 31,
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'verbosity': 0,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Accuracy of the XGBoost model on the held-out test split
pred_xgb = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_xgb)

CatBoost performed better than XGBoost.

## Upvote if you like it or fork it :)