In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subfolders, files in os.walk(input_root):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Count the missing values in each column (isna is the canonical name of isnull).
data.isna().sum()

In [None]:
# Number of distinct values per column, lowest-cardinality columns first.
data.nunique().sort_values(ascending=True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

# Hold out one third of the rows for testing; the fixed seed keeps the split stable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=1/3,
    random_state=1,
)

In [None]:
# Linear Regression (shown only to illustrate why regression does not suit this task)
from sklearn.linear_model import LinearRegression

ls = LinearRegression()
ls.fit(xtrain, ytrain)
ls_model = ls  # fit() returns the estimator itself, so both names refer to the fitted model
ypredict = ls.predict(xtest)
# ypredict holds continuous float values, whereas the target is discrete (0 or 1),
# so regression algorithms are not advisable for this problem.

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # also used by the classifier cells that follow

lg = LogisticRegression(solver='liblinear')
lg.fit(xtrain, ytrain)
lg_model = lg  # fit() returns the estimator itself
ypredict = lg.predict(xtest)

# Share of test rows whose predicted class matches the true class.
print('Accuracy with Logistic Regression Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# The tree builder breaks ties between equally good splits at random, so a fixed
# seed (the same one used for the train/test split) keeps the reported accuracy
# identical across "Restart & Run All".
dt = DecisionTreeClassifier(random_state=1)
dt_model = dt.fit(xtrain, ytrain)
ypredict = dt_model.predict(xtest)
print('Accuracy with Decision Tree Classifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb_model = nb.fit(xtrain, ytrain)
# Store the predictions under the same name that is scored below; scoring a
# differently named variable would silently re-report the previous model's accuracy.
ypredict = nb_model.predict(xtest)
print('Accuracy with Gaussian Naive Bayes Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and feature sub-sampling are random; seed them so the
# reported accuracy is reproducible.
rf = RandomForestClassifier(random_state=1)
rf_model = rf.fit(xtrain, ytrain)
# Score this model's own predictions (not a stale `ypredict` from an earlier cell).
ypredict = rf_model.predict(xtest)
print('Accuracy with Random Forest Classifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

# Fixed seed so the boosted ensemble and its accuracy are reproducible.
adb = AdaBoostClassifier(random_state=1)
adb_model = adb.fit(xtrain, ytrain)
# Predict with the AdaBoost model itself (not the random forest) and score
# exactly these predictions.
ypredict = adb_model.predict(xtest)
print('Accuracy with AdaBoost Classifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed (matching the train/test split) so repeated runs report the same accuracy.
gb = GradientBoostingClassifier(random_state=1)
gb_model = gb.fit(xtrain, ytrain)
ypredict = gb_model.predict(xtest)
print('Accuracy with GradientBoostingClassifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Extreme Gradient Boosting Classifier (XGBoost)
import xgboost
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Early stopping picks the number of boosting rounds by watching the eval set.
# Using the test set for that leaks it into model selection and inflates the
# reported accuracy, so carve a validation split out of the training data and
# keep the test set for the final score only.
xgb_fit_x, xgb_val_x, xgb_fit_y, xgb_val_y = train_test_split(
    xtrain, ytrain, test_size=0.2, random_state=1, stratify=ytrain)

xgb = XGBClassifier(use_label_encoder=False)
# NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric
# on the constructor rather than on fit() - confirm against the installed version.
xgb_model = xgb.fit(xgb_fit_x, xgb_fit_y,
                    early_stopping_rounds=10,
                    eval_set=[(xgb_val_x, xgb_val_y)],
                    eval_metric='logloss',
                    verbose=True)
ypredict = xgb_model.predict(xtest)
print('Accuracy with Extreme GradientBoostingClassifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Feature importance
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Draw the importance scores of the fitted XGBoost model, then render the figure.
plot_importance(xgb_model)
plt.show()

In [None]:
# XGBoost tree diagram
# A very large canvas keeps the node labels legible once the tree is drawn.
tree_fig = plt.figure(figsize=(100, 100))
xgboost.plot_tree(xgb_model, ax=tree_fig.gca())

In [None]:
# Hyperparameter tuning for XGBoost
# Candidate values sampled by the randomized search.
parms = {
    'learning_rate':[0.05,0.10,0.15,0.20,0.25,0.30,0.50,1.00],
    'max_depth':[3,4,5,6,7,9,10,12,15,20],
    'min_child_weight':[1,3,5,7],
    'gamma':[0.0,0.1,0.2,0.4],
    'colsample_bytree':[0.3,0.4,0.5,0.7]
}

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
classifier = xgb  # used as a template; the search clones it for every candidate
# random_state makes the sampled candidates (and so best_params_) reproducible.
random_search = RandomizedSearchCV(classifier, param_distributions=parms, n_iter=5,
                                   n_jobs=-1, cv=5, verbose=3, scoring='roc_auc',
                                   random_state=1)
# Tune on the training split only: searching over the full data set would let the
# rows later used for testing influence the chosen hyperparameters.
random_search.fit(xtrain, ytrain)
print(random_search.best_params_)

In [None]:
# Display the estimator that the randomized search refit with its best parameter combination.
random_search.best_estimator_

In [None]:
# Tuned XGBoost model built from the randomized search result
# Take the hyperparameters from the search itself instead of pasting the values of
# one past run, so this cell always agrees with the best_params_ printed above.
classifier = xgboost.XGBClassifier(use_label_encoder=False, random_state=0,
                                   **random_search.best_params_)
# Refit on the training split only and score on the untouched test split.
xgb_tunned_model = classifier.fit(xtrain, ytrain)
ypredict = xgb_tunned_model.predict(xtest)
print('Accuracy with Tunned Extreme GradientBoostingClassifier Algorithm', accuracy_score(ytest, ypredict))

In [None]:
# Write the latest predictions to submission.csv, one row per test sample,
# keyed by the sample's original row index.
submission_columns = {'Id': xtest.index, 'Diabetic?': ypredict}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)