In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path found below the Kaggle input directory, then echo them one per line
input_file_paths = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import warnings

In [None]:
# Load the diabetes CSV from the Kaggle input mount into a DataFrame
data = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Preview the first rows (head() defaults to 5) to sanity-check the load
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Column labels available in the dataset
data.columns

In [None]:
# Per-column dtype and non-null count summary
data.info()

In [None]:
# Check whether any null value is present anywhere in the DataFrame (yields a single True/False)
data.isnull().values.any()

In [None]:
# Correlation heatmap

import seaborn as sns

# Pairwise correlations between the columns of the dataset
corr_features = data.corr()

# Labels of the columns that take part in the correlation matrix
top_corr_features = corr_features.index
plt.figure(figsize=(12,8))

# Plot heatmap.
# `corr_features` is already the correlation matrix of exactly the columns listed in
# `top_corr_features`, so it is drawn directly instead of being recomputed from `data`.
g = sns.heatmap(corr_features, annot=True, cmap="Accent")

In [None]:
# Tabular view of the pairwise column correlations
data.corr()

In [None]:
# Show the first five rows again before looking at the class balance
data.head(5)

In [None]:
# Class balance of the target: number of rows per Outcome value
outcome_flags = data['Outcome']
has_diabetes = outcome_flags == True
no_diabetes = outcome_flags == False

diabetes_true_count = len(data.loc[has_diabetes])
diabetes_false_count = len(data.loc[no_diabetes])

In [None]:
# Display the class balance as (positive rows, negative rows)
(diabetes_true_count, diabetes_false_count)

In [None]:
# Split the data into train and test 

from sklearn.model_selection import train_test_split

# Columns fed to the models as inputs, in the order they appear in the feature matrix
feature_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Target column, kept as a one-element list
predicted_cols = [
    'Outcome',
]

In [None]:
# Feature matrix: one row per record, one column per entry of feature_cols
feature_frame = data[feature_cols]
X = feature_frame.values

# Target values; selecting with a list of names keeps this two-dimensional (one column)
target_frame = data[predicted_cols]
y = target_frame.values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Check how many values are missing (recorded as zero)

In [None]:
# Report, for every feature column, how many rows hold a zero. This notebook treats a
# zero as a candidate "missing" marker.
# NOTE(review): a zero in Pregnancies is a plausible real value rather than a gap, so
# read that particular count accordingly.
print("Total number of rows : {0}".format(len(data)))
for zero_check_col in feature_cols:
    zero_row_count = len(data.loc[data[zero_check_col] == 0])
    # One line per column; a single print call, so no stray "None" is emitted
    print("Number of rows missing in {0} : {1}".format(zero_check_col, zero_row_count))

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Zeros are replaced by the column mean, except in columns where zero is a genuine
# value. Pregnancies == 0 is real data, and a blanket missing_values=0 over every
# column would silently overwrite it with the mean.
zero_is_valid_cols = {'Pregnancies'}
impute_idx = [pos for pos, col in enumerate(feature_cols) if col not in zero_is_valid_cols]

fill_values = SimpleImputer(missing_values=0, strategy='mean')

# Work on float copies so the column means can be written back without truncation
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# Learn the column means on the training split only, then reuse them on the test split
X_train[:, impute_idx] = fill_values.fit_transform(X_train[:, impute_idx])
X_test[:, impute_idx] = fill_values.transform(X_test[:, impute_idx])

In [None]:
## Apply Algorithm: baseline random forest

from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the fitted forest is reproducible across runs
rf_model = RandomForestClassifier(random_state=1)

# ravel() flattens the single-column label array into the 1-D vector passed to fit
rf_model.fit(X_train, y_train.ravel())

In [None]:
from sklearn.metrics import accuracy_score, r2_score

# Score the random forest on the held-out rows
preds = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, preds)

print("Accuracy = {0:.3f}".format(rf_accuracy))

In [None]:
# NOTE(review): r2_score is documented as a regression metric; here it is applied to the
# classifier's predicted labels, so read it with care (accuracy is printed in the previous cell).
print("R2 Score = {0:.3f}".format(r2_score(y_test, preds)))

In [None]:
# Hyper-parameter optimization: candidate values sampled by the randomized search

params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Hyperparameter optimization using RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
# XGBoost classifier with library defaults; it is the base estimator handed to the randomized search
classifier = xgboost.XGBClassifier()

In [None]:
# Randomized search over `params`: 5 sampled candidates, 5-fold CV, scored by ROC AUC.
# random_state pins which candidates get sampled, so a Restart & Run All repeats the
# same search (the seed value matches the one used for the split and the forest).
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=True,
    random_state=1,
)

In [None]:
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a start timestamp (``None`` or any falsy value) it returns
    ``datetime.now()``. Called with a start timestamp it prints the elapsed
    time split into hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, leftover_seconds = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(leftover_seconds, 60)
        # %i formats each float component as a whole number
        print("\n Time Taken: %i hours %i minutes and %i seconds." %(thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()

In [None]:
from datetime import datetime

# Run the randomized search and report how long it took:
# timer(None) returns the start timestamp, timer(start_time) prints the elapsed time.

start_time = timer(None)
random_search.fit(X_train, y_train.ravel())
timer(start_time)

In [None]:
# Estimator with the best cross-validated score found by the search
random_search.best_estimator_

In [None]:
# Final XGBoost model with explicitly pinned hyper-parameters.
# NOTE(review): max_depth=1 is not among the max_depth candidates of the search grid and
# base_score is not searched at all, so these values are not simply a copy of
# random_search.best_estimator_ — confirm they are intentional.
# The deprecated aliases that were only being passed as None (silent, nthread, seed) and
# missing=None are omitted: verbosity, n_jobs and random_state already carry those
# settings, and leaving `missing` unset keeps the library default.
classifier = xgboost.XGBClassifier(base_score=0.1, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=1,
              min_child_weight=3, n_estimators=100, n_jobs=1,
              objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, verbosity=1)

In [None]:
# Fit on a 1-D label vector: y_train is a single-column 2-D array because the target was
# selected with a list of column names, and the other fit calls in this notebook
# flatten it the same way.
classifier.fit(X_train, y_train.ravel())

In [None]:
# Predicted class labels of the XGBoost model for the held-out test rows
y_preds = classifier.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the XGBoost classifier on the training split
score = cross_val_score(classifier, X_train, y_train.ravel(), cv=10)

# Report the folds here: `score` is reassigned to the test accuracy a few cells later,
# so without this line the cross-validation result would never be shown.
print("Cross-validation score = {0:.3f} (+/- {1:.3f})".format(score.mean(), score.std()))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of the XGBoost predictions on the held-out test split
cm = confusion_matrix(y_test, y_preds)

# Test-set accuracy; note this reassigns `score`, which previously held the cross-validation scores
score = accuracy_score(y_test, y_preds)

In [None]:
# Show the confusion matrix of the XGBoost test predictions
print(cm)


In [None]:
# Show the test-set accuracy of the XGBoost model
print(score)