In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle

In [None]:
# Read the Pima Indians diabetes CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv'
)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Print column dtypes and non-null counts for the raw data.
df.info()

### Handling Missing Values

In [None]:
# Zeros in these measurement columns are treated as missing values:
# convert them to NaN so they can be counted and imputed below.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the median of its column.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# fit_transform learns the per-column medians and applies them in one call.
# By default the result is a NumPy array, so the column labels are not kept.
x = imputer.fit_transform(df)

In [None]:
# Wrap the imputed array in a DataFrame, reusing the original column labels.
df_new = pd.DataFrame(data=x, columns=df.columns)

In [None]:
# Preview the first five rows of the imputed data.
df_new.head(n=5)

In [None]:
# Print column dtypes and non-null counts for the imputed data.
df_new.info()

### Looking for Correlations

In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method).
corr_matrix = df_new.corr(method='pearson')

In [None]:
# Display the full correlation matrix.
corr_matrix

In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix.loc[:, 'Outcome'].sort_values(ascending=False)

In [None]:
# Annotated heatmap of the correlation matrix.
sns.heatmap(data=corr_matrix, annot=True)

### Visualization


In [None]:
# One 50-bin histogram per column.
df_new.hist(figsize=(20, 15), bins=50)

In [None]:
# Box plot of every column on a shared axis.
df_new.plot.box(figsize=(20, 15))

In [None]:
# Scatter plot of number of pregnancies against age.
df_new.plot.scatter(x='Age', y='Pregnancies', figsize=(20, 15))

### Splitting the Data

In [None]:
# Features are every column except the last; the last column is the target.
X = df_new.iloc[:, :-1].to_numpy()
Y = df_new.iloc[:, -1].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Import all the algorithms we want to test
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Candidate classifiers to compare by cross-validation, as (label, estimator) pairs.
# Every estimator that draws random numbers gets a fixed seed so the comparison
# gives the same numbers after a kernel restart.
MODEL_SEED = 42

models = [
    # Raise the iteration cap above the default of 100; on unscaled features the
    # default solver commonly stops early with a ConvergenceWarning.
    ('LR', LogisticRegression(max_iter=1000)),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVC', SVC()),
    ('RFC', RandomForestClassifier(random_state=MODEL_SEED)),
    ('DTR', DecisionTreeClassifier(random_state=MODEL_SEED)),
    ('XG', XGBClassifier(random_state=MODEL_SEED)),
    ('LGB', LGBMClassifier(random_state=MODEL_SEED)),
    # verbose=0 silences CatBoost's per-iteration training log, which would
    # otherwise be printed for every cross-validation fold.
    ('CAT', CatBoostClassifier(random_state=MODEL_SEED, verbose=0)),
]

In [None]:
from sklearn.model_selection import KFold,cross_val_score

# One 10-fold splitter shared by all models, so every model is scored on the
# same folds. shuffle=True is required for random_state to have any effect;
# scikit-learn >= 0.24 raises ValueError when random_state is set without it.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

names = []    # model labels, in evaluation order
results = []  # per-model arrays of fold accuracies, aligned with `names`

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Printed as: label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (
        name, cv_results.mean(), cv_results.std()
    )
    print(msg)

In [None]:
# Box plot of the per-fold accuracies, one box per model.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('Algorithm Comparison')
plt.show()

In [None]:
# Final model: XGBoost classifier with a small learning rate and many trees,
# trained on the training split only.
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of columns sampled per tree
    objective='binary:logistic',
    n_jobs=4,               # scikit-learn-style name for the deprecated `nthread`
    scale_pos_weight=1,
    random_state=27,        # scikit-learn-style name for the deprecated `seed`
)
model.fit(X_train, Y_train)

In [None]:
# Persist the trained model to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Predict class labels for the held-out test features.
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix on the test split: rows are true labels, columns are predictions.
cm = confusion_matrix(y_true=Y_test, y_pred=pred)
print(cm)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=pred)