In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:

# Read the diabetes CSV into `train` and let pandas display every column at once.
train = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
pd.set_option("display.max_columns", len(train.columns))
train.head()

In [None]:
# Mean of each listed measurement, one row per Outcome class.
# The aggregation is named by string: handing pandas the `np.mean` callable is a
# deprecated path (recent versions warn about it), while "mean" works everywhere.
df4 = train.pivot_table(
    index="Outcome",
    values=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
    ],
    aggfunc="mean",
)
df4

In [None]:
# (rows, columns) of the loaded table.
train.shape

In [None]:
# Summary statistics for the numeric columns.
train.describe()

In [None]:
# Number of NaN entries in each column.
train.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Class balance of the target column, printed and drawn as a pie chart.
outcome_counts = train['Outcome'].value_counts()
print(outcome_counts)
print('\n')

# Build the wedge labels from the counted values themselves: value_counts() orders by
# frequency, so a fixed label list would be wrong whenever class 1 outnumbers class 0.
# NOTE(review): assumes Outcome 0 = not diabetic and 1 = diabetic - confirm against the dataset docs.
outcome_names = {0: 'NOT DIABETIC', 1: 'DIABETIC'}
wedge_labels = [outcome_names.get(value, str(value)) for value in outcome_counts.index]
# Only the first (most frequent) wedge is pulled out of the pie.
explode = [0.05 if position == 0 else 0 for position in range(len(outcome_counts))]

plt.figure(figsize=(8,6))
plt.pie(outcome_counts, labels=wedge_labels, autopct='%0.1f%%', explode=explode)
plt.show()


In [None]:
# Horizontal bar chart of how often each Pregnancies value occurs, most frequent first.
plt.figure(figsize=(10, 8))
sns.countplot(
    data=train,
    y='Pregnancies',
    orient='h',
    order=train['Pregnancies'].value_counts().index,
    color='skyblue',
    edgecolor='black',
    linewidth=2.5,
)
axis_label_style = dict(fontweight="bold", fontsize=20)
plt.ylabel("Pregnancies", **axis_label_style)
plt.xlabel("Counts", **axis_label_style)
plt.title("Countplot", fontweight="bold", fontsize=15)

In [None]:
train.corr()['Outcome']

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(train.corr(),annot=True,cmap='BrBG')

# Glucose

In [None]:
sns.distplot(train['Glucose'])

In [None]:
#train = train[train["Glucose"]>50]
train.shape

In [None]:
train['Glucose'] = train['Glucose'].replace(0,train['Glucose'].median())

In [None]:
sns.distplot(train['Glucose'])

### A log transformation may work here

In [None]:
sns.distplot(np.log(train['Glucose']))

In [None]:
train["Glucose"]=np.log(train.Glucose)

# BloodPressure

In [None]:
train.BloodPressure.nunique()

In [None]:
sns.distplot(train['BloodPressure'])

In [None]:
sns.distplot(train[train.BloodPressure!=0]['BloodPressure'])

In [None]:
train['BloodPressure'].median()

In [None]:
train['BloodPressure'] = train['BloodPressure'].replace(0,train['BloodPressure'].median())

In [None]:
sns.distplot(train['BloodPressure'])

# SkinThickness

In [None]:
sns.distplot(train[train.SkinThickness!=0]['SkinThickness'])

In [None]:
train['SkinThickness'].value_counts()

In [None]:
sns.distplot(train['SkinThickness'])

In [None]:
train['SkinThickness'] = train['SkinThickness'].replace([0,99,7,8],32)
#sns.distplot(train['SkinThickness'])

In [None]:
sns.distplot(train['SkinThickness'])

In [None]:
#train=train.drop('SkinThickness',axis=1)

In [None]:
train.corr()['Outcome']

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(train.corr(),annot=True)

# Insulin

In [None]:
sns.distplot((train.Insulin))

In [None]:
#train['Insulin']=train['Insulin'].replace(0,np.nan)
train["Insulin"].quantile(0.99)

In [None]:
train[train.Insulin!=0].shape

In [None]:
train[train.Insulin!=0]['Insulin'].mean()

In [None]:
# Impute the 0 readings with the mean of the non-zero readings. The mean is computed
# here instead of being pasted in as the literal 152.85, which silently goes stale
# whenever the data or any upstream filtering changes.
insulin_nonzero_mean = train.loc[train['Insulin'] != 0, 'Insulin'].mean()
train['Insulin'] = train['Insulin'].replace(0, insulin_nonzero_mean)

In [None]:
#train = train[train.Insulin < train['Insulin'].quantile(0.99)]

In [None]:
sns.distplot((train["Insulin"]))

In [None]:
#sns.distplot(np.log(train["Insulin"]))

In [None]:
train['Insulin'].isnull().sum()

In [None]:
train.corr()['Insulin']

In [None]:
#j

In [None]:
train.groupby(by ='Glucose')['Insulin'].median()

In [None]:
#gg = train.groupby(by ='Glucose')['Insulin'].median()

In [None]:
# Disabled helper kept inside a bare string literal, so `fill` is never actually defined.
# NOTE(review): as written, the null branch returns the comparison `Insulin == gg[Glucose]`
# (a boolean) instead of the looked-up value, and `gg` only exists in commented-out code.
# Fix both before re-enabling.
'''
def fill(Glucose,Insulin):
    if pd.isnull(Insulin):
        return Insulin == gg[Glucose]
    else:
        return Insulin
'''
    



In [None]:
#train['Insulin'] =train.apply(lambda x: fill(x['Glucose'], x['Insulin']),axis=1)

In [None]:
#train['Insulin'].isnull().sum()

In [None]:
#train = train.drop('Insulin',axis=1)

In [None]:
train.Insulin = train['Insulin'].replace(0,)

In [None]:
train.head()

# BMI

In [None]:
train.BMI.value_counts()

In [None]:
sns.distplot((train.BMI))

In [None]:
train['BMI'] = train['BMI'].replace(0,train.BMI.mean())

In [None]:
sns.distplot((train.BMI))

In [None]:
sns.distplot(np.log(train.BMI))

In [None]:
#train.BMI = np.log(train.BMI)

# DiabetesPedigreeFunction

In [None]:
train.DiabetesPedigreeFunction.value_counts()    

In [None]:
sns.distplot((train.DiabetesPedigreeFunction))

In [None]:
train[train.DiabetesPedigreeFunction>1].shape

In [None]:
#train = train[train.DiabetesPedigreeFunction<1]

In [None]:
sns.distplot((train.DiabetesPedigreeFunction))

In [None]:
sns.distplot(np.log(train.DiabetesPedigreeFunction))

In [None]:
train['DiabetesPedigreeFunction'] = np.log(train.DiabetesPedigreeFunction)

# Age

In [None]:
sns.distplot((train.Age))

In [None]:
train.hist(figsize=(20,20))
plt.show()

In [None]:
train.head()

In [None]:
train.shape

In [None]:
# Target vector and feature matrix (every column except the target).
y = train['Outcome']
x = train.drop(columns='Outcome')

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
#sns.pairplot(train ,hue='Outcome')

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded so the fitted tree, and therefore the reported accuracy, is the same on every
# run; without `random_state` it can differ between executions.
dtree = DecisionTreeClassifier(random_state=0).fit(x_train,y_train)
predictions = dtree.predict(x_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the decision tree on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 2) * 100` re-introduces float noise
# (e.g. 0.57 * 100 -> 56.99999999999999) and keeps less precision than the KNN score,
# which skews the final ranking table.
acc_dtr = round(accuracy_score(y_test,predictions) * 100, 2)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, seeded: the forest draws random samples and features, so without
# `random_state` every run reports a different accuracy.
rfc = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train,y_train)
predictions = rfc.predict(x_test)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the random forest on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 2) * 100` re-introduces float noise
# (e.g. 0.57 * 100 -> 56.99999999999999) and keeps less precision than the KNN score,
# which skews the final ranking table.
acc_ran = round(accuracy_score(y_test,predictions) * 100, 2)

# SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted on the training fold.
svc = SVC()
svc.fit(x_train, y_train)
predictions = svc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the SVM on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 2) * 100` re-introduces float noise
# (e.g. 0.57 * 100 -> 56.99999999999999) and keeps less precision than the KNN score,
# which skews the final ranking table.
acc_svm = round(accuracy_score(y_test,predictions) * 100, 2)

# XGBoost

In [None]:
import xgboost

# Gradient-boosted classifier with default settings, fitted on the training fold.
classifier = xgboost.XGBClassifier()
classifier = classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the XGBoost model on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 2) * 100` re-introduces float noise
# (e.g. 0.57 * 100 -> 56.99999999999999) and keeps less precision than the KNN score,
# which skews the final ranking table.
acc_xgb = round(accuracy_score(y_test,predictions) * 100, 2)

In [None]:
'''from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

n_estimators = [50,100,250,500,750,1000,1500]
max_depth = [2,3,5,10,15]
booster = ['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]
min_child_weight =[0.5,1,2,3,4,5]
learning_rate = [0.05,0.1,0.15,0.20]

hyperparameter_grid={
    'n_estimators':n_estimators,
    'max_depth':max_depth,
    'booster':booster,
    'min_child_weight':min_child_weight,
    'learning_rate':learning_rate    
}'''

In [None]:
'''xgrf = RandomizedSearchCV(estimator=classifier,param_distributions=hyperparameter_grid,n_iter=50,cv=3,
                          random_state=42,verbose=2)
                          
xgrf.fit(x_train,y_train)'''

In [None]:
#xgrf = GridSearchCV(estimator=classifier,param_grid=hyperparameter_grid,cv=3,verbose=2)
                       
                          
#xgrf.fit(x_train,y_train)

In [None]:
'''xgrf2 = xgrf.best_estimator_
predictions = xgrf2.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))'''

In [None]:
'''
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=2,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
              '''

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default settings.
log = LogisticRegression().fit(x_train, y_train)
predictions = log.predict(x_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the baseline logistic regression on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 2) * 100` re-introduces float noise
# (e.g. 0.57 * 100 -> 56.99999999999999) and keeps less precision than the KNN score,
# which skews the final ranking table.
acc_log = round(accuracy_score(y_test,predictions) * 100, 2)

In [None]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_search.fit(x_train, y_train)



In [None]:
# Display the estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Keep a handle on the selected estimator.
best_grid = grid_search.best_estimator_

In [None]:
# Predict the held-out rows with the selected estimator.
predictions = best_grid.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class report for the tuned logistic regression.
tuned_confusion = confusion_matrix(y_test, predictions)
tuned_report = classification_report(y_test, predictions)
print(tuned_confusion)
print("\n")
print(tuned_report)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 29.
knn = KNeighborsClassifier(n_neighbors=29)
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Confusion matrix and per-class report for the k = 29 model.
knn_confusion = confusion_matrix(y_test, predictions)
knn_report = classification_report(y_test, predictions)
print(knn_confusion)
print("\n ")
print(knn_report)


In [None]:
model_KNN = KNeighborsClassifier()

n_neighbors = range(1,30)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)


In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model_KNN, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_search.fit(x_train, y_train)



In [None]:
# Take the estimator selected by the KNN grid search and predict the held-out rows with it.
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(x_test)

In [None]:
# `accuracy_score` is imported here as well, so this cell no longer depends on an
# earlier cell having brought it into scope.
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Evaluate the tuned KNN model on the held-out rows.
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))
# Scale to percent BEFORE rounding: `round(acc, 4) * 100` re-introduces float noise
# in the stored value. The precision (two decimals of a percent) is unchanged.
acc_knn = round(accuracy_score(y_test,predictions) * 100, 2)

# Results

In [None]:
# One row per model: display name paired with its accuracy score (in percent).
model_scores = [
    ('Decision tree', acc_dtr),
    ('Random Forest', acc_ran),
    ('SVM', acc_svm),
    ('Xgboost', acc_xgb),
    ('Logistic Regression', acc_log),
    ('KNN', acc_knn),
]
matrix = pd.DataFrame(model_scores, columns=['MODELS', 'ACCURACY'])

In [None]:
matrix_new=matrix.sort_values(by='ACCURACY',ascending=False)
matrix_new

In [None]:
plt.figure(figsize=(14,8))
sns.barplot(y=matrix_new['ACCURACY'],x= matrix_new['MODELS'])

In [None]:
plt.figure(figsize=(10,6))
sns.pointplot(y=matrix_new['ACCURACY'],x= matrix_new['MODELS'],linestyles='--',markers='o', markerfacecolor='black')
plt.show()


In [None]:
train.head()

In [None]:
sns.scatterplot(train['SkinThickness'],train["BMI"],hue=train['Outcome'])
