In [None]:
import numpy as np 
from pandas import *
from matplotlib.pyplot import *
import seaborn as sns
%matplotlib inline 

In [None]:
# Location of the diabetes CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame as the cell output for a first visual check.
df

In [None]:
# Count the missing (NaN/None) entries in every column.
# NOTE(review): placeholder values such as 0 are not counted as missing here —
# confirm the dataset does not encode missing measurements that way.
df.isna().sum()

In [None]:
# Pairwise correlation between the columns of df, displayed as a table.
df.corr()

# **From the above correlation table, we can infer that there are no strong correlations between the independent variables. So, we do not need to drop any feature to eliminate multicollinearity.**

In [None]:
# Number of rows per value of the 'Outcome' column (class balance check).
df['Outcome'].value_counts()

In [None]:
# Features: every column except the last one, as a NumPy array.
x = df.iloc[:, :-1].to_numpy()
# Target: the last column, as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Hyper-parameter grids, one per estimator, in the format GridSearchCV expects
# (a list of dicts; each dict is searched exhaustively).
logreg_grid = [{'C': [0.25, 0.5, 0.75, 1], 'random_state': [0]}]

knn_grid = [{'n_neighbors': [5, 10, 6, 7]}]

svc_grid = [
    # Linear kernel: only the regularisation strength is tuned.
    {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear'], 'random_state': [0]},
    # RBF kernel: regularisation strength and kernel width are tuned together.
    {
        'C': [0.25, 0.5, 0.75, 1],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'random_state': [0],
    },
]

tree_grid = [{'criterion': ['gini', 'entropy'], 'random_state': [0]}]

forest_grid = [
    {
        'n_estimators': [10, 100, 50, 150, 200],
        'criterion': ['gini', 'entropy'],
        'random_state': [0],
    }
]

# (estimator, parameter grid) pairs consumed by the grid-search loop.
Raw_models = [
    (LogisticRegression(), logreg_grid),
    (KNeighborsClassifier(), knn_grid),
    (SVC(), svc_grid),
    (DecisionTreeClassifier(), tree_grid),
    (RandomForestClassifier(), forest_grid),
]

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Run a 10-fold, accuracy-scored grid search for every (estimator, grid) pair
# and report the best cross-validated score together with its parameters.
for estimator, param_grid in Raw_models:
    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='accuracy',
        cv=10,
    )
    grid.fit(x_train, y_train)
    print('{} Best Accuracy : {:.2f}%'.format(estimator, grid.best_score_*100))
    print('Best Parameters : ', grid.best_params_)

In [None]:
# Gaussian Naive Bayes classifier trained on the scaled training split.
reg = GaussianNB().fit(x_train, y_train)
reg

In [None]:
# Predict labels for the test split with the fitted Naive Bayes model.
y_NB = reg.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix and accuracy of the Naive Bayes predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_NB)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_NB)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the Naive Bayes model on the training split.
accuracies = cross_val_score(
    reg,
    x_train,
    y_train,
    scoring='accuracy',
    cv=10,
)
print('Accuracy of NB : {:.2f}%'.format(accuracies.mean()*100))

In [None]:
# XGBoost classifier evaluated on classification error, trained on the training split.
xg = XGBClassifier(use_label_encoder=False, eval_metric='error').fit(x_train, y_train)
xg

In [None]:
# Predict labels for the test split with the fitted XGBoost model.
y_xg = xg.predict(x_test)

In [None]:
# Confusion matrix and accuracy of the XGBoost predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_xg)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_xg)

In [None]:
# 10-fold cross-validated accuracy of the XGBoost model on the training split.
accu = cross_val_score(
    xg,
    x_train,
    y_train,
    scoring='accuracy',
    cv=10,
)
print('Accuracy of XGBoost: {:.2f}%'.format(accu.mean()*100))

In [None]:
# Logistic regression with C=0.25, trained on the scaled training split.
lr = LogisticRegression(C=0.25, random_state=0).fit(x_train, y_train)
lr

In [None]:
# Predict labels for the test split with the fitted logistic regression model.
y_lr = lr.predict(x_test)

In [None]:
# Confusion matrix and accuracy of the logistic regression predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_lr)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_lr)

In [None]:
# k-nearest-neighbours classifier with k=5, trained on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn

In [None]:
# RBF-kernel SVM, constructed here and fitted later in the evaluation loop.
# These values lie inside the grid searched above — presumably the best
# parameters it reported; confirm against the grid-search output.
sv = SVC(C= 0.25, gamma= 0.1, kernel= 'rbf', random_state= 0)

In [None]:
# Decision tree using the Gini criterion, constructed here and fitted later in
# the evaluation loop (presumably the grid search's best setting — confirm).
dt = DecisionTreeClassifier(criterion = 'gini',random_state = 0)

In [None]:
# Random forest with 200 entropy-criterion trees, constructed here and fitted
# later in the evaluation loop (presumably the grid search's best setting — confirm).
rf = RandomForestClassifier(criterion = 'entropy',n_estimators = 200,random_state = 0)

In [None]:
# Models to evaluate on the test split, in reporting order.
l = [knn, sv, dt, rf, lr]

In [None]:
# Fit every model on the training split, then print its confusion matrix and
# accuracy on the test split.
for model in l:
    predictions = model.fit(x_train, y_train).predict(x_test)
    print(model, confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))

In [None]:
# Summary of the results: one row per model, given as
# (model name, accuracy on the test set, accuracy with K-Fold).
_summary_rows = [
    ('LogisticRegression', '82.46%', '75.89%'),
    ('KNN', '79.87%', '72.80%'),
    ('svm', '81.81%', '76.39%'),
    ('Naive_Bayes', '79.22%', '74.27%'),
    ('DecisionTree', '76.62%', '70.70%'),
    ('RandomForest', '80.51%', '76.07%'),
    ('XGBoost', '81.81%', '75.08%'),
]
df1 = DataFrame(
    _summary_rows,
    columns=['Model', 'Accuracy on Test Set', 'Accuracy with K-Fold'],
)

In [None]:
# Show the summary table of model accuracies as the cell output.
df1

In [None]:
# df1 stores the accuracies as strings such as '82.46%'. Passing those directly
# gives seaborn a non-numeric y variable, so the bar heights would not reflect
# the accuracy. Strip the '%' sign and convert to floats before plotting.
test_accuracy = df1['Accuracy on Test Set'].str.rstrip('%').astype(float)

figure(figsize = (12,8))
ax = sns.barplot(x = df1['Model'], y = test_accuracy)
# Label the axes explicitly so the figure is readable on its own.
ax.set(xlabel = 'Model', ylabel = 'Accuracy on Test Set (%)');

In [None]:
# df1 stores the accuracies as strings such as '75.89%'. Passing those directly
# gives seaborn a non-numeric y variable, so the bar heights would not reflect
# the accuracy. Strip the '%' sign and convert to floats before plotting.
kfold_accuracy = df1['Accuracy with K-Fold'].str.rstrip('%').astype(float)

figure(figsize = (12,8))
ax = sns.barplot(x = df1['Model'], y = kfold_accuracy)
# Label the axes explicitly so the figure is readable on its own.
ax.set(xlabel = 'Model', ylabel = 'Accuracy with K-Fold (%)');

# **Conclusion**

***From the above DataFrame and bar plots, it is evident that the Logistic Regression model gives the highest accuracy on this test set. However, considering the accuracy obtained using K-Fold cross-validation, the SVC model with the 'rbf' kernel might perform better on new, unseen data.***