In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from numpy.random import randint

Kaggle Dataset link:https://www.kaggle.com/datasets/akshaydattatraykhare/diabetes-dataset

In [4]:
df =pd.read_csv('/content/drive/MyDrive/ml training/diabetes.csv')

In [5]:
df.isna().sum()
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [7]:
x=df.drop(['Outcome'], axis=1)
y=df['Outcome']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)


In [8]:
sc=StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)

In [9]:
# Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train_sc,y_train)

In [10]:
y_pred_dt=dt_clf.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_dt))
print('Classification report:',classification_report(y_test,y_pred_dt))

Accuracy score: 0.7467532467532467
Classification report:               precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [11]:
# SVM
svm = SVC(kernel='linear',random_state=42)
svm.fit(x_train_sc,y_train)

In [13]:
y_pred_svm=svm.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_svm))
print('Classification report:',classification_report(y_test,y_pred_svm))

Accuracy score: 0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.81      0.82      0.81        99
           1       0.67      0.65      0.66        55

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154



In [14]:
# Random forest
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_sc,y_train)

In [15]:
y_pred_rf = rf.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_rf))
print('Classification report:',classification_report(y_test,y_pred_rf))

Accuracy score: 0.7207792207792207
Classification report:               precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



In [16]:
# Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(x_train_sc,y_train)

In [17]:
y_pred_lr = lr.predict(x_test_sc)
print('Accuracy score',accuracy_score(y_test,y_pred_lr))
print('Classification report:',classification_report(y_test,y_pred_lr))

Accuracy score 0.7532467532467533
Classification report:               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [20]:
knc = KNeighborsClassifier()
knc.fit(x_train_sc, y_train)

In [21]:
y_pred_knc = knc.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_knc))
print('Classification report:',classification_report(y_test,y_pred_knc))

Accuracy score: 0.6948051948051948
Classification report:               precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.58      0.51      0.54        55

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.66       154
weighted avg       0.69      0.69      0.69       154



Best performing model is SVM with accuracy of 75.3% so let's tune its hyperparameter using randomized search cv

In [23]:
param ={'C':[0.01, 0.1, 1, 10, 100],
  'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
  'gamma':[0.1, 1, 0.001, 0.01]
}
grid = RandomizedSearchCV(svm, param, refit=True, verbose=3, cv=5)
grid.fit(x_train_sc, y_train)
y_pred_grid = grid.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_grid))
print('Classification report:',classification_report(y_test,y_pred_grid))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.740 total time=   0.0s
[CV 2/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.740 total time=   0.0s
[CV 3/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.715 total time=   0.0s
[CV 4/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.691 total time=   0.0s
[CV 5/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.713 total time=   0.0s
[CV 1/5] END .C=10, gamma=0.001, kernel=sigmoid;, score=0.732 total time=   0.0s
[CV 2/5] END .C=10, gamma=0.001, kernel=sigmoid;, score=0.813 total time=   0.0s
[CV 3/5] END .C=10, gamma=0.001, kernel=sigmoid;, score=0.748 total time=   0.0s
[CV 4/5] END .C=10, gamma=0.001, kernel=sigmoid;, score=0.748 total time=   0.0s
[CV 5/5] END .C=10, gamma=0.001, kernel=sigmoid;, score=0.762 total time=   0.0s
[CV 1/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.707 total time=   0.0s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;

In [24]:
print(grid.best_params_)
print(grid.best_estimator_)

{'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
SVC(C=1, gamma=0.01, random_state=42)


In [26]:
svm = SVC(C=1, kernel='rbf',gamma=0.01)
svm.fit(x_train_sc,y_train)
y_pred_r = svm.predict(x_test_sc)
print('Accuracy score:',accuracy_score(y_test,y_pred_r))
print('Classification report:',classification_report(y_test,y_pred_r))

Accuracy score: 0.7662337662337663
Classification report:               precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154

