In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, classification_report, recall_score

In [2]:
data = pd.read_csv("../diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
columns = list(data.columns)
columns.remove("Outcome")
print (columns)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [4]:
length = len(data)
print (length)

768


In [5]:
# Fill missing values
for col in columns:
    median = data[col].mean()
    this_col = np.zeros(length, dtype=np.float32)
    for i in range(length):
        if data[col][i] == 0:
            this_col[i] = median
            continue
        this_col[i] = data[col][i]
    data[col] = this_col

In [6]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,79.799477,33.599998,0.627,50.0,1
1,1.0,85.0,66.0,29.0,79.799477,26.6,0.351,31.0,0
2,8.0,183.0,64.0,20.536459,79.799477,23.299999,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,3.845052,137.0,40.0,35.0,168.0,43.099998,2.288,33.0,1


In [7]:
def show_accuracy(y_true, y_pred):
    print (f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print (f"f1 score: {f1_score(y_true, y_pred)}")
    print (f"Recall  : {recall_score(y_true, y_pred)}")
    print (classification_report(y_true, y_pred))

In [8]:
Xs, Ys = data[columns], data["Outcome"]
print (data.shape)
print (Xs.shape, Ys.shape)

(768, 9)
(768, 8) (768,)


In [9]:
xtr, xte, ytr, yte = train_test_split(Xs, Ys, random_state=123, test_size=0.2)

In [10]:
from sklearn.model_selection import GridSearchCV
params = {'C': [0.1, 1, 2], "gamma": ["scale", "auto"]}
grid_search = GridSearchCV(SVC(), params, cv=5, scoring='recall')
grid_search.fit(xtr, ytr)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'C': 2, 'gamma': 'scale'}
Best score:  0.4523809523809524


In [11]:
# Best model
svm_model = SVC(C=2, gamma="scale")
svm_model.fit(xtr, ytr)

y_pred = svm_model.predict(xte)

show_accuracy(yte, y_pred)

Accuracy: 0.7857142857142857
f1 score: 0.6451612903225806
Recall  : 0.5172413793103449
              precision    recall  f1-score   support

           0       0.76      0.95      0.85        96
           1       0.86      0.52      0.65        58

    accuracy                           0.79       154
   macro avg       0.81      0.73      0.75       154
weighted avg       0.80      0.79      0.77       154

