## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Loading data

In [2]:
data = pd.read_csv("data.csv")
data.to_numpy()
data.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Feature Extraction and train test split

In [21]:
X = data.iloc[:,:8]
y = data['class']
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.33, random_state=69)
print(X,X_test.shape) ## to check if ratio is correct

     pregnancies  glucose  bpressure  skinfold  insulin   bmi  pedigree  age
0              6      148         72        35        0  33.6     0.627   50
1              1       85         66        29        0  26.6     0.351   31
2              8      183         64         0        0  23.3     0.672   32
3              1       89         66        23       94  28.1     0.167   21
4              0      137         40        35      168  43.1     2.288   33
..           ...      ...        ...       ...      ...   ...       ...  ...
763           10      101         76        48      180  32.9     0.171   63
764            2      122         70        27        0  36.8     0.340   27
765            5      121         72        23      112  26.2     0.245   30
766            1      126         60         0        0  30.1     0.349   47
767            1       93         70        31        0  30.4     0.315   23

[768 rows x 8 columns] (254, 8)


# Normalizing data

In [4]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test[:5,:]

array([[-0.57839612, -0.97902812, -0.59720886,  0.35125017, -0.53870698,
        -0.42586261,  0.83950377, -0.98322942],
       [ 2.69840914,  0.96402522,  1.05231109,  0.79175144, -0.4174253 ,
        -0.62475489,  0.73684272,  0.8055753 ],
       [ 0.61316943, -0.67542604, -0.59720886,  0.79175144,  1.08460155,
         0.27026036, -0.14604234,  0.8055753 ],
       [-0.28050473,  0.1746598 ,  0.9492161 ,  1.29518145,  1.50442272,
         0.92909104,  0.65764705, -0.55732354],
       [ 1.50684359, -0.8879475 , -0.08173387, -1.28489738, -0.68797673,
        -0.94795485, -0.82067212,  2.08329295]])

# Training SVM with hyper parameter optimization

In [48]:
for k in ('linear','poly','rbf','sigmoid'):
    clf = svm.SVC(kernel=k)
    clf.fit(X_train,y_train)
    y_pred= clf.predict(X_train)
    score = accuracy_score(y_train, y_pred)*100
    print("accuracy for kernel",k,"is",round(score,2),'%')  b

accuracy for kernel linear is 77.82 %
accuracy for kernel poly is 78.21 %
accuracy for kernel rbf is 77.24 %
accuracy for kernel sigmoid is 52.92 %


# Training SVM with Polynomial kernel

In [50]:
clf= svm.SVC(kernel = 'poly')
clf.fit(X_train, y_train)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Testing accuracy on test data set

In [54]:
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)*100
print(y_pred)
print("accuracy :",round(score,2),'%')  

[0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
accuracy : 74.8 %


# Testing with zero values

In [61]:
y_zeros = np.zeros(y_test.shape)
score = accuracy_score(y_test, y_zeros)*100
print(y_pred)
print("accuracy :",round(score,2),'%')  

[0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 1 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
accuracy : 64.57 %


it means that 64.57% people do not have diabetes making our data set unbalanced!

# Precision Testing and Recall

In [62]:
print(classification_report(y_test,y_pred)  )

              precision    recall  f1-score   support

           0       0.75      0.93      0.83       164
           1       0.76      0.42      0.54        90

    accuracy                           0.75       254
   macro avg       0.75      0.67      0.68       254
weighted avg       0.75      0.75      0.73       254

