In [2]:
%matplotlib inline

In [3]:
#Dependencies
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [4]:
os.chdir('/Users/ruddysimonpour/Desktop/Diabetes/source')
diabetes_file = pd.read_csv("pima-data.csv", low_memory=False)

In [5]:
diabetes_file

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.3790,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0000,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.3790,True
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,1.8912,False
764,2,122,70,27,0,36.8,0.340,27,1.0638,False
765,5,121,72,23,112,26.2,0.245,30,0.9062,False
766,1,126,60,0,0,30.1,0.349,47,0.0000,True


In [7]:
diabetes_file.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_preg,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
glucose_conc,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
diastolic_bp,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
thickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
bmi,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
diab_pred,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
skin,768.0,0.809136,0.628517,0.0,0.0,0.9062,1.2608,3.9006


### Checking for Null values

In [8]:
# To check if there is null values
diabetes_file.isnull()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False,False


In [9]:
diabetes_file.isnull().values.any()

False

## Matrix of correlation between attribitues

In [10]:
# find correlations between different trends
corr = diabetes_file.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
num_preg,1.0,0.129459,0.141282,-0.0816718,-0.0735346,0.0176831,-0.0335227,0.544341,-0.0816718,0.221898
glucose_conc,0.129459,1.0,0.15259,0.0573279,0.331357,0.221071,0.137337,0.263514,0.0573279,0.466581
diastolic_bp,0.141282,0.15259,1.0,0.207371,0.0889334,0.281805,0.0412649,0.239528,0.207371,0.0650684
thickness,-0.0816718,0.0573279,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.0747522
insulin,-0.0735346,0.331357,0.0889334,0.436783,1.0,0.197859,0.185071,-0.042163,0.436783,0.130548
bmi,0.0176831,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.0362419,0.392573,0.292695
diab_pred,-0.0335227,0.137337,0.0412649,0.183928,0.185071,0.140647,1.0,0.0335613,0.183928,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.0362419,0.0335613,1.0,-0.11397,0.238356
skin,-0.0816718,0.0573279,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.0747522
diabetes,0.221898,0.466581,0.0650684,0.0747522,0.130548,0.292695,0.173844,0.238356,0.0747522,1.0


Correlated features will not always worsen your model, but they will not always improve it either.
There are three main reasons why you would remove correlated features:

1- Make the learning algorithm faster
Due to the curse of dimensionality, less features usually mean high improvement in terms of speed.

If speed is not an issue, perhaps don't remove these features right away (see next point)

2- Decrease harmful bias
The keyword being harmful. If you have correlated features but they are also correlated to the target, you want to keep them. You can view features as hints to make a good guess, if you have two hints that are essentially the same, but they are good hints, it may be wise to keep them.

Some algorithms like Naive Bayes actually directly benefit from "positive" correlated features. And others like random forest may indirectly benefit from them.

Imagine having 3 features A, B, and C. A and B are highly correlated to the target and to each other, and C isn't at all. If you sample out of the 3 features, you have 2/3 chance to get a "good" feature, whereas if you remove B for instance, this chance drops to 1/2

Of course, if the features that are correlated are not super informative in the first place, the algorithm may not suffer much.

So moral of the story, removing these features might be necessary due to speed, but remember that you might make your algorithm worse in the process. Also, some algorithms like decision trees have feature selection embedded in them.

A good way to deal with this is to use a wrapper method for feature selection. It will remove redundant features only if they do not contribute directly to the performance. If they are useful like in naive bayes, they will be kept. (Though remember that wrapper methods are expensive and may lead to overfitting)

3- Interpretability of your model
If your model needs to be interpretable, you might be forced to make it simpler. Make sure to also remember Occam's razor. If your model is not "that much" worse with less features, then you should probably use less features.

In [11]:
# Removing correlated columns (skin and thikness are corrolated to each other)
del diabetes_file['skin']

In [12]:
diabetes_file

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,False
764,2,122,70,27,0,36.8,0.340,27,False
765,5,121,72,23,112,26.2,0.245,30,False
766,1,126,60,0,0,30.1,0.349,47,True


### Changing diabetes values to 0 and 1

In [13]:
mapping = {True:1, False:0}
diabetes_file["diabetes"] = diabetes_file["diabetes"].map(mapping)
diabetes_file

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Using SVC Algorithm


In [14]:
# Define the target
target = diabetes_file["diabetes"]
target_names = ["positive", "negative"]

In [15]:
del diabetes_file["diabetes"]
diabetes_file

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(diabetes_file, target, random_state = 42) 

In [21]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [22]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.729


In [23]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    positive       0.79      0.78      0.79       123
    negative       0.62      0.64      0.63        69

    accuracy                           0.73       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.73      0.73      0.73       192

