In [74]:
def calc_sensitivity(truePositive, falseNegative):
	return (truePositive) / (truePositive + falseNegative);

def calc_specificity(trueNegative, falsePositive):
	return (trueNegative) / (trueNegative + falsePositive)

# Reading dataset and describing

In [75]:
# Reading and storage
import pandas as pd
# Sql queries for dataframes, for easy filtering
from pandasql import sqldf as sql
dataset = pd.read_csv("diabetes_binary_classification_data.csv")
dataset.describe()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.139333,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.346294,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


# Outlining dataset features

In [76]:
for col in dataset.columns: print(col)

Diabetes_binary
HighBP
HighChol
CholCheck
BMI
Smoker
Stroke
HeartDiseaseorAttack
PhysActivity
Fruits
Veggies
HvyAlcoholConsump
AnyHealthcare
NoDocbcCost
GenHlth
MentHlth
PhysHlth
DiffWalk
Sex
Age
Education
Income


# Finding correlations
Setting up to find correlations between the different variables in order to find which factors are irrelevant.

In [77]:
feature = "Diabetes_binary"
correlations = dataset.corr()[feature].sort_values(ascending=False).drop(feature)
correlations

GenHlth                 0.293569
HighBP                  0.263129
DiffWalk                0.218344
BMI                     0.216843
HighChol                0.200276
Age                     0.177442
HeartDiseaseorAttack    0.177282
PhysHlth                0.171337
Stroke                  0.105816
MentHlth                0.069315
CholCheck               0.064761
Smoker                  0.060789
NoDocbcCost             0.031433
Sex                     0.031430
AnyHealthcare           0.016255
Fruits                 -0.040779
Veggies                -0.056584
HvyAlcoholConsump      -0.057056
PhysActivity           -0.118133
Education              -0.124456
Income                 -0.163919
Name: Diabetes_binary, dtype: float64

## Looking at feature relevancy
A good way of determining if a feature is likely to be relevant or not, is to predetermine an alpha value for the minimum required correlation.

### Likely Irrelevant Features

In [78]:
alpha = 0.05

correlation_lower_relevancy = correlations.between(-alpha, alpha)
correlation_lower_relevancy[correlation_lower_relevancy == True]

NoDocbcCost      True
Sex              True
AnyHealthcare    True
Fruits           True
Name: Diabetes_binary, dtype: bool

### Likely Relevant Features

In [79]:
alpha = 0.2
correlation_upper_relevancy = ~(correlations.between(-alpha, alpha))
correlation_upper_relevancy[correlation_upper_relevancy == True]

GenHlth     True
HighBP      True
DiffWalk    True
BMI         True
HighChol    True
Name: Diabetes_binary, dtype: bool

# Preprocessing

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

y_axis = "Diabetes_binary"

scaler = StandardScaler()

data = dataset.copy().drop(y_axis, axis=1)
label = np.array(dataset[y_axis]).reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42)

#x_train = scaler.fit_transform(x_train)
#x_test = scaler.transform(x_test)

# SGD classification

In [81]:
from sklearn.linear_model import SGDClassifier

SGD_model = SGDClassifier(loss="log_loss", class_weight="balanced", random_state=42)
SGD_model.fit(x_train, y_train.ravel())

In [82]:
from sklearn.metrics import accuracy_score, confusion_matrix

y_predict = SGD_model.predict(x_test)
score = accuracy_score(y_test, y_predict)
confusion = confusion_matrix(y_test, y_predict)

TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
TP = confusion[1][1]

print("Score:", score)
print("Sensitivity:", calc_sensitivity(TP, FN))
print("Specificity:", calc_specificity(TN, FP))


print(f"{TP}|{FP}\n----------\n{FN}|{TN}")

Score: 0.7236281929990539
Sensitivity: 0.7824782049449764
Specificity: 0.7142138594846704
5475|12500
----------
1522|31239


Using our inputs, the model has the desired result of predicting very few false negatives, and many true positives. However, the model very often predicts false positives.

# SVM classification

In [86]:
from sklearn import svm

svm_model = svm.LinearSVC(random_state=42, verbose=True)
svm_model.fit(x_train, y_train.ravel())





[LibLinear]


Liblinear failed to converge, increase the number of iterations.

