In [410]:
# A simple function to calculate the sensitivity
def calc_sensitivity(truePositive, falseNegative):
	return (truePositive) / (truePositive + falseNegative);

# A simple function to calculate the specificity
def calc_specificity(trueNegative, falsePositive):
	return (trueNegative) / (trueNegative + falsePositive)

# Reading dataset and describing

In [411]:
# Reading and storage
import pandas as pd
dataset = pd.read_csv("diabetes_binary_classification_data.csv")
dataset.describe()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.139333,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.346294,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


# Outlining dataset features

In [412]:
for col in dataset.columns: print(col)

Diabetes_binary
HighBP
HighChol
CholCheck
BMI
Smoker
Stroke
HeartDiseaseorAttack
PhysActivity
Fruits
Veggies
HvyAlcoholConsump
AnyHealthcare
NoDocbcCost
GenHlth
MentHlth
PhysHlth
DiffWalk
Sex
Age
Education
Income


# Finding correlations
Setting up to find correlations between the different variables in order to find which factors are irrelevant.

In [413]:
feature = "Diabetes_binary"
correlations = dataset.corr()[feature].sort_values(ascending=False)
correlations

Diabetes_binary         1.000000
GenHlth                 0.293569
HighBP                  0.263129
DiffWalk                0.218344
BMI                     0.216843
HighChol                0.200276
Age                     0.177442
HeartDiseaseorAttack    0.177282
PhysHlth                0.171337
Stroke                  0.105816
MentHlth                0.069315
CholCheck               0.064761
Smoker                  0.060789
NoDocbcCost             0.031433
Sex                     0.031430
AnyHealthcare           0.016255
Fruits                 -0.040779
Veggies                -0.056584
HvyAlcoholConsump      -0.057056
PhysActivity           -0.118133
Education              -0.124456
Income                 -0.163919
Name: Diabetes_binary, dtype: float64

## Looking at feature relevancy
A good way of determining if a feature is likely to be relevant or not, is to predetermine an alpha value for the minimum required correlation.

### Likely Irrelevant Features

In [414]:
alpha = 0.1

# Finding all the correlations with a coefficient between -0.1 and 0.1
correlation_lower_relevancy = correlations.between(-alpha, alpha)
lowly_correlated = correlation_lower_relevancy[correlation_lower_relevancy == True].index.tolist()
lowly_correlated

['MentHlth',
 'CholCheck',
 'Smoker',
 'NoDocbcCost',
 'Sex',
 'AnyHealthcare',
 'Fruits',
 'Veggies',
 'HvyAlcoholConsump']

### Likely Relevant Features
We're using an automated approach to feature engineering in order to determine which features are very likely to be correlated with the result. Through this automated approach, we eliminate bias in feature engineering, but this bias might be beneficial in some cases in terms of strange correlations (such as income, in this case).

In [415]:
alpha = 0.15
# Finding all the correlations with an absolute coefficient higher than 0.15
correlation_upper_relevancy = ~(correlations.between(-alpha, alpha))
highly_correlated = correlation_upper_relevancy[correlation_upper_relevancy == True].keys()


datasetLimited = dataset[highly_correlated]
datasetLimited

Unnamed: 0,Diabetes_binary,GenHlth,HighBP,DiffWalk,BMI,HighChol,Age,HeartDiseaseorAttack,PhysHlth,Income
0,0.0,5.0,1.0,1.0,40.0,1.0,9.0,0.0,15.0,3.0
1,0.0,3.0,0.0,0.0,25.0,0.0,7.0,0.0,0.0,1.0
2,0.0,5.0,1.0,1.0,28.0,1.0,9.0,0.0,30.0,8.0
3,0.0,2.0,1.0,0.0,27.0,0.0,11.0,0.0,0.0,6.0
4,0.0,2.0,1.0,0.0,24.0,1.0,11.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...
253675,0.0,3.0,1.0,0.0,45.0,1.0,5.0,0.0,5.0,7.0
253676,1.0,4.0,1.0,1.0,18.0,1.0,11.0,0.0,0.0,4.0
253677,0.0,1.0,0.0,0.0,28.0,0.0,2.0,0.0,0.0,2.0
253678,0.0,3.0,1.0,0.0,23.0,0.0,7.0,0.0,0.0,1.0


# Preprocessing

In [416]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

y_axis = "Diabetes_binary"

# Creating two different scalers, one for the
# regular dataset and one for the limited dataset
scaler = StandardScaler()
lim_scaler = StandardScaler()

data = dataset.copy().drop(y_axis, axis=1)
limdata = datasetLimited.copy().drop(y_axis, axis=1)
label = np.array(dataset[y_axis]).reshape(-1, 1)

# Splitting the dataset into training and testing
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42)
x_lim_train, x_lim_test, y_lim_train, y_lim_test = train_test_split(limdata, label, test_size=0.2, random_state=42)

# Scaling the data to separate variables,
# since random trees do not require scaling.
sx_train = scaler.fit_transform(x_train)
sx_test = scaler.transform(x_test)
sx_lim_train = lim_scaler.fit_transform(x_lim_train)
sx_lim_test = lim_scaler.fit_transform(x_lim_test)

In [417]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Prints out some simple metrics about a model,
# including a confusion matrix, the accuracy score,
# and sensitivity/specificity.
def get_metrics(model, x_test, y_test):
	y_predict = model.predict(x_test)
	score = accuracy_score(y_test, y_predict)
	confusion = confusion_matrix(y_test, y_predict)

	TN = confusion[0][0]
	FP = confusion[0][1]
	FN = confusion[1][0]
	TP = confusion[1][1]

	print("Score:", score)
	print("Sensitivity:", calc_sensitivity(TP, FN))
	print("Specificity:", calc_specificity(TN, FP))


	print(f"{TP}|{FP}\n----------\n{FN}|{TN}")

# SGD classification

In [418]:
from sklearn.linear_model import SGDClassifier

# Function for training SGD using an input training dataset
def train_sgd(x, y):
	model = SGDClassifier(
		loss="log_loss",
		class_weight="balanced",
		random_state=42)
	model.fit(x, y.ravel())
	print("Finished training model")
	return model

SGD_model = train_sgd(sx_train, y_train)
SGD_model_lim = train_sgd(sx_lim_train, y_lim_train)

Finished training model
Finished training model


In [419]:
print("Full model:")
get_metrics(SGD_model, sx_test, y_test)
print("\nLimited model:")
get_metrics(SGD_model_lim, sx_lim_test, y_lim_test)

Full model:
Score: 0.7236281929990539
Sensitivity: 0.7824782049449764
Specificity: 0.7142138594846704
5475|12500
----------
1522|31239

Limited model:
Score: 0.734271523178808
Sensitivity: 0.7558953837358868
Specificity: 0.7308123185258008
5289|11774
----------
1708|31965


Using our inputs, the model has the desired result of predicting very few false negatives, and many true positives. However, the model very often predicts false positives. More so than it predicts true positives.

# Random Forest

In [420]:
# We opted to try mixing a baggingclassifier together with a decisiontreeclassifier.
# We could use RandomForestRegressor, but this felt like it would give more learning.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Function for training a random forest using an input training dataset
def train_random_forest(x, y):
	bagger = BaggingClassifier(
		estimator=DecisionTreeClassifier(
			max_depth=3,
			class_weight="balanced",
			random_state=42
		),
		max_samples=500,
		n_estimators=500,
		bootstrap=True,
		n_jobs=-1,
		random_state=42)
	bagger.fit(x, y.ravel())
	print("Finished training model")
	return bagger

In [421]:
ranfor_model = train_random_forest(x_train, y_train)
ranfor_model_lim = train_random_forest(x_lim_train, y_lim_train)

print("Full model:")
get_metrics(ranfor_model, x_test, y_test)
print("\nLimited model:")
get_metrics(ranfor_model_lim, x_lim_test, y_lim_test)

Finished training model
Finished training model
Full model:
Score: 0.7609981078524125
Sensitivity: 0.7111619265399457
Specificity: 0.7689704840074075
4976|10105
----------
2021|33634

Limited model:
Score: 0.7578051087984863
Sensitivity: 0.7167357438902386
Specificity: 0.7643750428679211
5015|10306
----------
1982|33433


# Decision Tree (No bagging)

In [422]:
# Function for training a decision tree using an input training dataset
def train_decision_tree(x,y, depth=10):
	model = DecisionTreeClassifier(
		max_depth=depth,
		class_weight="balanced",
		random_state=42
	)
	model.fit(x,y.ravel())
	return model

In [423]:
decision_tree_model = train_decision_tree(x_train, y_train)
decision_tree_model_lim = train_decision_tree(x_lim_train, y_lim_train)

print("Full model:")
get_metrics(decision_tree_model, x_test, y_test)
print("\nLimited model:")
get_metrics(decision_tree_model_lim, x_lim_test, y_lim_test)

Full model:
Score: 0.7209673604541155
Sensitivity: 0.772188080605974
Specificity: 0.7127734973364732
5403|12563
----------
1594|31176

Limited model:
Score: 0.715054399243141
Sensitivity: 0.7857653279977133
Specificity: 0.7037426552961887
5498|12958
----------
1499|30781


# AdaBoost

In [424]:
from sklearn.ensemble import AdaBoostClassifier

# Function for training an adaboost classifier utilizing decision trees,
# using an input training dataset
def train_adaboost(x, y, tree_depth=2, estimators=200):
	adabooster = AdaBoostClassifier(
		estimator=DecisionTreeClassifier(
			max_depth=tree_depth,
			class_weight="balanced",
			random_state=42),
		n_estimators=estimators,
		algorithm="SAMME.R",
		learning_rate=0.5,
		random_state=42,
	)
	adabooster.fit(x, y.ravel())

	print("Finished training model")
	return adabooster

In [425]:
adaboost_model = train_adaboost(x_train, y_train, 2, 150)
adaboost_model_lim = train_adaboost(x_lim_train, y_lim_train, 2, 150)

print("Full model:")
get_metrics(adaboost_model, x_test, y_test)
print("\nLimited model:")
get_metrics(adaboost_model_lim, x_lim_test, y_lim_test)

Finished training model
Finished training model
Full model:
Score: 0.7198044780826238
Sensitivity: 0.8117764756324138
Specificity: 0.7050915658794211
5680|12899
----------
1317|30840

Limited model:
Score: 0.7152514979501734
Sensitivity: 0.8030584536229812
Specificity: 0.7012048743684126
5619|13069
----------
1378|30670
