In [20]:
import warnings
import random

import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import ConvergenceWarning

from collections import Counter

# Suppress ConvergenceWarning and UserWarning globally for the rest of the notebook.
# NOTE(review): silencing every UserWarning is broad — confirm it does not hide
# anything beyond the messages it was presumably added for.
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', UserWarning)

Notes: 
- For the final presentation, maybe mention the development date of each algorithm?

# **0. Data Prep**

In [2]:
# Load ./data/clean_data.csv into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv('./data/clean_data.csv')

In [3]:
# Drop the first column by position.
# NOTE(review): presumably a leftover index column from an earlier export — confirm.
# This statement is not idempotent: re-running the cell without re-loading `df`
# removes one more column each time.
df = df.drop(df.columns[0], axis=1)

# Remove columns containing NaN values
df = df.dropna(axis=1)

# Renumber the rows 0..n-1. The previous `set_axis(..., inplace=True)` call is
# deprecated since pandas 1.5 and the `inplace` argument no longer exists in
# pandas 2.0, so the index is rebuilt by reassignment instead.
df = df.reset_index(drop=True)

# Separate the target ('diabetes') from the features
target = df['diabetes']
features = df.drop(['diabetes'], axis=1)

# Split into training and test data (20 % held out, fixed seed)
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# print(features_train.shape)
# print(features_test.shape)
# print(target_train.shape)
# print(target_test.shape)

  df.set_axis(range(len(df)), inplace=True)


# **1. Support Vector Machines**

**Conclusion:**

- Using a Pipeline/StandardScaler: Best reachable Accuracy: 76 % with max_iter = 93

-> Don't use; 2. & 3. perform better

In [4]:
# Optimizing max_iter to reach the highest possible Accuracy

# MAX_EVALS should be the same as/max the SEARCHSPACE so all possibilities are tried out
# MAX_EVALS = 100
# SEARCH_SPACE = [hp.randint('max_iter',100)]

### Optimization ##############################################################################################################
# def cost_function(max_iter):
#     max_iter = max_iter[0]
#     print(max_iter)
#     if max_iter == 0:
#         return 0
#     svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter)).fit(features_train, target_train)
#     svm_predictions = svm_classifier.predict(features_test)
#    svm_accuracy = accuracy_score(target_test, svm_predictions)
# 
#     return {'loss': - svm_accuracy , 'status': STATUS_OK }

# trials = Trials()
# best = fmin(cost_function,
#     space = SEARCH_SPACE,
#     algo = tpe.suggest,
#     max_evals = MAX_EVALS, 
#     trials = trials)

# print(best)

45                                                     
Accuracy : 66.83766360169928                           
95                                                                                
Accuracy : 70.80761699740069                                                      
42                                                                                
Accuracy : 66.33154320916971                                                      
86                                                                                
Accuracy : 68.39724725469765                                                      
22                                                                                
Accuracy : 65.9204635238346                                                       
21                                                                                
Accuracy : 63.37841087357295                                                      
25                                                        

In [5]:
# Predicting for one specific 'person' #################################################################################################

# max_iter = best['max_iter']
# svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter)).fit(features_train, target_train)

# taking a random row from the test data to predict a result for:
# def make_test_prediction(svm_classifier):
#     rand_index = random.randint(0, 32581)
#     test_row = features_test.iloc[rand_index] #.values.flatten().tolist()
#     test_groundtruth = target_test.iloc[rand_index]
#     prediction = svm_classifier.predict([test_row])
#     
#     return (prediction, test_groundtruth)

# prediction, test_groundtruth = make_test_prediction(svm_classifier)
# print (prediction == test_groundtruth)

[ True]




# **2. Stochastic Gradient Descent**

**Conclusion:**

- Using a Pipeline/StandardScaler: Best reachable Accuracy: 88 %, with max_iter = 14

In [7]:
# Optimizing max_iter to reach the highest possible Accuracy

# MAX_EVALS is the number of trials handed to fmin as `max_evals`.
# The original intent was for MAX_EVALS to match the size of the search space so
# that all possibilities are tried out.
# NOTE(review): with MAX_EVALS = 10 and hp.randint('max_iter', 100) only a small
# part of the candidate values is evaluated — raise MAX_EVALS or shrink the
# space if full coverage is still wanted.
MAX_EVALS = 10
SEARCH_SPACE = [hp.randint('max_iter',100)]

### Optimization ##############################################################################################################
def cost_function(max_iter):
    """Hyperopt objective: negated accuracy of a scaled SGD classifier.

    Args:
        max_iter: The point sampled from ``SEARCH_SPACE`` — a one-element
            sequence whose first item is the ``max_iter`` value to evaluate.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. ``loss`` is the negated
        accuracy on ``features_test`` / ``target_test`` (fmin minimises, so a
        higher accuracy gives a lower loss). A sampled value of ``0`` is not
        fitted and is reported with loss ``0``.
    """
    max_iter = max_iter[0]
    if max_iter == 0:
        # Skip fitting; return the same result structure as the regular path
        # instead of a bare number so every trial has a uniform shape.
        return {'loss': 0, 'status': STATUS_OK}

    # NOTE(review): the candidate is scored on the held-out test split, so the
    # best accuracy found here is optimistically biased; consider a separate
    # validation split or cross-validation on the training data.
    sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(max_iter=max_iter))
    sgd_classifier.fit(features_train, target_train)
    sgd_predictions = sgd_classifier.predict(features_test)
    sgd_accuracy = accuracy_score(target_test, sgd_predictions)

    return {'loss': -sgd_accuracy, 'status': STATUS_OK}

# Run the TPE search for MAX_EVALS trials; `trials` keeps the per-trial history
# and `best` receives the parameter assignment with the lowest loss.
trials = Trials()
best = fmin(
    fn=cost_function,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
)

print(best)

100%|██████████| 10/10 [00:37<00:00,  3.75s/trial, best loss: -0.8882985423274667]
{'max_iter': 14}


In [14]:
# Refit the scaled SGD pipeline on the training split with the max_iter value
# returned by the search.
max_iter = best['max_iter']
sgd_classifier = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=max_iter),
).fit(features_train, target_train)

In [15]:
# Predicting for one specific 'person'  #################################################################################################

# taking a random row from the test data to predict a result for:
def make_test_prediction(sgd_classifier, features=None, target=None):
    """Predict one randomly chosen row and return it with its true label.

    Args:
        sgd_classifier: Fitted estimator exposing ``predict``.
        features: DataFrame to sample the row from. Defaults to the
            notebook-level ``features_test``.
        target: Labels aligned positionally with ``features``. Defaults to the
            notebook-level ``target_test``.

    Returns:
        tuple: ``(prediction, test_groundtruth)`` — the estimator's output for
        the single sampled row and the label stored at the same position.

    Raises:
        ValueError: If ``features`` has no rows (nothing to sample from).
    """
    if features is None:
        features = features_test
    if target is None:
        target = target_test

    # Sample over the real number of rows instead of a hard-coded upper bound,
    # so every row is reachable and the index can never run past the end.
    rand_index = random.randrange(len(features))
    # Double brackets keep a one-row DataFrame, so the estimator receives the
    # same column names it was fitted with.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = target.iloc[rand_index]
    prediction = sgd_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Spot-check a single random test row: prints an array holding True when the
# prediction equals the ground-truth label, False otherwise.
prediction, test_groundtruth = make_test_prediction(sgd_classifier)
print (prediction == test_groundtruth)

[False]




# **3. Stochastic Gradient Boosting**

In [24]:
# Optimizing n_estimators to reach the highest possible Accuracy

# N_ESTIMATORS is the evaluation budget handed to fmin as `max_evals` below; it
# is not itself an n_estimators value for the model.
# The original intent was for it to be at most the size of the search space so
# that all possibilities are tried once.
# NOTE(review): 100 evaluations over hp.randint('n_estimators', 400) cover only
# part of the candidate values.
N_ESTIMATORS = 100
SEARCH_SPACE = [hp.randint('n_estimators',400)]

### Optimization ##############################################################################################################
def cost_function(n_estimators):
    """Hyperopt objective: negated accuracy of a scaled gradient-boosting model.

    Args:
        n_estimators: The point sampled from ``SEARCH_SPACE`` — a one-element
            sequence whose first item is the ``n_estimators`` value to evaluate.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. ``loss`` is the negated
        accuracy on ``features_test`` / ``target_test`` (fmin minimises, so a
        higher accuracy gives a lower loss). A sampled value of ``0`` is not
        fitted and is reported with loss ``0``.
    """
    n_estimators = n_estimators[0]
    if n_estimators == 0:
        # Skip fitting; return the same result structure as the regular path
        # instead of a bare number so every trial has a uniform shape.
        return {'loss': 0, 'status': STATUS_OK}

    # NOTE(review): the candidate is scored on the held-out test split, so the
    # best accuracy found here is optimistically biased; consider a separate
    # validation split or cross-validation on the training data.
    booster = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=0.5, random_state=0)
    sgb_classifier = make_pipeline(StandardScaler(), booster)
    sgb_classifier.fit(features_train, target_train)
    sgb_predictions = sgb_classifier.predict(features_test)
    sgb_accuracy = accuracy_score(target_test, sgb_predictions)
    return {'loss': -sgb_accuracy, 'status': STATUS_OK}

# Run the TPE search for N_ESTIMATORS trials; `trials` keeps the per-trial
# history and `best` receives the parameter assignment with the lowest loss.
trials = Trials()
best = fmin(
    fn=cost_function,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=N_ESTIMATORS,
    trials=trials,
)

print(best)

 29%|██▉       | 29/100 [3:00:29<9:20:26, 473.62s/trial, best loss: -0.898489654303741] 

In [21]:
# Refit the scaled gradient-boosting pipeline on the training split with the
# n_estimators value returned by the search.
n_estimators = best['n_estimators']

sgb_classifier = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=0.5, random_state=0),
).fit(features_train, target_train)

In [None]:
import pickle

# Round-trip the fitted pipeline through pickle in memory; `model` ends up as
# the deserialised copy of `sgb_classifier`.
model = pickle.loads(pickle.dumps(sgb_classifier))

In [22]:
# Verify performance / see false negatives vs. false positives (ca: 7157 - 1855)

# Predict labels for the whole test split with the fitted gradient-boosting pipeline.
predictions = sgb_classifier.predict(features_test)

def calc_metrics(predictions, target_test):
    """Count correct predictions, false negatives and false positives.

    Labels are compared position by position; ``1`` is treated as the positive
    class and ``0`` as the negative class.

    Args:
        predictions: Predicted labels (array, list or Series).
        target_test: Ground-truth labels of the same length (Series, array or
            list); any pandas index is ignored.

    Returns:
        tuple: ``(prediction_true, false_negative, false_positive, accuracy)``.
        The first three are ``int`` counts; ``accuracy`` is
        ``1 - (false_negative + false_positive) / len(predictions)``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    predicted = pd.Series(predictions).to_numpy()
    actual = pd.Series(target_test).to_numpy()

    # The row count comes from the data itself rather than a fixed constant, so
    # the function works for any split size.
    n_rows = len(predicted)
    if n_rows != len(actual):
        raise ValueError(
            f"predictions and target_test differ in length: {n_rows} != {len(actual)}"
        )
    if n_rows == 0:
        raise ValueError("cannot compute metrics for empty input")

    # Vectorised comparisons replace the per-row Python loop.
    prediction_true = int((actual == predicted).sum())
    false_negative = int(((actual == 1) & (predicted == 0)).sum())
    false_positive = int(((actual == 0) & (predicted == 1)).sum())
    accuracy = 1 - (false_negative + false_positive) / n_rows

    return (prediction_true, false_negative, false_positive, accuracy)

# Printed order: correct predictions, false negatives, false positives, accuracy.
prediction_true, false_negative, false_positive, accuracy = calc_metrics(predictions, target_test)
print(prediction_true, false_negative, false_positive, accuracy)

78468 6894 1969 0.8985125556789685


In [19]:
# Predicting for one specific 'person' #################################################################################################

# taking a random row from the test data to predict a result for:
def make_test_prediction(sgb_classifier, features=None, target=None):
    """Predict one randomly chosen row and return it with its true label.

    Args:
        sgb_classifier: Fitted estimator exposing ``predict``.
        features: DataFrame to sample the row from. Defaults to the
            notebook-level ``features_test``.
        target: Labels aligned positionally with ``features``. Defaults to the
            notebook-level ``target_test``.

    Returns:
        tuple: ``(prediction, test_groundtruth)`` — the estimator's output for
        the single sampled row and the label stored at the same position.

    Raises:
        ValueError: If ``features`` has no rows (nothing to sample from).
    """
    if features is None:
        features = features_test
    if target is None:
        target = target_test

    # Sample over the real number of rows instead of a hard-coded upper bound,
    # so every row is reachable and the index can never run past the end.
    rand_index = random.randrange(len(features))
    # Double brackets keep a one-row DataFrame, so the estimator receives the
    # same column names it was fitted with.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = target.iloc[rand_index]
    prediction = sgb_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Spot-check a single random test row: print the predicted label followed by
# the ground-truth label so they can be compared by eye.
prediction, test_groundtruth = make_test_prediction(sgb_classifier)
print (prediction)
print (test_groundtruth)

[0.]
0.0




**Conclusion:**

- simple SGB: Best reachable Accuracy:  87.33042784359463%, with max_iter= 74
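- Using a Pipeline/StandardScaler: Best reachable Accuracy:  100%, with max_iter = 11
- **TODO (verify):** the metrics cell above prints an accuracy of about 0.8985 for the pipeline, which does not match the 100 % stated here; also, the parameter tuned in this section is `n_estimators`, not `max_iter`.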