In [None]:
import warnings
import random

import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import ConvergenceWarning

from collections import Counter

# Suppress scikit-learn's ConvergenceWarning for the whole notebook; the
# hyperparameter searches further down fit models with small iteration caps.
warnings.simplefilter('ignore', ConvergenceWarning)

Notes: 
- For the final presentation, maybe mention when each algorithm was developed?

TODO:
- update the performance conclusions automatically
- figure out how to input a single "row" and predict its outcome (as if predicting from one user's data)
- prediction works with features_test, but not with test_row => figure out what type test_row needs to be
- why is the prediction accuracy 100% for some models? Not good (maybe fixed once manual prediction works?)

# **0. Data Prep**

In [None]:
# Load the cleaned dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='./data/clean_data.csv')

In [None]:
# Clean-up pipeline:
#   1. drop the first column (presumably a saved index column - TODO confirm),
#   2. remove columns containing NaN values,
#   3. renumber the rows 0..n-1.
# reset_index(drop=True) is used for step 3 instead of
# set_axis(..., inplace=True): the `inplace` argument of set_axis was removed
# in pandas 2.0, so that call raises a TypeError on current pandas.
# NOTE: this cell overwrites `df`, so re-running it without re-running the
# load cell would drop one more leading column.
df = (
    df.drop(columns=df.columns[0])
      .dropna(axis=1)
      .reset_index(drop=True)
)

# print("No. of columns containing null values")
# print(len(df.columns[df.isna().any()]))

# print("No. of columns not containing null values")
# print(len(df.columns[df.notna().all()]))

# print("Total no. of columns in the dataframe")
# print(len(df.columns))

# separating the target ('diabetes') from the features
target = df['diabetes']
features = df.drop(columns=['diabetes'])

# splitting into training (80%) and test (20%) data with a fixed seed
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# print(features_train.shape)
# print(features_test.shape)
# print(target_train.shape)
# print(target_test.shape)

# **1. Support Vector Machine**

In [None]:
# Search for the max_iter value that gives the highest accuracy.

# One integer hyperparameter, drawn by hp.randint from 0..99 (upper bound exclusive).
SEARCH_SPACE = [hp.randint('max_iter', 100)]
# Evaluation budget, chosen to equal the size of the search space. Note that
# the sampler may propose the same value more than once, so this budget does
# not by itself guarantee that every candidate is tried.
MAX_EVALS = 100

### Optimization #############################################################################################################
def cost_function(max_iter):
    """Hyperopt objective: negative test accuracy of a StandardScaler + SVC pipeline.

    Parameters
    ----------
    max_iter : sequence
        The sampled point; its first element is used as SVC's ``max_iter``.

    Returns
    -------
    int or dict
        ``0`` when the sampled value is 0 (no model is fitted); otherwise a
        dict with ``'loss'`` set to minus the accuracy on the test split and
        ``'status'`` set to ``STATUS_OK``.
    """
    iteration_cap = max_iter[0]
    print(iteration_cap)
    if iteration_cap == 0:
        return 0

    # Unscaled alternative:
    # svm_classifier = svm.SVC(kernel = "linear", max_iter = max_iter)
    model = make_pipeline(StandardScaler(), svm.SVC(max_iter=iteration_cap))
    model.fit(features_train, target_train)
    accuracy = accuracy_score(target_test, model.predict(features_test))

    print(f"Accuracy : {100 * accuracy}")
    # fmin minimises the loss, so the accuracy is negated.
    return {'loss': -accuracy, 'status': STATUS_OK}

# Run the TPE search and print the best hyperparameter assignment returned by fmin.
trials = Trials()
best = fmin(
    fn=cost_function,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
)

print(best)

In [None]:
# Predicting for one specific 'person' with the max_iter determined by the search ###########################################

# see the distribution of 1 & 0 in the test data
print(Counter(target_test))

# Take the optimum straight from the fmin result instead of a hand-copied
# constant. This needs the SVM search cell to be the most recently executed
# search, so that `best` holds a 'max_iter' entry (a KeyError here means
# `best` comes from a different section).
max_iter = int(best['max_iter'])
svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter)).fit(features_train, target_train)

# taking a random row from the test data to predict a result for:
def make_test_prediction(svm_classifier, features=None, targets=None):
    """Predict the label of one randomly chosen sample and return it with the truth.

    Parameters
    ----------
    svm_classifier : fitted estimator
        Any object with a ``predict`` method accepting a one-row DataFrame.
    features : pandas.DataFrame, optional
        Samples to draw from; defaults to the notebook-level ``features_test``.
    targets : pandas.Series, optional
        Labels aligned positionally with ``features``; defaults to the
        notebook-level ``target_test``.

    Returns
    -------
    tuple
        ``(prediction, test_groundtruth)`` where ``prediction`` is whatever
        ``predict`` returns for the single row and ``test_groundtruth`` is the
        label at the same position.

    Raises
    ------
    ValueError
        If ``features`` is empty (there is no row to draw).
    """
    if features is None:
        features = features_test
    if targets is None:
        targets = target_test

    # Bound the index by the actual number of rows; a fixed upper bound
    # breaks as soon as the split size differs.
    rand_index = random.randint(0, len(features) - 1)
    # Double brackets keep the selection a one-row DataFrame: estimators
    # expect 2-D input, and a plain .iloc[i] would yield a 1-D Series.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = targets.iloc[rand_index]
    prediction = svm_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Show the prediction for one random test sample, followed by its true label.
prediction, test_groundtruth = make_test_prediction(svm_classifier)
print(prediction, test_groundtruth, sep="\n")

# Open question: what type does test_row need to be?!
# (scikit-learn's predict expects 2-D input, one row per sample.)

**Conclusion:**


- simple SVM: best reachable accuracy: 86.80559 %, with max_iter = 36
- using a Pipeline/StandardScaler: best reachable accuracy: 99.66852863544289 %, with max_iter = 93

# **2. Stochastic Gradient Descent**

In [None]:
# Search for the max_iter value that gives the highest accuracy.

# One integer hyperparameter, drawn by hp.randint from 0..99 (upper bound exclusive).
SEARCH_SPACE = [hp.randint('max_iter', 100)]
# Evaluation budget, chosen to equal the size of the search space. Note that
# the sampler may propose the same value more than once, so this budget does
# not by itself guarantee that every candidate is tried.
MAX_EVALS = 100

### Optimization #############################################################################################################
def cost_function(max_iter):
    """Hyperopt objective: negative test accuracy of an SGDClassifier.

    Parameters
    ----------
    max_iter : sequence
        The sampled point; its first element is used as the classifier's
        ``max_iter``.

    Returns
    -------
    int or dict
        ``0`` when the sampled value is 0 (no model is fitted); otherwise a
        dict with ``'loss'`` set to minus the accuracy on the test split and
        ``'status'`` set to ``STATUS_OK``.
    """
    max_iter = max_iter[0]
    if max_iter == 0:
        return 0
    print(max_iter)
    # Scaled alternative:
    # sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(max_iter=max_iter)).fit(features_train, target_train)
    # A fixed random_state makes the objective deterministic: without it, SGD's
    # shuffling gives a different accuracy for the same max_iter on every call,
    # so the search result is not reproducible.
    sgd_classifier = SGDClassifier(max_iter=max_iter, random_state=42).fit(features_train, target_train)
    sgd_predictions = sgd_classifier.predict(features_test)
    sgd_accuracy = accuracy_score(target_test, sgd_predictions)

    print(f"Accuracy : {100 * sgd_accuracy}")
    # fmin minimises the loss, so the accuracy is negated.
    return {'loss': - sgd_accuracy , 'status': STATUS_OK }

# Run the TPE search and print the best hyperparameter assignment returned by fmin.
trials = Trials()
best = fmin(
    fn=cost_function,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
)

print(best)

In [None]:
# Predicting for one specific 'person' with the max_iter determined by the search ###########################################

# see the distribution of 1 & 0 in the test data
print(Counter(target_test))

# Take the optimum straight from the fmin result instead of a hand-copied
# constant. This needs the SGD search cell to be the most recently executed
# search, so that `best` holds a 'max_iter' entry (a KeyError here means
# `best` comes from a different section).
max_iter = int(best['max_iter'])
# random_state is fixed so that re-running this cell yields the same model.
sgd_classifier = SGDClassifier(max_iter=max_iter, random_state=42).fit(features_train, target_train)

# taking a random row from the test data to predict a result for:
def make_test_prediction(sgd_classifier, features=None, targets=None):
    """Predict the label of one randomly chosen sample and return it with the truth.

    Parameters
    ----------
    sgd_classifier : fitted estimator
        Any object with a ``predict`` method accepting a one-row DataFrame.
    features : pandas.DataFrame, optional
        Samples to draw from; defaults to the notebook-level ``features_test``.
    targets : pandas.Series, optional
        Labels aligned positionally with ``features``; defaults to the
        notebook-level ``target_test``.

    Returns
    -------
    tuple
        ``(prediction, test_groundtruth)`` where ``prediction`` is whatever
        ``predict`` returns for the single row and ``test_groundtruth`` is the
        label at the same position.

    Raises
    ------
    ValueError
        If ``features`` is empty (there is no row to draw).
    """
    if features is None:
        features = features_test
    if targets is None:
        targets = target_test

    # Bound the index by the actual number of rows; a fixed upper bound
    # breaks as soon as the split size differs.
    rand_index = random.randint(0, len(features) - 1)
    # Double brackets keep the selection a one-row DataFrame: estimators
    # expect 2-D input, and a plain .iloc[i] would yield a 1-D Series.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = targets.iloc[rand_index]
    prediction = sgd_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Show the prediction for one random test sample, followed by its true label.
prediction, test_groundtruth = make_test_prediction(sgd_classifier)
print(prediction, test_groundtruth, sep="\n")

# Open question: what type does test_row need to be?!
# (scikit-learn's predict expects 2-D input, one row per sample.)

**Conclusion:**

- simple SGD: best reachable accuracy: 87.33042784359463 %, with max_iter = 74
- using a Pipeline/StandardScaler: best reachable accuracy: 100 %, with max_iter = 11

# **3. Stochastic Gradient Boosting**

In [None]:
# Search for the max_depth value that gives the highest accuracy.

# One integer hyperparameter, drawn by hp.randint from 0..99 (upper bound exclusive).
SEARCH_SPACE = [hp.randint('max_depth', 100)]
# Evaluation budget, chosen to equal the size of the search space. Note that
# the sampler may propose the same value more than once, so this budget does
# not by itself guarantee that every candidate is tried.
MAX_EVALS = 100

### Optimization #############################################################################################################
def cost_function(max_depth):
    """Hyperopt objective: negative test accuracy of a GradientBoostingClassifier.

    Parameters
    ----------
    max_depth : sequence
        The sampled point; its first element is used as the classifier's
        ``max_depth``.

    Returns
    -------
    int or dict
        ``0`` when the sampled value is 0 (no model is fitted); otherwise a
        dict with ``'loss'`` set to minus the accuracy on the test split and
        ``'status'`` set to ``STATUS_OK``.
    """
    max_depth = max_depth[0]
    if max_depth == 0:
        return 0
    print(max_depth)
    # Scaled alternative:
    # sgb_classifier = make_pipeline(StandardScaler(), GradientBoostingClassifier(n_estimators=10, learning_rate=0.5, max_depth=max_depth, random_state=0)).fit(features_train, target_train)
    # random_state is fixed so that the same max_depth always yields the same
    # score, which keeps the search reproducible.
    sgb_classifier = GradientBoostingClassifier(
        n_estimators=10, learning_rate=0.5, max_depth=max_depth, random_state=0
    ).fit(features_train, target_train)
    sgb_predictions = sgb_classifier.predict(features_test)
    sgb_accuracy = accuracy_score(target_test, sgb_predictions)
    # Tally of correct (True) vs. incorrect (False) test predictions.
    comp = target_test == sgb_predictions
    print(Counter(comp))
    print(f"Accuracy : {100 * sgb_accuracy}")
    # fmin minimises the loss, so the accuracy is negated.
    return {'loss': - sgb_accuracy , 'status': STATUS_OK }

# Run the TPE search and print the best hyperparameter assignment returned by fmin.
trials = Trials()
best = fmin(
    fn=cost_function,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
)

print(best)

In [None]:
# Predicting for one specific 'person' with the max_depth determined by the search ##########################################

# see the distribution of 1 & 0 in the test data
print(Counter(target_test))

# Take the optimum straight from the fmin result instead of a hand-copied
# constant. This needs the gradient-boosting search cell to be the most
# recently executed search, so that `best` holds a 'max_depth' entry (a
# KeyError here means `best` comes from a different section).
max_depth = int(best['max_depth'])
# random_state is fixed so that re-running this cell yields the same model.
sgb_classifier = GradientBoostingClassifier(
    n_estimators=10, learning_rate=0.5, max_depth=max_depth, random_state=0
).fit(features_train, target_train)


# taking a random row from the test data to predict a result for:
def make_test_prediction(sgb_classifier, features=None, targets=None):
    """Predict the label of one randomly chosen sample and return it with the truth.

    Parameters
    ----------
    sgb_classifier : fitted estimator
        Any object with a ``predict`` method accepting a one-row DataFrame.
    features : pandas.DataFrame, optional
        Samples to draw from; defaults to the notebook-level ``features_test``.
    targets : pandas.Series, optional
        Labels aligned positionally with ``features``; defaults to the
        notebook-level ``target_test``.

    Returns
    -------
    tuple
        ``(prediction, test_groundtruth)`` where ``prediction`` is whatever
        ``predict`` returns for the single row and ``test_groundtruth`` is the
        label at the same position.

    Raises
    ------
    ValueError
        If ``features`` is empty (there is no row to draw).
    """
    if features is None:
        features = features_test
    if targets is None:
        targets = target_test

    # Bound the index by the actual number of rows; a fixed upper bound
    # breaks as soon as the split size differs.
    rand_index = random.randint(0, len(features) - 1)
    # Double brackets keep the selection a one-row DataFrame: estimators
    # expect 2-D input, and a plain .iloc[i] would yield a 1-D Series.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = targets.iloc[rand_index]
    prediction = sgb_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Show the prediction for one random test sample, followed by its true label.
prediction, test_groundtruth = make_test_prediction(sgb_classifier)
print(prediction, test_groundtruth, sep="\n")

# Open question: what type does test_row need to be?!
# (scikit-learn's predict expects 2-D input, one row per sample.)

**Conclusion:**

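- simple SGB: best reachable accuracy: 87.33042784359463 %, with max_iter = 74
- using a Pipeline/StandardScaler: best reachable accuracy: 100 %, with max_iter = 11
- TODO (verify): these figures are identical to the SGD conclusion above, and the parameter tuned in this section is max_depth, not max_iter; they may be a copy-paste leftover.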