In [6]:
import  numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz 

from sklearn.model_selection import train_test_split#
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score


px_template = "simple white"

In [7]:
# df = pd.read_csv('./data/DIABETE4_health_indicators_BRFSS2015.csv')

# **1. model**

In [8]:
df = pd.read_csv('./data/clean_data.csv')

In [9]:
df = df.drop(df.columns[0],axis = 1)
# df

In [10]:
# search for columns containing 'DIABETE' to find target var

diabate_cols = [col for col in df.columns if 'DIABETE' in col]
diabate_cols.append([col for col in df.columns if 'diabete' in col])
print(diabate_cols)

['DIABETE4', ['diabetes']]


In [11]:
# remove colums containing string values
df = df.dropna(axis=1)
df.shape

print("No. of columns containing null values")
print(len(df.columns[df.isna().any()]))

print("No. of columns not containing null values")
print(len(df.columns[df.notna().all()]))

print("Total no. of columns in the dataframe")
print(len(df.columns))

No. of columns containing null values
0
No. of columns not containing null values
176
Total no. of columns in the dataframe
176


In [12]:
# removing target var from feature list
target = df['diabetes']
# target = target.to_frame()
features = df.drop(['diabetes'],axis=1)

In [13]:
# splitting into training and test data

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [14]:
print(features_train.shape)
print(features_test.shape)
print(target_train.shape)
print(target_test.shape)

(349323, 175)
(87331, 175)
(349323,)
(87331,)


In [15]:
# training a model
model = tree.DecisionTreeClassifier()
model = model.fit(features_train, target_train)

In [16]:
# tree vizualisation

fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(model, 
                   feature_names=features_train.columns,  
                   class_names='diabetes',
                   filled=True)

In [None]:
predictions = model.predict(features_test)

In [None]:
print(predictions)
predictions.shape

In [None]:
accuracy = accuracy_score(target_test, predictions)
acc_percentage = accuracy * 100
acc_percentage

In [None]:
# training a cnn 
# model = MLPClassifier(hidden_layer_sizes=(50,20, 10), random_state=1,
#               solver='adam')
# model.fit(features_train, target_train)

In [None]:
# fixing dimensionality 
# print(target_train.shape)
# print(features_train.shape)

In [None]:
# predicting for test data
# predictions = model.predict([features_test])
# predictions

# **3.Hyperopt**

In [None]:
df = pd.read_csv('./df/DIABETE4_health_indicators_BRFSS2015.csv')
features = df.drop('DIABETE4',axis=1)
groundtruth = df['DIABETE4']

In [None]:
#target variable DIABETE4:
#- 0 : no diabetes/ only during pregnancy
#- 1 : prediabetes
#- 2 : diabetes

### Classifier: #######################################################################################################

def no_diabetes_classifier(groundtruth) -> bool:
    if (groundtruth == 0):
        return (True)
    else: 
        return False

def diabetes_classifier(groundtruth) -> bool:     
    if (groundtruth == 2.0):
        return(True) 
    else:
        return(False)

def classifier(features):
    predictions = []
    for i in range(0, len(features)):
        # is it diabetes?
        if ((diabetes_classifier(features[i]))):
            predictions.append(2.0)
        # is it no diabetes?
        elif (no_diabetes_classifier(features[i])):
            predictions.append(0.0)
        # its neither, so patient is at risk
        else: predictions.append(1.0)
    # print(predictions)
    return predictions

In [None]:
# define performance metrics (True negatives, false negatives, true positives, false positives, recall, specificity, precision)
    # recall = Sensitifity = True Positive Rate
    # specificity = True Negative Rate
    # precision = TP / (TP + FP)

def calc_metrics(predictions, groundtruth):
    # predictions = predictions
    TP_no_diabetes = TN_prediabetes = TN_diabetes = TN_no_diabetes = TN_diabetes = TP_prediabetes = TP_diabetes = FP_no_diabetes = FN_diabetes = FN_prediabetes = FP_diabetes = FP_prediabetes = FN_no_diabetes = 0
    for i in range(0, len(predictions)):
        if (predictions[i] == groundtruth[i]):
            if (predictions[i] == 0):
                TP_no_diabetes += 1
                TN_prediabetes += 1
                TN_diabetes += 1
            elif (predictions[i] ==1):
                TP_prediabetes += 1
                TN_diabetes += 1
                TN_no_diabetes += 1
            else : 
                TP_diabetes += 1
                TN_no_diabetes += 1
                TN_prediabetes += 1
                
        elif (predictions[i] == 0):
            if (groundtruth[i] == 1):
                FP_no_diabetes += 1
                FN_prediabetes += 1
                TN_diabetes += 1
            elif (groundtruth[i] == 2):
                FP_no_diabetes += 1
                FN_diabetes += 1
                TN_prediabetes += 1
                
        elif (predictions[i] == 1):
            if (groundtruth[i] == 0):
                FP_prediabetes += 1
                FN_no_diabetes += 1
                TN_diabetes += 1
            elif(groundtruth[i] == 2):
                FP_prediabetes += 1
                FN_diabetes += 1
                TN_no_diabetes+= 1
                
        elif (predictions[i] == 2):
            if (groundtruth[i] == 0):
                FP_diabetes += 1
                FN_no_diabetes += 1
                TN_prediabetes += 1
            elif(groundtruth[i] == 1):
                FP_diabetes += 1
                FN_prediabetes += 1
                TN_no_diabetes += 1
                
    TP_total = TP_diabetes + TP_no_diabetes + TP_prediabetes 
    TN_total = TN_diabetes + TN_no_diabetes + TN_prediabetes
    FP_total = FP_diabetes + FP_no_diabetes + FP_prediabetes
    FN_total = FN_diabetes + FN_no_diabetes + FN_prediabetes 
    recall = TP_total/ (TP_total + FN_total)
    specificity =  TN_total / (TN_total + FP_total)
    precision = (TP_total / (TP_total + FP_total))
    return (recall, specificity, precision)


recall, specificity, precision = calc_metrics(predictions, groundtruth)
print (recall, specificity, precision)
    

In [None]:
# Hyperopt for number of optimal components with most information/ number ob components

### Constants ##############################################################################################################

# amount of trials
MAX_EVALS = 3

# One 'run' equals one fmin-execution where each run for a number of x trials
SEARCH_SPACE = [hp.uniform('default', 0, 1)]

### Optimizaion ##############################################################################################################

def cost_function(features):
    predictions = classifier(groundtruth)
    recall, specifity, precision = calc_metrics(predictions, groundtruth)
    # print(f"recall: {recall}")
    # print(f"specificity: {specifity}")
    # specifity is maximized (= minimizes not recognized diabetes cases)
    print(f"Specifity (TN-Rate): {specifity}")
    return {'loss': - specifity , 'status': STATUS_OK }

trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS, 
    trials = trials)

print(best)

# 4. Optimal Component configuration:

In [None]:
import  numpy as np
import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [None]:
df = pd.read_csv('./df/DIABETE4_health_indicators_BRFSS2015.csv')
features = df.drop('DIABETE4',axis=1)
groundtruth = df['DIABETE4']

In [None]:
print(pd.unique(groundtruth))

In [None]:
# Standartizing and normalizing df
feature_names = list(features.columns)

x = df.loc[:,feature_names].values
x = StandardScaler().fit_transform(x)
# np.mean(x), np.std(x)

## Format back into dfframe
feature_columns = ['feature_names' + str(i) for i in range (x.shape[1])]
normalized_df = pd.dfFrame(x, columns=feature_columns)

In [None]:
#to wich number of features should be reduced? (max 79)
n_components = 5 

# actual PCA
pca_df = PCA(n_components=n_components)
principal_components = pca_df.fit_transform(x)

column_names = []
for i in range (0, n_components):
    count = str(i)
    column_names.append('PC'+count)

column_names = np.array(column_names)
print(column_names)

components_DF = pd.dfFrame(df = principal_components, columns=column_names)

In [None]:
# Creating dfframe with percentage each component adds (output is sorted)
explained_var = pca_df.explained_variance_ratio_
explained_percentage = 0

explained_var_per = []
for i in range (len(explained_var)):
    explained_percentage += (explained_var[i] * 100)   
    explained_var_per.append(explained_var[i] *100)

explained_var_df = pd.dfFrame(explained_var_per)

explained_var_df.columns =["% of Information"]
explained_var_df["feature_nr"] = column_names.tolist()
# explained_var_df
explained_percentage

In [None]:
### Constants ##############################################################################################################

# amount of trials
MAX_EVALS = 2
SEARCH_SPACE = [hp.uniform('number_of_components',0,20)]

### df preperation ##############################################################################################################

# Standartizing and normalizing df
def df_prep(df):
    feature_names = list(features.columns)
    x = df.loc[:,feature_names].values
    x = StandardScaler().fit_transform(x)
    return x

# PCA
def pca (number_of_components):
    pca_df = PCA(n_components=number_of_components)
    explained_percentage = 0
    explained_var_percentages = []
    column_names = []
    x = df_prep(df)
    principal_components = pca_df.fit_transform(x)
    
    for i in range (0, number_of_components):
        count = str(i)
        column_names.append('PC'+count)

    column_names = np.array(column_names)
    explained_var = pca_df.explained_variance_ratio_

    for i in range (len(explained_var)):
        explained_percentage += (explained_var[i] * 100)   
        explained_var_percentages.append(explained_var[i] *100)

    explained_var_df = pd.dfFrame(explained_var_percentages)
    explained_var_df.columns =["% of Information"]
    explained_var_df["feature_nr"] = column_names.tolist()
    
    return explained_percentage, number_of_components


### Optimizaion ##############################################################################################################

def cost_function_pca(number_of_components):
    number_of_components = int(number_of_components[0])
    #print(number_of_components)
    explained_percentage, number_of_components = pca(number_of_components)
    information_component_ratio = (explained_percentage / number_of_components)
    return {'loss': - information_component_ratio , 'status': STATUS_OK }

trials = Trials()
best = fmin(cost_function_pca,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS, 
    trials = trials)

def cleanup_loss():
    raw_loss = float(trials.best_trial['result']['loss'])
    #raw_loss.dtype
    print(raw_loss)
    n = int(trials.best_trial['misc']['vals']['number_of_components'])
    print (n)
    clean__reached_percentage = raw_loss * n

    return clean__reached_percentage



perc = cleanup_loss()
print(trials.best_trial)
print("percentage for {n} coponents: {perc}")


In [None]:
### Constants ##############################################################################################################
MAX_EVALS = 3
n_components_fixed = 10

### df preperation ##############################################################################################################

# Standartizing and normalizing df

def df_prep():
    feature_names = list(features.columns)

    x = df.loc[:,feature_names].values
    x = StandardScaler().fit_transform(x)

    feature_columns = ['feature_names' + str(i) for i in range (x.shape[1])]
    normalized_df = pd.dfFrame(x, columns=feature_columns)

    return feature_names, feature_columns


# actual PCA

def pca (n_components):
    pca_df = PCA(n_components=n_components)
    feature_names, feature_columns = df_prep()
    principal_components = pca_df.fit_transform(x)

    column_names = []
    for i in range (0, n_components):
        count = str(i)
        column_names.append('PC'+count)

    column_names = np.array(column_names)
    #components_DF = pd.dfFrame(df = principal_components, columns=column_names)
    explained_var = pca_df.explained_variance_ratio_

    explained_var_percentage = []
    explained_percentage = 0
    for i in range (len(explained_var)):
        explained_percentage += (explained_var[i] * 100)   
        explained_var_percentage.append(explained_var[i] *100)

    explained_var_df = pd.dfFrame(explained_var_percentage)

    explained_var_df.columns =["% of Information"]
    explained_var_df["feature_nr"] = column_names.tolist()
    
    return explained_percentage, n_components


### Optimizaion ##############################################################################################################

def cost_function_pca(n_components_fixed):
    n_components = n_components_fixed
    explained_percentage, n_components = pca(n_components)
    information_component_ratio = (explained_percentage / n_components)
    return {'loss': - information_component_ratio , 'status': STATUS_OK }

loss = cost_function_pca(n_components_fixed)
print(loss)