In [72]:
import pandas as pd
import numpy as np
from sklearn import tree
import statsmodels.api as sm
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

In [44]:
df = pd.read_csv('/Users/ryanc/Desktop/ObesityDataSet_raw_and_data_sinthetic.csv')
weight_scale = {'Insufficient_Weight':0, 'Normal_Weight':1, 'Overweight_Level_I':2, 'Overweight_Level_II':3, 'Obesity_Type_I':4, 'Obesity_Type_II':5, 'Obesity_Type_III':6 }
likert_scale = {'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3}
dummies = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']
df['NObeyesdad'] = df['NObeyesdad'].replace(weight_scale)
df['CAEC'] = df['CAEC'].replace(likert_scale)
df['CALC'] = df['CALC'].replace(likert_scale)
df = df.join(pd.get_dummies(df[dummies]))
df.drop(dummies, axis = 1, inplace = True)
target = df.iloc[:,10].copy()
df.drop('NObeyesdad', axis = 1, inplace = True)

In [45]:
dt = tree.DecisionTreeClassifier()
gnb = GaussianNB()
lr = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)
lab_enc = preprocessing.LabelEncoder()

In [46]:
def classification_results(y_true, y_pred, model):

    
    precision = metrics.precision_score(y_true, y_pred, average = 'micro')
    recall = metrics.recall_score(y_true, y_pred, average = 'micro')

    print('precision score: ', precision)
    print('recall score: ', recall)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': list(range(2, 5))}
grid = GridSearchCV(dt, parameters, cv = kf, scoring = 'precision_micro')
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Decision Tree')
classification_results(y_true, y_pred, 'dt')

Decision Tree
precision score:  0.9321766561514195
recall score:  0.9321766561514195


In [87]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': list(range(2, 5))}
grid = GridSearchCV(dt, parameters, cv = kf, scoring = 'recall_micro')
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Decision Tree')
classification_results(y_true, y_pred, 'dt')

Decision Tree
precision score:  0.9321766561514195
recall score:  0.9321766561514195


In [54]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'var_smoothing': np.logspace(0, -9, num=20)}
grid = GridSearchCV(gnb, parameters, cv = kf, scoring = 'recall_micro')
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Naive Bayes')
classification_results(y_true, y_pred, 'gnb')

Naive Bayes
precision score:  0.6845425867507886
recall score:  0.6845425867507886


In [50]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'var_smoothing': np.logspace(0, -9, num=20)}
grid = GridSearchCV(gnb, parameters, cv = kf, scoring = 'precision_micro')
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Naive Bayes')
classification_results(y_true, y_pred, 'gnb')

Naive Bayes
precision score:  0.6845425867507886
recall score:  0.6845425867507886


In [51]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(lr, parameters, cv = kf, scoring = 'precision_micro')
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
y_train = preprocessing.scale(y_train)
y_test = preprocessing.scale(y_test)
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.fit_transform(y_test)
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Logistic Regression')
classification_results(y_true, y_pred, 'lr')

Logistic Regression
precision score:  0.9479495268138801
recall score:  0.9479495268138801


In [52]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = .3, random_state = 0)
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(lr, parameters, cv = kf, scoring = 'recall_micro')
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
y_train = preprocessing.scale(y_train)
y_test = preprocessing.scale(y_test)
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.fit_transform(y_test)
grid.fit(X_train, y_train)
model = grid.best_estimator_
y_pred = model.predict(X_test)
y_true = y_test
print('Logistic Regression')
classification_results(y_true, y_pred, 'lr')

Logistic Regression
precision score:  0.9479495268138801
recall score:  0.9479495268138801
