## Logistic Model

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from scipy import stats

# Notebook-wide display settings: show up to 500 columns but only 10 rows of a frame.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10)

# Use the bundled "fivethirtyeight" matplotlib style sheet for every figure.
plt.style.use('fivethirtyeight')

In [2]:
data = pd.read_csv('final_data/clean_english_data.csv')

# Keep only respondents of the two nationalities being compared, then drop every
# row that still has a missing value in any column. The resulting row set is the
# same as blanking the other nationalities to None and calling dropna().
# .copy() makes tmp_nation independent of `data` before columns are reassigned.
tmp_nation = (
    data.loc[data['nationality'].isin(["Stati Uniti d'America", "Italia"])]
    .dropna()
    .copy()
)

# Integer-encode every text column. Columns are detected by dtype rather than by
# probing the value stored at one hard-coded row label, which raises KeyError if
# that row is filtered out and misclassifies a column whose probed value is not a str.
# Codes are computed on this filtered frame only, so they are not guaranteed to
# match the codes later assigned to the same categories in `data`.
# NOTE(review): pandas orders string categories lexically by default, so
# 'nationality' should come out as Italia -> 0, Stati Uniti d'America -> 1 — confirm.
for col in tmp_nation.columns:
    if (pd.api.types.is_object_dtype(tmp_nation[col])
            or pd.api.types.is_string_dtype(tmp_nation[col])):
        tmp_nation[col] = tmp_nation[col].astype('category').cat.codes

In [3]:
# Integer-encode every text column of the full data set in place.
# Columns are detected by dtype instead of inspecting the value at row label 22:
# `data` still contains missing values, so a NaN (or a missing label) at that one
# row would make the probe skip a text column or raise KeyError.
# pandas encodes missing values as code -1.
# NOTE(review): this overwrites the raw strings in `data`, so the cell that builds
# tmp_nation from `data` cannot be re-run afterwards without reloading the CSV.
for col in data.columns:
    if (pd.api.types.is_object_dtype(data[col])
            or pd.api.types.is_string_dtype(data[col])):
        data[col] = data[col].astype('category').cat.codes
        


In [45]:
# One-way ANOVA on 'diet health score' between the two encoded nationality groups.
# NOTE(review): presumably 0 = Italia and 1 = Stati Uniti d'America (category codes
# from the encoding cell) — confirm before interpreting the direction of the effect.
stats.f_oneway(
    tmp_nation.loc[tmp_nation['nationality'] == 0, 'diet health score'],
    tmp_nation.loc[tmp_nation['nationality'] == 1, 'diet health score'],
)

F_onewayResult(statistic=16.124789941699106, pvalue=6.440967746208372e-05)

In [53]:
# Distribution of the encoded 'sex' column within nationality group 0.
tmp_nation.loc[tmp_nation['nationality'] == 0, 'sex'].value_counts()

0    190
1    108
Name: sex, dtype: int64

In [5]:
# Target: encoded nationality. Features: every other column except 'region'.
y = tmp_nation['nationality']
x = tmp_nation.drop(['nationality', 'region'], axis=1)

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)


In [6]:
# L2-regularised logistic regression on all features, with a raised iteration cap
# for the lbfgs solver.
# `multi_class` is intentionally left at its default: passing it explicitly is
# deprecated in scikit-learn >= 1.5 (FutureWarning, slated for removal), and for
# this two-class target the fitted model is the same.
logisticmodel = LogisticRegression(solver='lbfgs', max_iter=3000)

In [7]:
# Train on the training split; as the cell's last expression, the fitted
# estimator is also displayed.
logisticmodel.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=3000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Predicted nationality codes for the held-out rows.
predictions = logisticmodel.predict(X=x_test)

In [9]:
# Share of held-out rows classified correctly. Arguments follow the
# (y_true, y_pred) order of the sklearn signature, matching the other metric
# calls in this notebook; accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, predictions)

0.9273356401384083

In [10]:
# R squared of the predicted class labels against the true labels.
# NOTE(review): r2_score is a regression metric; on 0/1 class labels it is hard to
# interpret — consider a classification metric alongside it.
r2_score(y_true=y_test, y_pred=predictions)

0.688353702372394

In [41]:
# Map each feature name to its coefficient in the fitted multi-feature model ...
diz = dict(zip(x_test.columns, logisticmodel.coef_[0]))
# ... then rebuild the dict ordered from the most negative to the most positive coefficient.
diz = dict(sorted(diz.items(), key=lambda pair: pair[1]))
diz
# Sex is unbalanced, but it shouldn't give us issues

{'sex': -1.5306350332651302,
 'smoke': -1.336337444222233,
 'food shopping': -1.2996823376065814,
 'food weigh': -1.1396816906795277,
 'sport diet': -0.8908584751871468,
 'bike': -0.7275919308473853,
 'qualification': -0.6476897650360401,
 'sport': -0.5565943086330453,
 'city': -0.5292065724252585,
 'fruits morning': -0.4953584669755515,
 'drugs kind': -0.4753928024334587,
 'which meals skip': -0.40648224161661517,
 'fish': -0.3131730361515622,
 'which values': -0.28591996723584784,
 'smoke begin': -0.2766623788001715,
 'drinks': -0.24893366683995433,
 'working status': -0.23320110650093234,
 'cooking': -0.2025647200373169,
 'smoke object': -0.19366906967188816,
 'diet health score': -0.1872886650450264,
 'bed time': -0.1697277855201535,
 'carbohidrates': -0.169077579909588,
 'work transportation': -0.1166615328287708,
 'frozen food a month': -0.11454513913549495,
 'wine': -0.0710788893472981,
 'delivered food a month': -0.049137056744920325,
 'housemates': -0.04763921373130718,
 'age'

In [106]:
def print_logreg_r2():
    """Fit one single-feature logistic regression per column and print its metrics.

    For every column of the notebook-level frame ``x``, a fresh
    ``LogisticRegression`` is trained on that column of ``x_train`` against
    ``y_train`` and evaluated on the same column of ``x_test`` against ``y_test``.
    For each column the function prints the column name, the fitted slope
    (rounded to 4 decimals), the intercept (rounded to 5 decimals), and the
    R squared and accuracy of the test predictions (both rounded to 4 decimals).

    Reads the notebook globals ``x``, ``x_train``, ``x_test``, ``y_train`` and
    ``y_test``; returns ``None``.

    NOTE(review): R squared is a regression metric; on class labels it mostly
    mirrors accuracy and can be negative, so accuracy is the more meaningful figure.
    """
    for col in x.columns:
        # Double brackets select a one-column DataFrame, which is already the 2-D
        # shape sklearn expects, so no manual .values.reshape(-1, 1) is needed.
        train_feature = x_train[[col]]
        test_feature = x_test[[col]]

        # `multi_class` is left at its default: passing it explicitly is deprecated
        # in scikit-learn >= 1.5, and the fit is the same for a two-class target.
        logisticmodel_ = LogisticRegression(solver='lbfgs', max_iter=300, random_state=42)
        logisticmodel_.fit(train_feature, y_train)

        # A single feature and a binary target give exactly one slope and one intercept.
        coef = round(logisticmodel_.coef_[0][0], 4)
        inter = round(logisticmodel_.intercept_[0], 5)

        pred = logisticmodel_.predict(test_feature)

        print(col,
              "\nSlope: " + str(coef),
              "\nIntercept: " + str(inter),
              "\nR squared: " + str(round(r2_score(y_test, pred), 4)),
              "\nAccuracy " + str(round(accuracy_score(y_test, pred), 4)) +
              "\n \n")
        


In [107]:
print_logreg_r2()

sex 
Slope: -0.0343 
Intercept: 0.7358 
R squared: -0.5879 
Accuracy 0.6298
 

age 
Slope: -0.0128 
Intercept: 1.07578 
R squared: -0.5879 
Accuracy 0.6298
 

housemates 
Slope: -0.0819 
Intercept: 0.87508 
R squared: -0.5879 
Accuracy 0.6298
 

city 
Slope: -1.0166 
Intercept: 1.70191 
R squared: -0.2911 
Accuracy 0.699
 

qualification 
Slope: -0.5954 
Intercept: 1.6096 
R squared: -0.766 
Accuracy 0.5882
 

working status 
Slope: -0.1728 
Intercept: 1.13856 
R squared: -0.7363 
Accuracy 0.5952
 

weight 
Slope: 0.0435 
Intercept: -2.27205 
R squared: -0.4395 
Accuracy 0.6644
 

bed time 
Slope: -0.3896 
Intercept: 1.26698 
R squared: -0.5879 
Accuracy 0.6298
 

sleeping hours 
Slope: 0.1242 
Intercept: 0.54415 
R squared: -0.5879 
Accuracy 0.6298
 

sleep importance 
Slope: 0.0208 
Intercept: -1.12747 
R squared: -0.5434 
Accuracy 0.6401
 

smoke 
Slope: -0.5907 
Intercept: 0.97723 
R squared: -0.3208 
Accuracy 0.692
 

smoke begin 
Slope: 0.7195 
Intercept: -0.56204 
R squared: -0.

In [127]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default hyper-parameters, trained on the same split
# as the logistic model for comparison.
model = XGBClassifier().fit(x_train, y_train)

# Overwrites the logistic model's `predictions` with the XGBoost predictions.
predictions = model.predict(x_test)

# Test-set accuracy of the boosted model.
accuracy_score(y_test, predictions)

0.9377162629757786