In [1]:
# Define Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split


In [2]:
#load file
df=pd.read_csv("food_coded.csv")
df.head(5)

#print(X.columns)
#print(y.columns)
#print(y.head(5))


# Simplying data due to low training data avaialble
    # Removing 2,6,7 categroy due to low count
    # Adding 8 to 4 because indian food also comes under the umbrella of Asian food
    # Removing 0 because they are undecided and have low counts

print(df.groupby(['fav_cuisine_coded']).fav_cuisine_coded.count())

df.loc[df['fav_cuisine_coded']==8] = 4
df=df[df['fav_cuisine_coded'].isin([1,2,4,5])]

print(df.groupby(['fav_cuisine_coded']).fav_cuisine_coded.count())

# Split it into input features and output 




fav_cuisine_coded
0     6
1    59
2    15
3     2
4    22
5    15
6     1
7     1
8     4
Name: fav_cuisine_coded, dtype: int64
fav_cuisine_coded
1    59
2    15
4    26
5    15
Name: fav_cuisine_coded, dtype: int64


In [3]:

#for (i,names) in enumerate(df.columns):
    #print (i," : ",names)
df_sel = df.iloc[:,[26,4,6,10,15,19,20,21,22,30,33,37,45,47,48,57,58,38,39,32,49]]
print(df_sel.columns)
print(df_sel.head(5))

Index(['fav_cuisine_coded', 'calories_day', 'coffee', 'cook', 'drink',
       'eating_out', 'employment', 'ethnic_food', 'exercise', 'fruit_day',
       'healthy_feeling', 'income', 'nutritional_check', 'parents_cook',
       'pay_meal_out', 'veggies_day', 'vitamins', 'indian_food',
       'italian_food', 'greek_food', 'persian_food'],
      dtype='object')
   fav_cuisine_coded  calories_day  coffee  cook  drink  eating_out  \
1                  1           3.0       2   3.0    2.0           2   
2                  1           4.0       2   1.0    1.0           2   
4                  1           2.0       2   1.0    2.0           2   
6                  4           3.0       2   2.0    1.0           2   
7                  5           3.0       1   3.0    2.0           2   

   employment  ethnic_food  exercise  fruit_day  ...  income  \
1         2.0            4       1.0          4  ...     4.0   
2         3.0            5       2.0          5  ...     6.0   
4         2.0        

In [7]:
#data cleaning

    # Taqi code will come here, i am just getting the column as it is 
    
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer.fit(df_sel)
X = imputer.transform(df_sel)
df_sel_clean = pd.DataFrame(X,columns = df_sel.columns)




In [8]:
#standardization

    # This will be done once data is cleaned

In [9]:
# Data Exploration
    # scatter plots
    # histograms
    # Box plot 
    # Line chart etc

# Just put whatever you think is related to our model. 


In [10]:
def correlation_matrix(df_1):
    from matplotlib.colors import ListedColormap
    
    corrMatrix = df.corr()
    fig, ax = plt.subplots(figsize=(30,30))
    #sns.heatmap(corrMatrix,cmap=ListedColormap(['green','green', 'yellow','red', 'red']), annot=True,linewidths=.5, ax=ax)
    sns.heatmap(corrMatrix, annot=True,linewidths=.5, ax=ax)   
    plt.show()

In [31]:
correlation_matrix(df_sel_clean)

In [11]:
def chi_square_feature_sel(df_1, dep_col_name,total_col_selected):
    df_train_chi_ind = df_1.loc[:, df_1.columns != dep_col_name]
    df_train_chi_dep = df_1.iloc[:,df_1.columns == dep_col_name]
    
    #rint(df_train_chi_dep.head(5))
    #rint(df_train_chi_ind.head(5))
    
    
    
    bestfeatures = SelectKBest(score_func=chi2, k=20)
    fit = bestfeatures.fit(df_train_chi_ind,df_train_chi_dep)
    
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(df_train_chi_ind.columns)
    
    #concat two dataframes for better visualization 
    
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    #print(featureScores.nlargest(20,'Score'))  #print 10 best features
    
    top_n_columns = featureScores.sort_values('Score',ascending=False).head(total_col_selected).Specs
    return top_n_columns

In [13]:
#feature selection
    # Chi-square (categorical variables)
    # Random Forest (overall)
    # Correlation (numerical variables)
    # use lasso as well 
# join multiple features


# Important step , need to iterate over and over to find suitable features

# Currently i am just hand-picking the features and then applying chi-square test and correlation to get top features 

top_n_cols = chi_square_feature_sel(df_sel_clean,'fav_cuisine_coded',10)
top_n_cols=list(top_n_cols)
top_n_cols.append('fav_cuisine_coded')

df_sel_features =  df_sel_clean[df_sel_clean.columns.intersection(top_n_cols)]

print(df_sel_features.columns)


Index(['fav_cuisine_coded', 'coffee', 'ethnic_food', 'exercise',
       'healthy_feeling', 'income', 'nutritional_check', 'parents_cook',
       'indian_food', 'greek_food', 'persian_food'],
      dtype='object')


In [15]:
#test-train split using stratification

# STRATIFIED SAMPLING 
X = df_sel_features.loc[:, df_sel_features.columns != 'fav_cuisine_coded']
y = df_sel_features[['fav_cuisine_coded']]

X_train, tempX_test, y_train, tempY_test = train_test_split(X, y,stratify=y,test_size=0.30)
x_val, x_test, y_val, y_test = train_test_split(tempX_test, tempY_test,stratify=tempY_test,test_size=0.64)

# Training Set : X_train, y_train
# Validation Set : x_val ,  y_val
# Testing Set : x_test , y_test


print(y_train.groupby(['fav_cuisine_coded']).fav_cuisine_coded.count())
print(y_test.groupby(['fav_cuisine_coded']).fav_cuisine_coded.count())
print(y_val.groupby(['fav_cuisine_coded']).fav_cuisine_coded.count())
print(X_train.head(2))

fav_cuisine_coded
1.0    41
2.0    11
4.0    18
5.0    10
Name: fav_cuisine_coded, dtype: int64
fav_cuisine_coded
1.0    12
2.0     3
4.0     5
5.0     3
Name: fav_cuisine_coded, dtype: int64
fav_cuisine_coded
1.0    6
2.0    1
4.0    3
5.0    2
Name: fav_cuisine_coded, dtype: int64
    coffee  ethnic_food  exercise  healthy_feeling  income  nutritional_check  \
80     2.0          5.0       1.0              7.0     2.0                5.0   
45     2.0          2.0       1.0              8.0     1.0                5.0   

    parents_cook  indian_food  greek_food  persian_food  
80           1.0          5.0         5.0           5.0  
45           1.0          1.0         1.0           1.0  


In [26]:
#model implementation
from sklearn.linear_model import LogisticRegression
softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10, random_state=42)
softmax_reg.fit(X_train, y_train)

c_est = softmax_reg.predict(x_val)
probs = softmax_reg.predict_proba(x_val)
print(c_est)

#print(x_val)
#print(y_val)

from sklearn.metrics import accuracy_score

accuracy_score(y_val, c_est)




c_est = softmax_reg.predict(x_test)
accuracy_score(y_test, c_est)



[1. 1. 1. 1. 1. 5. 1. 2. 1. 1. 4. 4.]


  y = column_or_1d(y, warn=True)


0.43478260869565216

In [None]:


#cross validation to test 

In [None]:
#grid search for finding right hyperparameter

In [None]:
# Test Multiple models

In [None]:
#Evaluation function & Thresholding
    # Loop through thresholds to get best performance
    # If implementing One vs ALl , use baseline performance and compare it with other.
    # Precision, Recall, Accuracy, ROC curve, F1 score
    # Bin Sampling 
    # Lift measure
    # Migth use R^2 , not sure

In [None]:
# Final hold-out sample testing

In [None]:
# If it is underperforming 
    #use other model lile (Logistic Regression{One vs All}) <- Try this definitely
    # Change feature selection method
    # Use l1 unstead of l2 or reduce L2 penalty
    
# If it is overperforming
    # use strict l2
    # go back to simple one vs all model
