In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import swifter


pd.set_option('display.max_columns', None)

In [None]:
recipes = pd.read_csv('./recipes.csv')
requests = pd.read_csv('./requests.csv')
reviews = pd.read_csv('./reviews.csv')
diet = pd.read_csv('./diet.csv')

import requests as rq
url = 'https://raw.githubusercontent.com/sallto/baml/master/classify_diet.csv'
page = rq.get(url)
classify_diet = pd.DataFrame([x.split(':;') for x in page.text.split('\n')], columns = ['Ingredient', 'Diet'])[1:-1]
classify_diet

In [None]:
recipes['RecipeIngredientParts'] = recipes['RecipeIngredientParts'].apply(lambda x: x.replace('\\','').replace('"','').replace('c(', '').replace(')','').replace('(',''))
recipes['RecipeIngredientQuantities'] = recipes['RecipeIngredientQuantities'].apply(lambda x: x.replace('\\','').replace('"','').replace('c(', '').replace(')','').replace('(',''))

def func(x):
    res = []
    for y in x.split(','):
        y = y.strip().lower().replace('-',' ')
        found = classify_diet.loc[classify_diet['Ingredient'] == y]['Diet'].values
        if len(found) == 0:
            print(y)
            continue
        else:
            res.append(found[0])
    return res

# add meal type
classify_diet['Ingredient'] = classify_diet['Ingredient'].str.lower().str.strip().str.replace('(', '').str.replace(')', '').replace('-',' ')
recipes['RecipeIngredientParts'] = recipes['RecipeIngredientParts'].str.lower().str.strip().replace('-',' ')

#classify_diet
print(len(recipes))
recipes['DietIngredient'] = recipes['RecipeIngredientParts'].swifter.allow_dask_on_strings(enable=True).apply(func)

recipes['DietIngredient']

In [None]:
def decideDiet(x):
    if list(set(x)) == list(['Vegan']):
        return 'Vegan'
    if list(set(x)) == list(['Vegetarian']) or list(set(x)) == list(['Vegan', 'Vegetarian']):
        return 'Vegetarian'
    return 'Omnivore'


recipes['DietRecipe'] = recipes['DietIngredient'].apply(decideDiet)
recipes['DietRecipe']

In [None]:
# join recipes and requests
joined = recipes.merge(reviews, on='RecipeId', how='inner')

# join with diet
joined = joined.merge(diet, on='AuthorId', how='inner')

# join with requests
all = joined.merge(requests, on=['RecipeId', 'AuthorId'], how='inner')

In [None]:
mapping = {
    'RecipeCategory': 'string',
    'RecipeIngredientQuantities': 'string',
    'RecipeIngredientParts': 'string',
    'Name': 'string',
    'AuthorId': 'string',
    'Like': 'bool',
    'Diet': 'string',
    'HighCalories': 'bool',
    'HighProtein': 'string',
    'LowFat': 'bool',
    'LowSugar': 'string',
    'HighFiber': 'bool'
    
}
df = all.astype(mapping)

In [None]:
# add feature Rated
df['Rated'] = df['Rating'].apply(lambda x: True if x > 0 else False)

# HighProtein and LowSugar have only two values
mapping = {
    'HighProtein': 'bool',
    'LowSugar': 'bool',
}
df = df.astype(mapping)

# add feature MatchesDiet and DietCompatible
def dietComatible(row):
    diet = row['Diet']
    diet_recipe = row['DietRecipe']
    if diet == 'Vegan':
        return diet_recipe == 'Vegan'
    if diet == 'Vegetarian':
        return (diet_recipe == 'Vegan' or diet_recipe == 'Vegetarian')
    return True
df['MatchesDiet'] = df['Diet'] == df['DietRecipe']
df['DietCompatible'] = df.apply(dietComatible, axis=1)


In [None]:
print(df.groupby("Like").size())

In [None]:
def boxplot(column):
    sns.boxplot(data=df,x=df[f"{column}"])
    plt.show()

cols = ['Age', 'CookTime', 'PrepTime', 'Calories', 'FatContent', 'SaturatedFatContent', 'CarbohydrateContent', 'FiberContent', 
        'SugarContent', 'ProteinContent', 'SodiumContent', 'CholesterolContent']

#for col in cols:
    #boxplot(col)

In [None]:
mapping = {
    'Diet': 'category',
    'DietRecipe': 'category',
    'RecipeCategory': 'category',
    'DietCompatible': 'bool',
    'MatchesDiet': 'bool',
}
df = df.astype(mapping)
df = pd.get_dummies(df, columns=['Diet', 'DietRecipe', 'RecipeCategory'])

In [None]:
df['RecipeServings'].fillna(1, inplace=True)

df['TimePerServing'] = df['CookTime'] / df['RecipeServings']
df['TotalTime'] = df['CookTime'] + df['PrepTime']
df['TotalTimePerServing'] =  df['TotalTime'] / df['RecipeServings']
df['CaloriesPerServing'] = df['Calories'] / df['RecipeServings']
df['ProteinFatRatio'] = df['ProteinContent'] / (df['FatContent']+0.000001)
df['RequestedTimeRatio'] = df['Time'] / (df['TotalTime']+0.000001)

df['isInTime'] = df['TotalTime'] <= df['Time']

df['NumberIngredients'] = df['RecipeIngredientParts'].apply(lambda x: len(x.split(',')))

def getRecipePopularity(row):
    recipe_id = row['RecipeId']
    return len(df.loc[(df['RecipeId'] == recipe_id) & (df['Like'] == True)])

df['RecipePopularity'] = df.swifter.allow_dask_on_strings(enable=True).apply(getRecipePopularity, axis=1)
df['RecipePopularity'].fillna(0, inplace=True)

df['RequestedIngredientCount'] = df['RecipeIngredientQuantities'].apply(lambda x: len(x.split(',')))
df['IngredientCountMatches'] = df['NumberIngredients'] == df['RequestedIngredientCount']


In [None]:
df['AvgRecipePopularity'] = df.swifter.allow_dask_on_strings(enable=True).apply(lambda row: row['RecipePopularity'] / len(df.loc[df['RecipeId'] == row['RecipeId']]), axis=1)
df['AvgRecipePopularity']

In [None]:
""" df['AuthorRecipeCount'] = df.swifter.allow_dask_on_strings(enable=True).apply(lambda row: len(df.loc[df['AuthorId'] == row['AuthorId']]), axis=1)
df['AuthorRecipeCount'] """

df['AuthorFrequency'] = df.groupby('AuthorId')['AuthorId'].transform('count')
df['RecipeFrequency'] = df.groupby('RecipeId')['RecipeId'].transform('count')
df['AuthorFrequency']

In [None]:
bins = [0, 18, 24, 34, 44, 54, 64, 100]
labels = ['0-18', '19-24', '25-34', '35-44', '45-54', '55-64', '65+']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)
df = pd.get_dummies(df, columns=['AgeGroup'])

#df['TimePerServing']


Until here all feature engineering stuff must be done

We can see some outlier which we will remove

In [None]:
# takes around 3 minutes
print(list(df))
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

numeric_cols = ['Age', 'CookTime', 'PrepTime', 'Calories', 'FatContent', 'SaturatedFatContent', 'CarbohydrateContent', 'FiberContent', 
        'SugarContent', 'ProteinContent', 'SodiumContent', 'CholesterolContent', 'TimePerServing','TotalTime','TotalTimePerServing','CaloriesPerServing', 'ProteinFatRatio'
        , 'RequestedTimeRatio', 'NumberIngredients']
df_lof = df[numeric_cols]

scaler = StandardScaler()
df_lof = scaler.fit_transform(df_lof)

clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(df_lof)
X_scores = clf.negative_outlier_factor_

df['Outlier'] = y_pred

In [None]:
from sklearn.ensemble import IsolationForest

df_lof = df[numeric_cols]
clf = IsolationForest(random_state=0).fit(df_lof)
y_pred = clf.fit_predict(df_lof)

#df['Outlier'] = y_pred

In [None]:
train_all = df[df['TestSetId'].isna()]
test = df[df['TestSetId'].notna()]
print(len(train_all))
print(len(test))
test[test['TestSetId'].isna()]

In [None]:
threshold = -10
#outlier = train_all[train_all['Outlier'] < threshold]
outlier = train_all[train_all['Outlier'] == -1]
print(len(outlier))
outlier['Name'].unique()

In [None]:
#train_all = train_all[train_all['Outlier'] >= threshold]
train_all = train_all[train_all['Outlier'] == 1]
print(len(train_all))
train_all.head()

In [None]:
print(train_all.groupby("Like").size())

In [None]:
""" import matplotlib.pyplot as plt
%matplotlib inline

for col in numeric_cols:
    plt.hist(train_all[col], density=True, bins=30)  # density=False would make counts
    plt.ylabel('Probability')
    plt.xlabel(col)
    plt.show() """

In [None]:
train_all.describe()

The class labels are uneven distributed. Therefore, we sample only a part of the False Like labels

In [None]:
""" false_sample = train_all[train_all['Like'] == False].sample(frac=0.9, random_state=1)
true_sample = train_all[train_all['Like'] == True]
true_sample = pd.concat([true_sample] * 6)

train_all = pd.concat([false_sample, true_sample])
print(len(train_all))
print(train_all.groupby("Like").size()) """



In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


""" 

"""

X_cols = [
 'CookTime',
 'PrepTime',
 'Calories',
 'FatContent',
 'SaturatedFatContent',
 'CholesterolContent',
 'SodiumContent',
 'CarbohydrateContent',
 'FiberContent',
 'SugarContent',
 'ProteinContent',
 'Age',
 'Time',
 'HighCalories',
 'HighProtein',
 'LowFat',
 'LowSugar',
 'HighFiber',
 'Rated',

 'MatchesDiet',
 'DietCompatible',
 
 'AgeGroup_0-18', 
 'AgeGroup_19-24', 
 'AgeGroup_25-34', 
 'AgeGroup_35-44', 
 'AgeGroup_45-54', 
 'AgeGroup_55-64', 
 'AgeGroup_65+',

 'Diet_Omnivore',
 'Diet_Vegan',
 'Diet_Vegetarian',
 'DietRecipe_Omnivore',
 'DietRecipe_Vegan',
 'DietRecipe_Vegetarian',
 'RecipeCategory_Beverages',
 'RecipeCategory_Bread',
 'RecipeCategory_Breakfast',
 'RecipeCategory_Lunch',
 'RecipeCategory_One dish meal',
 'RecipeCategory_Other',
 'RecipeCategory_Soup',

 'NumberIngredients',

 'RecipePopularity',

 'AvgRecipePopularity',

 'AuthorFrequency',
 'RecipeFrequency',

 'IngredientCountMatches',

 'TimePerServing','TotalTime','TotalTimePerServing','CaloriesPerServing','ProteinFatRatio','RequestedTimeRatio'
 ]

X = train_all[X_cols]
y = train_all['Like']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)

In [None]:
print(pd.DataFrame(y_train, columns=['Like']).groupby("Like").size())
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print(pd.DataFrame(y_train, columns=['Like']).groupby("Like").size())

In [None]:
""" from imblearn.over_sampling import RandomOverSampler
print(pd.DataFrame(y_train, columns=['Like']).groupby("Like").size())
oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

print(pd.DataFrame(y_train, columns=['Like']).groupby("Like").size()) """

In [None]:
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
decision_tree = tree.DecisionTreeClassifier()
clf = decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_val)


print(classification_report(y_val, y_pred))
print(balanced_accuracy_score(y_val, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score

rf = RandomForestClassifier(max_depth=6, random_state=0, n_estimators=300)
rf = rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)


print(classification_report(y_val, y_pred))
print(balanced_accuracy_score(y_val, y_pred))

In [None]:
import numpy as np
# Get feature importances
feature_importances = rf.feature_importances_

# Get the indices of the features in descending order of importance
indices = np.argsort(feature_importances)[::-1]

# Print feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{X_train.columns[indices[f]]}: {feature_importances[indices[f]]}")


# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), feature_importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance Score")
plt.show()

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score

count_positive = len(train_all[train_all['Like'] == 1])
count_negative = len(train_all[train_all['Like'] == 0])
scale_pos_weight = count_negative / count_positive 
scale_pos_weight = 1
print(scale_pos_weight)

#Creating an XGBoost classifier
xgboost_only = xgb.XGBClassifier(scale_pos_weight = scale_pos_weight)

#Training the model on the training data
xgboost_only.fit(X_train, y_train)

#Making predictions on the test set
predictions = xgboost_only.predict(X_val)

print(classification_report(y_val, predictions))
print(balanced_accuracy_score(y_val, predictions))
#0.9413489394190455
#0.9427678583094481
#0.8855812391596929

In [None]:
xgb.plot_importance(xgboost_only, max_num_features = 30)

In [None]:
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report

xgboost = xgb.XGBClassifier()
bagging_clf = BaggingClassifier(estimator = xgboost, n_estimators=30, random_state=0, max_samples = 0.7, oob_score = True).fit(X_train, y_train)
predictions = bagging_clf.predict(X_val)

print(bagging_clf.oob_score_)
print(classification_report(y_val, predictions))
print(balanced_accuracy_score(y_val, predictions))
#0.9587725853231339

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler


scaler = StandardScaler()
X_train_stand = scaler.fit_transform(X_train)
X_val_stand = scaler.fit_transform(X_val)

knn_model = KNeighborsClassifier(n_neighbors=200, weights = 'distance')
#Training the model on the training data
knn_model.fit(X_train_stand, y_train)

#Making predictions on the test set
predictions = knn_model.predict(X_val_stand)

print(classification_report(y_val, predictions))
print(balanced_accuracy_score(y_val, predictions))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree

decision_tree = tree.DecisionTreeClassifier(max_depth=2)
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42, estimator=decision_tree)

#Training the model on the training data
adaboost_model.fit(X_train, y_train)

#Making predictions on the test set
predictions = adaboost_model.predict(X_val)

print(classification_report(y_val, predictions))
print(balanced_accuracy_score(y_val, predictions))
#0.9106066772690254

In [51]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
np.int=int # fix annoying depreciation error


In [57]:


random_forest = RandomForestClassifier(random_state=0)

decision_tree = tree.DecisionTreeClassifier(random_state=0)
xgboost = xgb.XGBClassifier(random_state=0)
knn_model = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]) # n_neighbors=200


eclf = VotingClassifier(estimators=[('xgboost', xgboost), ('dt', decision_tree), ('knn', knn_model), ('adaboost', adaboost_model), ('rf', random_forest)], voting='hard')
# shorthand__parametername
params={
    'xgboost__max_depth': (10,20),
    'knn__knn__n_neighbors': (100,500),
    'rf__max_depth': (10,30),
    'adaboost__n_estimators': (50,200),
}
search = BayesSearchCV(estimator=eclf, search_spaces=params, n_jobs=-1, n_iter=50,n_points=2) # define cv

#Training the model on the training data
#eclf.fit(X_train, y_train)
search.fit(X_train,y_train)

# report the best result
print(search.best_score_)
print(search.best_params_)
# use best model
eclf=search.best_estimator_

#Making predictions on the test set
predictions = eclf.predict(X_val)

print(classification_report(y_val, predictions))
print(balanced_accuracy_score(y_val, predictions))
#0.9665586708637973



0.956829217696199
OrderedDict([('adaboost__n_estimators', 50), ('knn__knn__n_neighbors', 100), ('rf__max_depth', 24), ('xgboost__max_depth', 14)])
              precision    recall  f1-score   support

       False       0.95      0.98      0.97     27325
        True       0.83      0.68      0.75      4144

    accuracy                           0.94     31469
   macro avg       0.89      0.83      0.86     31469
weighted avg       0.94      0.94      0.94     31469

0.830069139522479


## Save predictions as file

In [58]:
# train with all data
xgboost_final = eclf

final_model = xgboost_final
final_model = final_model.fit(X, y)
predictions_final = final_model.predict(test[X_cols])

df_final = pd.DataFrame()
df_final['id'] = test['TestSetId']
df_final['prediction'] = predictions_final

df_final['id'] = df_final['id'].astype(int)
df_final['prediction'] = df_final['prediction'].astype(int)

df_final.to_csv('predictions.csv', index=False)
df_final

Unnamed: 0,id,prediction
0,1548,0
1,1547,1
2,2069,0
3,2070,0
4,2152,0
...,...,...
140157,6587,1
140161,41225,1
140168,2361,1
140171,41081,1
