In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import spatial


In [2]:
# Load the recipe dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel(io=r'giallozaferano_dataset.xlsx')

In [3]:
# Ordinal encodings for the Italian category labels. Values are digit strings;
# a later cell converts both columns with pd.to_numeric(errors="coerce"), so any
# label that is not mapped here ends up as NaN.
cost_cat_dic = {
    'Molto basso': '1',
    'Basso': '2',
    'Medio': '3',
    'Elevato': '4',
    # NOTE(review): every other cost label is masculine, so the top category is
    # presumably spelled 'Molto elevato' in the data. Both spellings are accepted
    # so those rows are not silently left unmapped — confirm with df["cost"].unique().
    'Molto elevato': '5',
    'Molto elevata': '5',
}

diff_cat_dic = {
    'Molto facile': '1',
    'Facile': '2',
    'Media': '3',
    'Difficile': '4',
    'Molto difficile': '5',
}

# One replace call covers both columns; labels absent from a mapping are left unchanged.
df.replace({"cost": cost_cat_dic, "difficulty": diff_cat_dic}, inplace=True)

In [4]:
# Keep the first run of digits found in each time column and convert it to a number.
# The pattern is a raw string: '\d' inside a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# Values without digits become NaN, so the resulting columns are float, not integer.
# NOTE(review): only the first number is kept; a value made of several numeric parts
# would be reduced to its first part — confirm the time format used in the dataset.
for time_col in ("totalTime", "cookTime", "prepTime"):
    df[time_col] = pd.to_numeric(
        df[time_col].str.extract(r'(\d+)', expand=False),
        errors="coerce",
    )

# Anything in ratingCount that cannot be parsed as a number becomes NaN.
df["ratingCount"] = pd.to_numeric(df["ratingCount"], errors="coerce")

# cost and difficulty hold digit strings after the category replacement;
# convert them to numbers, turning any label that was not mapped into NaN.
for category_col in ("cost", "difficulty"):
    df[category_col] = pd.to_numeric(df[category_col], errors="coerce")

In [5]:
# bestRating serves no real purpose: a single rating is enough to reach the maximum.
# errors="ignore" keeps the cell re-runnable (a plain `del` raises KeyError the second time).
df = df.drop(columns=["bestRating"], errors="ignore")

In [6]:
# Drop rows whose cost is NaN: missing or inaccurate cost values would defeat the
# purpose of the recommendation. copy() gives an independent frame so the column
# assignments below never act on a view of the pre-dropna frame.
df = df.dropna(subset=["cost"]).copy()

# Replace missing nutrition values with 0; 0 marks the amount of that feature as unknown.
# The filled columns are assigned back to df because Series.fillna(inplace=True) on a
# column selection (df[col].fillna(..., inplace=True)) is chained assignment and does
# not update df when pandas Copy-on-Write is active.
nutrition_cols = [
    "cholesterol", "fibers", "fat", "saturatedFat", "sodium",
    "proteins", "sugars", "carbohydrates", "calories",
]
df[nutrition_cols] = df[nutrition_cols].fillna(0)

In [7]:
from scipy import spatial

In [9]:
# Nutrition columns that feed the "not important" similarity score.
unimp_features = df[[
    "calories",
    "carbohydrates",
    "sugars",
    "proteins",
    "fat",
    "saturatedFat",
    "fibers",
    "cholesterol",
    "sodium",
]]

In [11]:
# Three reference dishes, picked by row position (iloc), not by index label.
dish_one, dish_two, dish_three = (
    unimp_features.iloc[position] for position in (235, 621, 831)
)

In [12]:
# Two independent working copies of df: one per feature group.
df_important, df_unimportant = df.copy(), df.copy()

In [13]:
# Cosine similarity (1 - cosine distance) between every recipe's nutrition vector and
# each of the three reference dishes. One apply per reference dish builds the whole
# column at once instead of writing single cells through .loc inside an iterrows loop;
# the values are the same because the same scipy function is called on the same rows.
# NOTE(review): rows whose nutrition values are all 0 presumably have no defined
# cosine similarity and come out as NaN.
for cos_col, reference in (
    ("cos_one", dish_one),
    ("cos_two", dish_two),
    ("cos_three", dish_three),
):
    df_unimportant[cos_col] = unimp_features.apply(
        lambda row, ref=reference: 1 - spatial.distance.cosine(ref, row),
        axis=1,
    )

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [22]:
# Mean of the three nutrition-based cosine similarities, computed column-wise and
# aligned to df by index. A NaN in any of the three similarities yields a NaN score.
df["notimportscore"] = (
    df_unimportant["cos_one"] + df_unimportant["cos_two"] + df_unimportant["cos_three"]
) / 3

In [16]:
# No visible cell creates a 'score' column on df_unimportant, so on a fresh kernel the
# plain column lookup raises KeyError. Derive the score here as the mean of the three
# cosine similarities (NaN propagates), then rank recipes by it.
(
    df_unimportant
    .assign(score=lambda frame: (frame["cos_one"] + frame["cos_two"] + frame["cos_three"]) / 3)
    .loc[:, ["title", "score"]]
    .sort_values('score', ascending=False)
)

Unnamed: 0,title,score
2382,Yogurt panna cotta with peach jelly,0.980442
1886,Ice cream without berries ice cream,0.980430
1165,Yogurt ice cream,0.980379
3462,Pavlova at coffee,0.979870
804,Cream ice cream,0.979860
...,...,...
4623,Olive leaves with omelette cubes,
4626,Crush with vegetable turret and fondue,
4627,Legume flour beans with chestnuts and speck,
4630,Panettoncini with candied tangerine and rosemary,


In [17]:
# Columns that feed the "important" similarity score.
# NOTE(review): these columns are compared unscaled in the cosine similarity below.
imp_features = df[["cost", "totalTime", "difficulty"]]

In [18]:
# Same three reference dishes (row positions 235, 621, 831), now described by the
# cost / totalTime / difficulty columns.
dish_one, dish_two, dish_three = (
    imp_features.iloc[position] for position in (235, 621, 831)
)

In [19]:
# Cosine similarity (1 - cosine distance) between every recipe's cost/time/difficulty
# vector and each of the three reference dishes. One apply per reference dish builds
# the whole column at once instead of writing single cells through .loc inside an
# iterrows loop; the same scipy function is called on the same rows.
for cos_col, reference in (
    ("cos_one", dish_one),
    ("cos_two", dish_two),
    ("cos_three", dish_three),
):
    df_important[cos_col] = imp_features.apply(
        lambda row, ref=reference: 1 - spatial.distance.cosine(ref, row),
        axis=1,
    )

In [23]:
# Mean of the three cost/time/difficulty cosine similarities, computed column-wise and
# aligned to df by index. A NaN in any of the three similarities yields a NaN score.
df["importscore"] = (
    df_important["cos_one"] + df_important["cos_two"] + df_important["cos_three"]
) / 3

In [21]:
# No visible cell creates a 'score' column on df_important, so on a fresh kernel the
# plain column lookup raises KeyError. Derive the score here as the mean of the three
# cosine similarities (NaN propagates), then rank recipes by it.
(
    df_important
    .assign(score=lambda frame: (frame["cos_one"] + frame["cos_two"] + frame["cos_three"]) / 3)
    .loc[:, ["title", "score"]]
    .sort_values('score', ascending=False)
)

Unnamed: 0,title,score
1723,Spice bread house,0.999851
3384,Vegan tartellettes,0.999846
929,Rolled focaccia,0.999846
2740,Black Forest Cheesecake,0.999846
3689,Red muffins,0.999846
...,...,...
3131,Roast beef with zucchini potato tin and cherry...,0.084775
2897,Braised mushrooms and red wine,0.084775
1031,Liquid mother yeast,0.079977
3858,Brinate rose petals,0.078774


In [24]:
# Show the two partial scores side by side.
df.loc[:, ["importscore", "notimportscore"]]

Unnamed: 0,importscore,notimportscore
0,0.998856,0.938100
1,0.999594,0.931541
2,0.999461,0.942993
3,0.999461,0.850722
4,0.999196,0.662935
...,...,...
4631,0.998856,0.741445
4633,0.996972,0.585174
4635,0.997959,0.628487
4636,0.999089,0.544935


In [25]:
# Weighted mean of the two partial scores: importscore counts twice, notimportscore
# once (the same 2:1 weighting as adding importscore to itself). Computed column-wise
# instead of cell by cell in an iterrows loop.
df["totalscore"] = (2 * df["importscore"] + df["notimportscore"]) / 3

In [26]:
# Fifteen recipes with the highest combined score.
(
    df.loc[:, ["title", "cost", "difficulty", "importscore", "notimportscore", "totalscore"]]
    .sort_values(by='totalscore', ascending=False)
    .head(15)
)

Unnamed: 0,title,cost,difficulty,importscore,notimportscore,totalscore
1886,Ice cream without berries ice cream,2.0,3.0,0.999778,0.98043,0.993328
1165,Yogurt ice cream,2.0,2.0,0.999443,0.980379,0.993089
565,Milk cream,2.0,2.0,0.999805,0.979652,0.993087
944,Cookies,3.0,3.0,0.999805,0.979292,0.992967
2390,Pan of the Dead,3.0,2.0,0.999377,0.979792,0.992848
1411,Coffee chocolate salami,2.0,2.0,0.999461,0.979423,0.992782
2382,Yogurt panna cotta with peach jelly,3.0,2.0,0.998925,0.980442,0.992764
3906,Coffee with salentina,2.0,2.0,0.999825,0.978548,0.992733
3322,Christmas brownies,2.0,2.0,0.999443,0.979299,0.992729
623,Yogurt cake with strawberry jelly and berries,3.0,3.0,0.999846,0.978484,0.992725


In [28]:
# Full rows of the three reference dishes (row positions 235, 621, 831) for inspection.
dish_one, dish_two, dish_three = (
    df.iloc[position] for position in (235, 621, 831)
)

In [30]:
# Title of the first reference dish.
dish_one.loc["title"]

'Muffin with the heart'

In [31]:
# Title of the second reference dish.
dish_two.loc["title"]

'Tropea onion jam'

In [33]:
# Title of the third reference dish.
dish_three.loc["title"]

'Cheesecake caramel and hazelnuts'