IS 2495

Final Project - MyFoodGenie

Sarah Kim

# Content Based Filtering 

##Data Prep

In [211]:
import pandas as pd
import numpy as np

df1 = pd.read_csv('/content/dataset.csv')

In [212]:
df1.head()

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,"white balsamic vinegar, lemon juice, lemon rin..."
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."
2,3,sweet chilli almonds,Snack,veg,"almonds whole, egg white, curry leaves, salt, ..."
3,4,tricolour salad,Healthy Food,veg,"vinegar, honey/sugar, soy sauce, salt, garlic ..."
4,5,christmas cake,Dessert,veg,"christmas dry fruits (pre-soaked), orange zest..."


In [213]:
df1.shape

(400, 5)

In [214]:
df1.columns

Index(['Food_ID', 'Name', 'C_Type', 'Veg_Non', 'Describe'], dtype='object')

In [215]:
df1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Food_ID   400 non-null    int64 
 1   Name      400 non-null    object
 2   C_Type    400 non-null    object
 3   Veg_Non   400 non-null    object
 4   Describe  400 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.8+ KB


##Recommendation Based on 'describe' 

TfIdfVectorizer

In [216]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english') #remove some unnecessary words

In [217]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
ENGLISH_STOP_WORDS #like these words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [218]:
df1['Describe'].isnull().values.any() #See if there's any null words, if false, not any

False

###if the result is true, I should've used this function

In [219]:
df1['Describe'] = df1['Describe'].fillna('') 

In [220]:
tfidf_matrix = tfidf.fit_transform(df1['Describe'])
tfidf_matrix.shape #400 describe attribute contains 1166 words

(400, 1166)

In [221]:
tfidf_matrix

<400x1166 sparse matrix of type '<class 'numpy.float64'>'
	with 6591 stored elements in Compressed Sparse Row format>

### find similarity

 cosine similarity score

In [222]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.1623187 , 0.12997236, ..., 0.12669135, 0.12381163,
        0.09773775],
       [0.1623187 , 1.        , 0.06775767, ..., 0.14711416, 0.05786264,
        0.16466691],
       [0.12997236, 0.06775767, 1.        , ..., 0.03242423, 0.12197946,
        0.01763026],
       ...,
       [0.12669135, 0.14711416, 0.03242423, ..., 1.        , 0.        ,
        0.09761451],
       [0.12381163, 0.05786264, 0.12197946, ..., 0.        , 1.        ,
        0.        ],
       [0.09773775, 0.16466691, 0.01763026, ..., 0.09761451, 0.        ,
        1.        ]])

In [223]:
cosine_sim.shape

(400, 400)

###Create Series

In [224]:
indices = pd.Series(df1.index, index=df1['Name']).drop_duplicates()
indices #The resulting indices Series can be used to look up the row index of a particular Name value in df1 by using the Series' .loc accessor.

Name
summer squash salad                                          0
chicken minced salad                                         1
sweet chilli almonds                                         2
tricolour salad                                              3
christmas cake                                               4
                                                          ... 
Kimchi Toast                                               395
Tacos de Gobernador (Shrimp, Poblano, and Cheese Tacos)    396
Melted Broccoli Pasta With Capers and Anchovies            397
Lemon-Ginger Cake with Pistachios                          398
Rosemary Roasted Vegetables                                399
Length: 400, dtype: int64

In [225]:
indices['chicken minced salad']

1

In [226]:
df1.iloc[[1]]

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."


### MAIN Recommenation Code

In [227]:
# Get the top 10 lists when getting the Food Name
def get_recommendations(Name, cosine_sim=cosine_sim):

# Get the index through Food Name in whole data
  idx = indices[Name]
    
# Get data as (idc,sim) in (cosine_sim) 
  sim_scores = list(enumerate(cosine_sim[idx]))
    
# Get in reverse order based on sim score
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
# 10 food recommendations exclding oneself
  sim_scores = sim_scores[1:11]
    
# 10 index info in food 
  food_indices = [i[0] for i in sim_scores]
    
# get foodname by index
  return df1['Name'].iloc[food_indices]

*###ALL TESTING*

In [228]:
test_idx = indices['chicken minced salad'] # Food Name 을 통해서 전체 데이터 기준 그 food의 index 값을 얻기
test_idx

1

In [229]:
test_sim_scores = list(enumerate(cosine_sim[3])) # 코사인 유사도 매트릭스 (cosine_sim) 에서 idx 에 해당하는 데이터를 (idx, 유사도) 형태로 얻기

In [230]:
test_sim_scores = sorted(test_sim_scores, key=lambda x: x[1], reverse=True) # 코사인 유사도 기준으로 내림차순 정렬
test_sim_scores[1:11] # 자기 자신을 제외한 10개의 추천food 슬라이싱

[(103, 0.3181342687766962),
 (380, 0.2963583549716202),
 (1, 0.2761374340542722),
 (166, 0.26812746042640634),
 (27, 0.2587928875700267),
 (359, 0.25477138979180414),
 (106, 0.25003212523978285),
 (282, 0.24530841245457732),
 (369, 0.24148632121054334),
 (129, 0.2288725282985011)]

In [231]:
# 추천 food 목록 10개의 인덱스 정보 추출
test_food_indices = [i[0] for i in test_sim_scores[1:11]]
test_food_indices

[103, 380, 1, 166, 27, 359, 106, 282, 369, 129]

In [232]:
# 인덱스 정보를 통해 food name 추출
df1['Name'].iloc[test_food_indices]

103                                 chilli chicken
380                       Vietnamese Chicken Salad
1                             chicken minced salad
166                                 veg fried rice
27                         vegetable som tam salad
359    Sesame Noodles with Chili Oil and Scallions
106                            garlic soya chicken
282                              veg hakka noodles
369              Sweet and Sour Chicken Fried Rice
129                            chicken shami kebab
Name: Name, dtype: object

### LET'S TEST HOW IT WORKS

In [233]:
get_recommendations('chicken minced salad')

103                                 chilli chicken
282                              veg hakka noodles
166                                 veg fried rice
151                               prawn fried rice
154                                    chilli fish
106                            garlic soya chicken
328                                 Thai Spareribs
312                             Spicy Korean Steak
167                      egg and garlic fried rice
359    Sesame Noodles with Chili Oil and Scallions
Name: Name, dtype: object

In [234]:
get_recommendations('sweet chilli almonds')

304                        sunga pork
43           andhra pan fried pomfret
96     restaurant style fried chicken
25                      almond pearls
102          amritsari chicken masala
268       spiced orange valencia cake
129               chicken shami kebab
67                         gajar tart
149                 kerala fish curry
168                         curd rice
Name: Name, dtype: object

##Recommendation Based on 'C_Type' and 'Veg_Non'

In [235]:
df1.head(3)

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Describe
0,1,summer squash salad,Healthy Food,veg,"white balsamic vinegar, lemon juice, lemon rin..."
1,2,chicken minced salad,Healthy Food,non-veg,"olive oil, chicken mince, garlic (minced), oni..."
2,3,sweet chilli almonds,Snack,veg,"almonds whole, egg white, curry leaves, salt, ..."


In [236]:
df1.loc[0, 'C_Type']

'Healthy Food'

In [237]:
def data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
        else:
            return ''

In [238]:
features = ['C_Type', 'Veg_Non']
for feature in features:
    df1[feature] = df1[feature].apply(data)

In [239]:
df1[['Name', 'C_Type', 'Veg_Non']].head(3)

Unnamed: 0,Name,C_Type,Veg_Non
0,summer squash salad,healthyfood,veg
1,chicken minced salad,healthyfood,non-veg
2,sweet chilli almonds,snack,veg


In [240]:
def create_soup(x):
    return ''.join(x['C_Type']) + ' ' + ''.join(x['Veg_Non'])
df1['soup'] = df1.apply(create_soup, axis=1)
df1['soup']

0          healthyfood veg
1      healthyfood non-veg
2                snack veg
3          healthyfood veg
4              dessert veg
              ...         
395             korean veg
396        mexican non-veg
397         french non-veg
398        dessert non-veg
399        healthyfood veg
Name: soup, Length: 400, dtype: object

In [241]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df1['soup'])
count_matrix

<400x17 sparse matrix of type '<class 'numpy.int64'>'
	with 962 stored elements in Compressed Sparse Row format>

In [242]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2

array([[1.        , 0.81649658, 0.5       , ..., 0.40824829, 0.40824829,
        1.        ],
       [0.81649658, 1.        , 0.40824829, ..., 0.66666667, 0.66666667,
        0.81649658],
       [0.5       , 0.40824829, 1.        , ..., 0.40824829, 0.40824829,
        0.5       ],
       ...,
       [0.40824829, 0.66666667, 0.40824829, ..., 1.        , 0.66666667,
        0.40824829],
       [0.40824829, 0.66666667, 0.40824829, ..., 0.66666667, 1.        ,
        0.40824829],
       [1.        , 0.81649658, 0.5       , ..., 0.40824829, 0.40824829,
        1.        ]])

In [243]:
indices['summer squash salad']

0

In [244]:
df1 = df1.reset_index()
indices = pd.Series(df1.index, index=df1['Name'])
indices

Name
summer squash salad                                          0
chicken minced salad                                         1
sweet chilli almonds                                         2
tricolour salad                                              3
christmas cake                                               4
                                                          ... 
Kimchi Toast                                               395
Tacos de Gobernador (Shrimp, Poblano, and Cheese Tacos)    396
Melted Broccoli Pasta With Capers and Anchovies            397
Lemon-Ginger Cake with Pistachios                          398
Rosemary Roasted Vegetables                                399
Length: 400, dtype: int64

### LET'S TEST HOW IT WORKS

In [245]:
get_recommendations('summer squash salad', cosine_sim2)

3                        tricolour salad
8                   cream of almond soup
9               broccoli and almond soup
10             coconut lime quinoa salad
12    watermelon and strawberry smoothie
13    peach, raspberry and nuts smoothie
26                  hawaiin papaya salad
27               vegetable som tam salad
33         mixed berry & banana smoothie
34                banana walnut smoothie
Name: Name, dtype: object

In [246]:
df1.loc[0] #data of summer squash salad

index                                                       0
Food_ID                                                     1
Name                                      summer squash salad
C_Type                                            healthyfood
Veg_Non                                                   veg
Describe    white balsamic vinegar, lemon juice, lemon rin...
soup                                          healthyfood veg
Name: 0, dtype: object

In [247]:
df1.loc[3] #data of tricolour salad

index                                                       3
Food_ID                                                     4
Name                                          tricolour salad
C_Type                                            healthyfood
Veg_Non                                                   veg
Describe    vinegar, honey/sugar, soy sauce, salt, garlic ...
soup                                          healthyfood veg
Name: 3, dtype: object

both summer squash salad and tricolour salad are healthyfood and veg

In [248]:
get_recommendations('christmas cake', cosine_sim2)

6                chocolate nero cookies
17                 grilled almond barfi
20                          apple rabdi
22                 dates and nuts ladoo
23           green lentil dessert fudge
24                   cashew nut cookies
31            almond and amaranth ladoo
50    christmas chocolate fudge cookies
65                   betel nut popsicle
68         banana and maple ice lollies
Name: Name, dtype: object

#Collaborative Filtering

##Data Prep

In [249]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [250]:
import surprise
surprise.__version__

'1.1.3'

In [251]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [252]:
df2 = pd.read_csv('/content/ratings.csv')
df2.head()

Unnamed: 0,User_ID,Food_ID,Rating
0,1.0,88.0,4.0
1,1.0,46.0,3.0
2,1.0,24.0,5.0
3,1.0,25.0,4.0
4,2.0,49.0,1.0


In [253]:
#Checking the shape
df2.shape

(512, 3)

In [254]:
# Checking for null values 
df2.isnull().sum()

User_ID    1
Food_ID    1
Rating     1
dtype: int64

In [255]:
df2 = df2.dropna()

In [256]:
df2.isnull().sum()

User_ID    0
Food_ID    0
Rating     0
dtype: int64

In [257]:
df2['Rating'].min()

1.0

In [258]:
df2['Rating'].max()

10.0

In [259]:
reader = Reader(rating_scale=(1, 10))

In [260]:
data = Dataset.load_from_df(df2[['User_ID', 'Food_ID', 'Rating']], reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7faa39fe96a0>

## Cross Validate(RMSE, MAE)

###K-Fold

In [261]:
svd = SVD(random_state=0)

In [262]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.9893  2.7734  3.0034  2.8162  2.7400  2.8645  0.1105  
MAE (testset)     2.6094  2.3772  2.6280  2.3748  2.3156  2.4610  0.1308  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([2.9893157 , 2.77343519, 3.00336604, 2.81620099, 2.73996959]),
 'test_mae': array([2.60935674, 2.37717352, 2.62802854, 2.37482444, 2.31558793]),
 'fit_time': (0.009449958801269531,
  0.01004791259765625,
  0.006375789642333984,
  0.006022930145263672,
  0.006298065185546875),
 'test_time': (0.0022389888763427734,
  0.0007269382476806641,
  0.0007965564727783203,
  0.0007719993591308594,
  0.0006494522094726562)}

cv = 5 (divide data into 5 datasests)

100 data

A:1-20

B:21-40

C:41-60

D:61-80

E:81-100

ABCD (train set) E (test set)

ABCE (train set) D (test set)

ABDE (train set) C (test set)

ACDE (train set) B (test set)

BCDE (train set) A (test set)

In [263]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7faa39f62820>

In [264]:
df2[df2['User_ID'] == 1] #how userid = 1 rated the food

Unnamed: 0,User_ID,Food_ID,Rating
0,1.0,88.0,4.0
1,1.0,46.0,3.0
2,1.0,24.0,5.0
3,1.0,25.0,4.0


In [265]:
svd.predict(1, 12) # predict how would userid = 1 rate foodid = 12 -> est = 10 might rate 10

Prediction(uid=1, iid=12, r_ui=None, est=4.4544710848220825, details={'was_impossible': False})

In [266]:
svd.predict(1, 300, 3) # User_ID = 1 real rated score is 3 for Food_ID = 300, how would it be the predicted score? -> est = 10

Prediction(uid=1, iid=300, r_ui=3, est=5.31926947929299, details={'was_impossible': False})

apparently this isn't accurate, I think bc the data is small

In [267]:
df2[df2['User_ID'] == 100]

Unnamed: 0,User_ID,Food_ID,Rating
508,100.0,24.0,10.0
509,100.0,233.0,10.0
510,100.0,29.0,7.0


In [268]:
svd.predict(100, 300) # User_Id = 100, Food_Id = 300

Prediction(uid=100, iid=300, r_ui=None, est=6.826004026075378, details={'was_impossible': False})

userid 100 would rate 10 for foodid 300