<a href="https://colab.research.google.com/github/Lawrence-Krukrubo/Building-a-Content-Based-Movie-Recommender-System/blob/master/building_a_content_based_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Content-based recommendation system

In [170]:
import pandas as pd
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
from tqdm import tqdm
import re

InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 40)

In [171]:
bay_df = pd.read_csv("bay_full.csv")
bay_df = bay_df.rename(columns={'Unnamed: 0': 'activity_id'})

In [172]:
print('Shape:',bay_df.shape)
bay_df.head()

Shape: (826, 11)


Unnamed: 0,activity_id,title,features,quality,price,popularity,category,latitude,longitude,address,link
0,0,La Note,"{'brunch': 185, 'pancakes': 96, 'french toast'...",4.4,2,1702,French restaurant,37.866206,-122.267249,"2377 Shattuck Ave., Berkeley, CA 94704",https://www.google.com/maps/place/La+Note/data...
1,1,Grégoire Restaurant,"{'puffs': 115, 'fried chicken sandwich': 42, '...",4.5,2,917,French restaurant,37.878583,-122.268578,"2109 Cedar St, Berkeley, CA 94709",https://www.google.com/maps/place/Gr%C3%A9goir...
2,2,À Côté,"{'mussels': 28, 'wine list': 19, 'cocktails': ...",4.4,3,413,Restaurant,37.842073,-122.251264,"5478 College Ave, Oakland, CA 94618",https://www.google.com/maps/place/%C3%80+C%C3%...
3,3,Julia's Restaurant,"{'architecture': 5, 'hotel': 4, 'wine': 4, 'st...",4.4,0,102,French restaurant,37.867621,-122.262837,"2315 Durant Ave 2nd floor, Berkeley, CA 94704",https://www.google.com/maps/place/Julia's+Rest...
4,4,Le Bateau Ivre Restaurant,"{'atmosphere': 21, 'coffee': 16, 'brunch': 13,...",4.3,2,241,Restaurant,37.862503,-122.258559,"2629 Telegraph Ave, Berkeley, CA 94704",https://www.google.com/maps/place/Le+Bateau+Iv...


In [173]:
#print('Ratings_df Shape:',ratings_df.shape)
#ratings_df.head()
# No ratings for now...

#### Get categorical labels

Note for future: content could be quantified, not just categorical + cutting metadata for data filtering & interaction for now

In [174]:
#Every genre is separated by a | so we simply have to call the split function on |
bay_df['labels'] = bay_df['features'].apply(lambda x: re.findall(r'\b[a-zA-Z]+(?=\:)', x.replace("{", "").replace("}", "").replace("'", "")))
bay_df['labels'] = bay_df.apply(lambda row: row['labels'] + [row['category'].lower()], axis=1)

# Interpreting values as categorical data for now
# Excluding price, rating, popularity
bay_df = bay_df.drop(['features', 'quality', 'price', 'popularity', 'latitude', 'longitude', 'address', 'category', 'link'], axis=1)

bay_df.head()

Unnamed: 0,activity_id,title,labels
0,0,La Note,"[brunch, pancakes, toast, patio, fries, ginger..."
1,1,Grégoire Restaurant,"[puffs, sandwich, boxes, buttermilk, poutine, ..."
2,2,À Côté,"[mussels, list, cocktails, patio, plates, flat..."
3,3,Julia's Restaurant,"[architecture, hotel, wine, steak, week, duck,..."
4,4,Le Bateau Ivre Restaurant,"[atmosphere, coffee, brunch, dinner, salmon, s..."


No missing values

In [175]:
bay_df.isna().sum()

activity_id    0
title          0
labels         0
dtype: int64

In [176]:
bay_df.head(3)

Unnamed: 0,activity_id,title,labels
0,0,La Note,"[brunch, pancakes, toast, patio, fries, ginger..."
1,1,Grégoire Restaurant,"[puffs, sandwich, boxes, buttermilk, poutine, ..."
2,2,À Côté,"[mussels, list, cocktails, patio, plates, flat..."


Remove niche labels

In [177]:
exploded_labels = bay_df['labels'].explode()

label_counts = Counter(exploded_labels)

NICHE_THRESHOLD = 8

values_greater_than = sum(1 for count in label_counts.values() if count >= NICHE_THRESHOLD)
values_greater_than

bay_df['labels'] = bay_df['labels'].apply(lambda arr: [val for val in arr if label_counts[val] >= NICHE_THRESHOLD])
bay_df.head(40)

255

Unnamed: 0,activity_id,title,labels
0,0,La Note,"[brunch, pancakes, toast, patio, fries, mimosas]"
1,1,Grégoire Restaurant,"[sandwich, boxes, sandwich, sandwich, tables]"
2,2,À Côté,"[list, cocktails, patio, lighting, restaurant]"
3,3,Julia's Restaurant,"[architecture, hotel, wine, duck]"
4,4,Le Bateau Ivre Restaurant,"[atmosphere, brunch, dinner, salmon, soup, pat..."
5,5,The Butcher’s Son Vegan Delicatessen & Bakery,"[salad, pork, sandwich, sandwich]"
6,6,Tane Vegan Izakaya,"[sushi, rolls, sprouts, sunset, soup, eggplant]"
7,7,Humbowl,"[vegan, price, shrimp, ingredients, dinner, sa..."
8,8,FAVA,"[falafel, healthy, soup, hummus, lamb, salmon]"
9,9,Long Life Vegi House,"[vegan, soup, dinner, eggplant, rice, rolls]"


#### One hot encode

In [178]:
activities_encoded = bay_df.copy(deep=True)

# Let's iterate through movies_df, then append the movie genres as columns of 1s or 0s.
# 1 if that column contains movies in the genre at the present index and 0 if not.

x = []
for index, row in bay_df.iterrows():
    x.append(index)
    for label in row['labels']:
        activities_encoded.at[index, label] = 1

print(len(x) == len(bay_df))
activities_encoded = activities_encoded.fillna(0)

activities_encoded.head(3)

True


  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, label] = 1
  activities_encoded.at[index, lab

Unnamed: 0,activity_id,title,labels,brunch,pancakes,toast,patio,fries,mimosas,sandwich,...,escape room center,caviar,puzzles,paintball center,michelin,story,new american restaurant,tee,golf course,country club
0,0,La Note,"[brunch, pancakes, toast, patio, fries, mimosas]",1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Grégoire Restaurant,"[sandwich, boxes, sandwich, sandwich, tables]",0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,À Côté,"[list, cocktails, patio, lighting, restaurant]",0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
# Ratings data

"""# print out the shape and first five rows of ratings data.
print('Ratings_df shape:',ratings_df.shape)          
ratings_df.head()

# Dropping the timestamp column
ratings_df.drop('timestamp', axis=1, inplace=True)

# Confirming the drop
ratings_df.head(3)

# Let's confirm the right data types exist per column in ratings data_set
ratings_df.dtypes

# Let's check for missing values

ratings_df.isna().sum()"""

"# print out the shape and first five rows of ratings data.\nprint('Ratings_df shape:',ratings_df.shape)          \nratings_df.head()\n\n# Dropping the timestamp column\nratings_df.drop('timestamp', axis=1, inplace=True)\n\n# Confirming the drop\nratings_df.head(3)\n\n# Let's confirm the right data types exist per column in ratings data_set\nratings_df.dtypes\n\n# Let's check for missing values\n\nratings_df.isna().sum()"

## Content Based recommendation

#### Creating a profile for our user, 'Rocket'

Rocket is a health freak who loves good food and nature

In [180]:
# scale of 0 to 10
#rocket_preferences = [
#            {'title':'Mezzo', 'rating':8},
#            {'title':'sweetgreen', 'rating':9},
#            {'title':'Big C', 'rating':10},
#            {'title':"La Marcha Tapas Bar", 'rating':7},
#            {'title':'Raleigh\'s Pub', 'rating':2},
#            {'title':'The Tap Haus', 'rating':4},
#            {'title':'Grove Park', 'rating':8},
#            {'title':'Berkeley Rose Garden', 'rating':10}
#         ]

rocket_preferences = [
            {'title':'Monroe SF', 'rating':9},
            {'title':'DNA Lounge', 'rating':10},
            {'title':'Big C', 'rating':10},
            {'title':"Lucky Strike San Francisco", 'rating':10},
            {'title':'Raleigh\'s Pub', 'rating':8},
            {'title':'The Tap Haus', 'rating':9},
            {'title':'Redwood Grove Nature Preserve', 'rating':1},
            {'title':'San Francisco Zoo', 'rating':1}
         ]

rocket_preferences = pd.DataFrame(rocket_preferences)
rocket_preferences

Unnamed: 0,title,rating
0,Monroe SF,9
1,DNA Lounge,10
2,Big C,10
3,Lucky Strike San Francisco,10
4,Raleigh's Pub,8
5,The Tap Haus,9
6,Redwood Grove Nature Preserve,1
7,San Francisco Zoo,1


Getting the ID for each of the mentioned activities & remove labels

In [181]:
rocket_preferences_id = bay_df[bay_df['title'].isin(rocket_preferences['title'])]
rocket_preferences_id = pd.merge(rocket_preferences_id, rocket_preferences)

rocket_preferences_id = rocket_preferences_id.drop('labels', axis=1)
rocket_preferences_id

Unnamed: 0,activity_id,title,rating
0,190,Raleigh's Pub,8
1,198,The Tap Haus,9
2,211,Big C,10
3,475,Monroe SF,9
4,480,DNA Lounge,10
5,580,Lucky Strike San Francisco,10
6,626,Redwood Grove Nature Preserve,1
7,696,San Francisco Zoo,1


Matching ratings with the respective content labels data

In [182]:
rocket_categories_df = activities_encoded[activities_encoded.activity_id.isin(rocket_preferences_id.activity_id)]
rocket_categories_df

Unnamed: 0,activity_id,title,labels,brunch,pancakes,toast,patio,fries,mimosas,sandwich,...,escape room center,caviar,puzzles,paintball center,michelin,story,new american restaurant,tee,golf course,country club
190,190,Raleigh's Pub,"[patio, college, game, fries, hour, sandwiches...",0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,198,The Tap Haus,"[pool, games, beer, atmosphere, crowd, college...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,211,Big C,"[hike, sunset, parking, trees, night, bay]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
429,475,Monroe SF,"[party, bartenders, hour, atmosphere, coat, cr...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
434,480,DNA Lounge,"[pizza, shows, night club]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528,580,Lucky Strike San Francisco,"[bartender, sushi, hour]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
563,626,Redwood Grove Nature Preserve,"[creek, parking, trees, picnic, playground, un...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
632,696,San Francisco Zoo,"[exhibits, ocean, train]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [183]:
rocket_categories_df.reset_index(drop=True, inplace=True)
rocket_categories_df.drop(['activity_id','title','labels'], axis=1, inplace=True)
rocket_categories_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rocket_categories_df.drop(['activity_id','title','labels'], axis=1, inplace=True)


Unnamed: 0,brunch,pancakes,toast,patio,fries,mimosas,sandwich,boxes,tables,list,...,escape room center,caviar,puzzles,paintball center,michelin,story,new american restaurant,tee,golf course,country club
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Step 3: Building Lawrence's Profile<br>
To do this, we're going to turn each genre into weights, by multiplying Lawrence's movie ratings by lawrence_genres_df table. And then summing up the resulting table by column. This operation is actually a dot product between a matrix and a vector.
First let's confirm the shapes of the data frames we have recently defined

In [184]:
# let's confirm the shapes of our data frames to guide us as we do matrix multiplication

print('Shape of rocket_preferences_id:',rocket_preferences_id.shape)
print('Shape of rocket_categories_df:',rocket_categories_df.shape)

Shape of rocket_preferences_id: (8, 3)
Shape of rocket_categories_df: (8, 255)


In [185]:
rocket_profile = rocket_categories_df.T.dot(rocket_preferences_id.rating)
#rocket_profile

sorted_rocket = rocket_profile.sort_values(ascending=False)
sorted_rocket

hour                        27.0
night club                  19.0
crowd                       18.0
atmosphere                  18.0
bar                         17.0
                            ... 
cafe                         0.0
course                       0.0
buffet                       0.0
mediterranean restaurant     0.0
country club                 0.0
Length: 255, dtype: float64

Interests: sunset, bay, courts, garden, patio, etc.

#### Recommendation !!!

In [186]:
# let's set the index to the movieId
activities_encoded = activities_encoded.set_index(activities_encoded.activity_id)
activities_encoded.drop(['activity_id','title','labels'], axis=1, inplace=True)

activities_encoded.head()

Unnamed: 0_level_0,brunch,pancakes,toast,patio,fries,mimosas,sandwich,boxes,tables,list,...,escape room center,caviar,puzzles,paintball center,michelin,story,new american restaurant,tee,golf course,country club
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Take weighted average and sort by 'best match' to profile

In [187]:
recommendation_table_df = (activities_encoded.dot(rocket_profile)) / rocket_profile.sum()
recommendation_table_df.head()

activity_id
0    0.048338
1    0.000000
2    0.024169
3    0.000000
4    0.078550
dtype: float64

...and sort!

In [188]:
# Let's sort values from great to small
recommendation_table_df.sort_values(ascending=False, inplace=True)
recommendation_table_df.head(20)

activity_id
475    0.356495
198    0.320242
190    0.280967
495    0.268882
471    0.247734
494    0.241692
493    0.241692
492    0.238671
472    0.223565
478    0.193353
905    0.190332
469    0.190332
388    0.187311
771    0.187311
211    0.187311
515    0.169184
473    0.166163
467    0.166163
491    0.166163
767    0.166163
dtype: float64

In [189]:
copy = bay_df.copy(deep=True)
copy = copy.set_index('activity_id', drop=True)
top_20_index = recommendation_table_df.index[:30].tolist()

recommended_activities = copy.loc[top_20_index, :]
recommended_activities

Unnamed: 0_level_0,title,labels
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1
475,Monroe SF,"[party, bartenders, hour, atmosphere, coat, cr..."
198,The Tap Haus,"[pool, games, beer, atmosphere, crowd, college..."
190,Raleigh's Pub,"[patio, college, game, fries, hour, sandwiches..."
495,International Sports Bar,"[hour, bartender, cash, pool, prices, atmosphe..."
471,The Valencia Room,"[bartender, hour, cash, bar, games, night club]"
494,83 Proof,"[hour, atmosphere, prices, crowd, menu, wall, ..."
493,Tunnel Top Lounge and Bar,"[prices, atmosphere, floor, crowd, hour, stair..."
492,San Francisco Eagle Bar,"[patio, sunday, bartenders, crowd, space, atmo..."
472,Providence Night Club,"[bartender, floor, crowd, pay, space, atmosphe..."
478,The Great Northern,"[sound, security, floor, drinks, party, atmosp..."
