<a href="https://colab.research.google.com/github/stevengregori92/LearnRecSys/blob/main/ML_Engineering_Content_Based_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

import nltk
# Tokenizer models needed by word_tokenize. Recent NLTK releases load them from
# the 'punkt_tab' package rather than the pickled 'punkt' one, so request both;
# a release that does not know a package reports the failure and returns False
# instead of raising, so the extra call is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
# Kept last so the cell still displays this call's return value.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!gdown https://drive.google.com/uc?id=1kF6d9bJnZjehyPeoh6ixMETJnYbjFC3I
!unzip /content/rec.zip

Downloading...
From: https://drive.google.com/uc?id=1kF6d9bJnZjehyPeoh6ixMETJnYbjFC3I
To: /content/rec.zip
100% 18.1M/18.1M [00:00<00:00, 42.7MB/s]
Archive:  /content/rec.zip
  inflating: collaborative_filtering.csv  
  inflating: content_by_multiple.csv  
  inflating: content_by_synopsis.csv  
  inflating: demographic.csv         


In [None]:
from sklearn.metrics.pairwise import cosine_distances

class RecommendSystem:
  """Content-based recommender using bag-of-words vectors and cosine distance.

  Each row of the CSV is one item; the text in `content_col` is vectorized with
  a CountVectorizer, and recommendations are the rows whose vectors are closest
  (smallest cosine distance) to the query row.

  Args:
    df: path (or file-like object) of the CSV file, passed to `pd.read_csv`.
    content_col: name of the column holding the text used for similarity.
  """

  def __init__(self, df, content_col):
    self.df = pd.read_csv(df)
    self.content_col = content_col
    self.encoder = None  # fitted CountVectorizer, set by fit()
    self.bank = None     # document-term matrix, one row per row of self.df

  def fit(self):
    """Vectorize the content column and store the result in `self.bank`.

    Returns:
      self, so that construction and fitting can be chained.
    """
    # A missing value is not a valid document for the vectorizer; treat it as
    # an empty text instead of letting the whole fit fail.
    corpus = self.df[self.content_col].fillna('')
    self.encoder = CountVectorizer(stop_words='english', tokenizer=word_tokenize)
    self.bank = self.encoder.fit_transform(corpus)
    return self

  def recommend(self, idx, topk=10):
    """Return the `topk` rows most similar to the row labelled `idx`.

    Args:
      idx: index label of the query row in `self.df`.
      topk: maximum number of recommendations to return.

    Returns:
      DataFrame of recommended rows, nearest first; the query row itself is
      never included.

    Raises:
      RuntimeError: if `fit()` has not been called yet.
      KeyError: if `idx` is not a label of `self.df`.
    """
    if self.bank is None:
      raise RuntimeError('RecommendSystem.fit() must be called before recommend().')

    pos = self.df.index.get_loc(idx)
    # The query's vector is already stored in the bank; reuse it rather than
    # tokenizing the same text again.
    dist = cosine_distances(self.bank[[pos]], self.bank)[0]
    # Stable sort keeps ties in row order, which makes results reproducible.
    order = dist.argsort(kind='stable')
    # Drop the query row explicitly: with duplicated content it is not
    # guaranteed to be ranked first, so skipping rank 0 could return it.
    order = order[order != pos]
    return self.df.iloc[order[:topk]]

In [None]:
# Recommender driven by the free-text 'overview' column; show the nearest
# neighbours of the row labelled 1.
recsys = RecommendSystem(df='content_by_synopsis.csv', content_col='overview')
recsys.fit()
recsys.recommend(idx=1)



Unnamed: 0,title,overview
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."
40606,Stasis,After a night out of partying and left behind ...
37971,Snowed Under,"Alan Tanner's new play opens in a week, but Ta..."
18715,Wreck-It Ralph,"Wreck-It Ralph is the 9-foot-tall, 643-pound v..."
40431,Liar Game: Reborn,"To exact revenge, the Liar Game office is revi..."
38232,Enter the Battlefield: Life on the Magic - The...,Magic: The Gathering is the most popular colle...
36540,Beta Test,While testing the latest first person shooter ...
14859,Le Pont du Nord,"Marie, is just out from prison when she runs i..."
13105,Break Up,"Jimmy is married to the abusive Frank, but she..."
17918,Dante's Inferno: An Animated Epic,Dante journeys through the nine circles of Hel...


# Content = Multiple Information (Metadata Soup)

In [None]:
# Load the multi-feature dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='content_by_multiple.csv')
df.head(n=5)

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...


In [None]:
# Same recommender, now driven by the combined 'metadata' column; show the
# nearest neighbours of the row labelled 1.
recsys = RecommendSystem(df='content_by_multiple.csv', content_col='metadata')
recsys.fit()
recsys.recommend(idx=1)



Unnamed: 0,title,genres,cast,keywords,director,metadata
41600,The Kingdom of Fairies,adventure fantasy,,,,adventure fantasy
28394,The Rain Fairy,family fantasy,,,,family fantasy
39899,Tainá: An Amazon Adventure,family fantasy adventure,,comedy,kahane_cooperman,family fantasy adventure comedy kahane_cooperman
552,The Pagemaster,fantasy science_fiction family,macaulay_culkin christopher_lloyd patrick_stewart,library adventure part_animated,joe_johnston,fantasy science_fiction family macaulay_culkin...
40803,Princess Goldilocks,adventure family fantasy,charlie_durkin,woman_director,callie_t._wiser,adventure family fantasy charlie_durkin woman_...
14070,Playmobil: The Secret of Pirate Island,action adventure family,lee_tockar caitlin_williams,fantasy adventure cartoon,alexander_e._sokoloff,action adventure family lee_tockar caitlin_wil...
15781,Cirque du Soleil: Varekai,drama family fantasy,,,,drama family fantasy
21579,The Young and Prodigious T.S. Spivet,adventure drama family,,,,adventure drama family
12560,City of Ember,adventure family fantasy,saoirse_ronan harry_treadaway mary_kay_place,underground_world mayor adventure,gil_kenan,adventure family fantasy saoirse_ronan harry_t...
17504,G.I. Joe: The Revenge of Cobra,family fantasy action,,,,family fantasy action


In [None]:
# Query the recommender for the row labelled 579, using the default topk.
recsys.recommend(idx=579)

Unnamed: 0,title,genres,cast,keywords,director,metadata
2808,Home Alone 2: Lost in New York,comedy family adventure,macaulay_culkin joe_pesci catherine_o'hara,holiday new_york new_york_city,chris_columbus,comedy family adventure macaulay_culkin joe_pe...
19021,Nativity!,comedy family,daniel_stern braeden_lemasters stacey_travis,holiday,brian_levant,comedy family daniel_stern braeden_lemasters s...
34843,Father of Four: Never Gives Up!,comedy family,,,,comedy family
369,Ri¢hie Ri¢h,comedy family,macaulay_culkin john_larroquette edward_herrmann,family life_raft private_airplane,donald_petrie,comedy family macaulay_culkin john_larroquette...
39019,"Good Luck Charlie, It's Christmas!",comedy family,,,william_k.l._dickson_,comedy family william_k.l._dickson_
41843,50 Kilos of Sour Cherry,family drama comedy,,,,family drama comedy
25916,Norm MacDonald: Me Doing Standup,comedy,,holiday,timothy_quay,comedy holiday timothy_quay
30648,Oh How It Hurts 66,comedy family,,,bertrand_avril,comedy family bertrand_avril
23872,"Welcome, or No Trespassing",comedy family,,,elem_klimov,comedy family elem_klimov
31241,Get Santa,family comedy,,,karin_steinberger,family comedy karin_steinberger
