<a href="https://colab.research.google.com/github/rehann888/Recommendation-System/blob/main/RecomendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Demographic Filtering: FILTER --> SCORING --> SORT**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_distances

In [None]:
df = pd.read_csv("data_demographic.csv")
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


**1. Filter**

In [None]:
# Settings for the demographic filter used in the cells below.
# Genre indicator columns that must all be set for a movie to be kept.
genre = ["Animation", "Comedy"]
# (min, max) bounds applied to the `runtime` column.
duration = (60, 155)
# (first, last) bounds applied to the `release_year` column.
year = (2015, 2019)
# Number of rows kept in the final ranking.
top = 20

In [None]:
df[genre]

Unnamed: 0,Animation,Comedy
0,1,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
45125,0,0
45126,0,0
45127,0,0
45128,0,0


In [None]:
# Build one boolean mask per criterion, then keep the rows that satisfy all three.
in_year_range = df.release_year.between(year[0], year[1])
in_duration_range = df.runtime.between(duration[0], duration[1])
has_every_genre = df[genre].all(axis=1)
df = df[in_year_range & in_duration_range & has_every_genre]
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
28358,The SpongeBob Movie: Sponge Out of Water,Animation; Adventure; Comedy; Family,93.0,5.7,733.0,2015,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
28702,Home,Fantasy; Comedy; Animation; Science Fiction; F...,94.0,6.8,1539.0,2015,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
29061,Shaun the Sheep Movie,Family; Animation; Comedy; Adventure,85.0,6.9,456.0,2015,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
30588,Minions,Family; Animation; Adventure; Comedy,91.0,6.4,4729.0,2015,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


**2. Scoring (Vote)**

**3. Sort**

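**Use the IMDb weighted rating:** $\text{score} = \dfrac{v \cdot R + m \cdot C}{v + m}$, where $R$ is the movie's `vote_average`, $v$ its `vote_count`, $m$ the minimum number of votes required (here the 70th percentile of `vote_count`), and $C$ the vote-weighted mean rating of the filtered movies.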

In [None]:
def imdb_score(df):
    """Return the well-voted rows of ``df`` with an IMDb weighted ``score`` column.

    The score is ``(R * v + C * m) / (v + m)`` where ``R`` is ``vote_average``,
    ``v`` is ``vote_count``, ``m`` is the 70th percentile of ``vote_count`` and
    ``C`` is the vote-weighted mean of ``vote_average`` over all rows of ``df``.
    Rows with fewer than ``m`` votes are dropped.

    Args:
        df: DataFrame with numeric ``vote_average`` and ``vote_count`` columns.

    Returns:
        A new DataFrame (the input is not modified) holding the kept rows and
        an added ``score`` column. An empty input yields an empty result.
    """
    votes = df.vote_count
    m = votes.quantile(0.7)
    total_votes = votes.sum()
    # Guard the 0/0 case (empty frame or no votes at all) explicitly.
    C = (df.vote_average * votes).sum() / total_votes if total_votes else float("nan")

    # Copy the filtered slice so adding a column never writes into a view.
    kept = df.loc[votes >= m].copy()
    # Column arithmetic instead of a row-wise apply: faster, and it also works
    # when no rows are left.
    kept["score"] = (kept.vote_average * kept.vote_count + C * m) / (kept.vote_count + m)
    return kept



In [None]:
df = imdb_score(df)

In [None]:
# Rank by the IMDb weighted "score" column rather than the raw vote_average,
# so a high average backed by few votes does not outrank well-established titles.
ranked = df.sort_values("score", ascending=False).head(top)
# Show only the descriptive columns, as before.
result = ranked.loc[:, "title":"release_year"]
result

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
41714,The Lego Batman Movie,Action; Animation; Comedy; Family; Fantasy,104.0,7.2,1473.0,2017
28702,Home,Fantasy; Comedy; Animation; Science Fiction; F...,94.0,6.8,1539.0,2015
37969,Finding Dory,Adventure; Animation; Comedy; Family,97.0,6.8,4333.0,2016
41433,Sing,Animation; Comedy; Drama; Family; Music,108.0,6.8,2363.0,2016
33103,Hotel Transylvania 2,Animation; Comedy; Family,89.0,6.7,1528.0,2015
35225,Kung Fu Panda 3,Action; Adventure; Animation; Comedy; Family,95.0,6.7,1630.0,2016
30588,Minions,Family; Animation; Adventure; Comedy,91.0,6.4,4729.0,2015
43708,Despicable Me 3,Action; Animation; Adventure; Family; Comedy,96.0,6.2,2002.0,2017


In [None]:
from sqlalchemy.engine.result import result_tuple
class RecomendationSystem:
    """Demographic recommender: filter movies, score them, return the best ones.

    The movie table is expected to provide ``title`` ... ``release_year``
    columns (including ``runtime``, ``vote_average`` and ``vote_count``) and one
    0/1 indicator column per genre.
    """

    def __init__(self, data):
        """Load the movie table from the CSV file at ``data``."""
        self.df = pd.read_csv(data)

    def recommendation(self, genre=None, year=None, duration=None, top=20):
        """Return the ``top`` movies that match the filters, best score first.

        Args:
            genre: Genre column name, or a list of names that must all be set.
            year: ``(first, last)`` bounds on ``release_year``.
            duration: ``(min, max)`` bounds on ``runtime``.
            top: Maximum number of rows to return.

        Returns:
            DataFrame with the columns ``title`` through ``release_year``,
            ordered by descending IMDb weighted score.
        """
        df = self.demographic_filtering(self.df, genre=genre, duration=duration, year=year)
        df = self.run_imdb_score(df)

        # Rank on the weighted score; sorting on the raw vote_average would
        # throw away the vote-count correction computed just above.
        ranked = df.sort_values("score", ascending=False).head(top)
        return ranked.loc[:, "title":"release_year"]

    @staticmethod
    def demographic_filtering(df, genre=None, year=None, duration=None):
        """Return a copy of ``df`` restricted by genre, release year and runtime.

        Each filter is skipped when its argument is ``None``.
        """
        df = df.copy()

        if genre is not None:
            # Accept a single genre name as well as any sequence of names.
            genre = [genre] if isinstance(genre, str) else list(genre)
            df = df[df[genre].all(axis=1)]

        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]

        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        return df

    @staticmethod
    def run_imdb_score(df):
        """Return the well-voted rows of ``df`` with an added ``score`` column.

        ``score = (R * v + C * m) / (v + m)`` with ``R = vote_average``,
        ``v = vote_count``, ``m`` the 70th percentile of ``vote_count`` and
        ``C`` the vote-weighted mean of ``vote_average``. Rows with fewer than
        ``m`` votes are dropped. The input frame is not modified.
        """
        votes = df.vote_count
        m = votes.quantile(0.7)
        total_votes = votes.sum()
        # Guard the 0/0 case (nothing survived the filters) explicitly.
        C = (df.vote_average * votes).sum() / total_votes if total_votes else float("nan")

        # Copy the slice so the new column is not written into a view.
        kept = df.loc[votes >= m].copy()
        kept["score"] = (kept.vote_average * kept.vote_count + C * m) / (kept.vote_count + m)
        return kept



In [None]:
test_system = RecomendationSystem(data="data_demographic.csv")

In [None]:
test_system.recommendation(genre = ["Action"], duration = (60, 150), year = (2015, 2019))

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
34237,The Boy and the Beast,Fantasy; Action; Adventure; Animation,119.0,7.9,304.0,2015
39596,Train to Busan,Action; Horror; Thriller,118.0,7.7,984.0,2016
41905,Logan,Action; Drama; Science Fiction,137.0,7.6,6310.0,2017
26482,Guardians of the Galaxy Vol. 2,Action; Adventure; Comedy; Science Fiction,137.0,7.6,4858.0,2017
25456,Kingsman: The Secret Service,Crime; Comedy; Action; Adventure,130.0,7.6,6069.0,2015
26471,Star Wars: The Force Awakens,Action; Adventure; Science Fiction; Fantasy,136.0,7.5,7993.0,2015
44363,Dunkirk,Action; Drama; History; Thriller; War,107.0,7.5,2712.0,2017
37306,They Call Me Jeeg Robot,Action; Drama; Comedy; Thriller; Science Fiction,112.0,7.5,575.0,2015
26480,Deadpool,Action; Adventure; Comedy,108.0,7.4,11444.0,2016
41235,Rogue One: A Star Wars Story,Action; Adventure; Science Fiction,133.0,7.4,5111.0,2016


# **Content-Based Filtering**

In [None]:
df = pd.read_csv("data_content.csv")
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


**1. Encode Overview**

In [None]:
# Bag-of-words encoder: text is split with NLTK's word_tokenize and the
# built-in English stop-word list is applied.
bow = CountVectorizer(stop_words = "english", tokenizer = word_tokenize)
# Learn the vocabulary from the overviews and encode each one as a row of token counts.
bank = bow.fit_transform(df.overview)



In [None]:
index = 1

In [None]:
content = df.loc[index, "overview"]
content

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."

In [None]:
numeric = bow.transform([content])

In [None]:
numeric.toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

**2. Search**

In [None]:
# Cosine distance from the query overview to every encoded overview.
# The encoded matrix is `bank` (built by bow.fit_transform above); the
# previous name `encode` is never defined in this notebook.
dist = cosine_distances(numeric, bank)
dist

array([[0.68698928, 0.        , 0.73145692, ..., 0.87596527, 0.71823891,
        0.73267245]])

In [None]:
# Row positions of the 10 smallest distances, skipping the first sorted entry.
# NOTE(review): this assumes the first entry is the query movie itself
# (distance 0); a tie at distance 0 could put another movie there — confirm.
rec = dist.argsort()[0, 1:11]
rec

array([27006, 40606, 37971, 18715, 40431, 38232, 36540, 14859, 13105,
       17918])

**3. Recommend**

In [None]:
df.loc[rec]

Unnamed: 0,title,overview
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."
40606,Stasis,After a night out of partying and left behind ...
37971,Snowed Under,"Alan Tanner's new play opens in a week, but Ta..."
18715,Wreck-It Ralph,"Wreck-It Ralph is the 9-foot-tall, 643-pound v..."
40431,Liar Game: Reborn,"To exact revenge, the Liar Game office is revi..."
38232,Enter the Battlefield: Life on the Magic - The...,Magic: The Gathering is the most popular colle...
36540,Beta Test,While testing the latest first person shooter ...
14859,Le Pont du Nord,"Marie, is just out from prison when she runs i..."
13105,Break Up,"Jimmy is married to the abusive Frank, but she..."
17918,Dante's Inferno: An Animated Epic,Dante journeys through the nine circles of Hel...


In [None]:
class recommendation_system:
    """Content-based recommender using bag-of-words cosine distance.

    Each row of the CSV is encoded from the text in ``content_col``;
    recommendations are the rows whose encodings are closest to the query row.
    """

    def __init__(self, data, content_col):
        """Load the CSV at ``data`` and remember which column holds the text."""
        self.df = pd.read_csv(data)
        self.content_col = content_col
        self.encoder = None  # CountVectorizer, set by fit()
        self.bank = None  # encoded text matrix, one row per item, set by fit()

    def fit(self):
        """Learn the vocabulary and encode every row of the content column."""
        self.encoder = CountVectorizer(stop_words="english", tokenizer=word_tokenize)
        self.bank = self.encoder.fit_transform(self.df[self.content_col])

    def recommend(self, index, top=20):
        """Return the ``top`` rows most similar to the row labelled ``index``.

        Args:
            index: Index label of the query row in the loaded table.
            top: Maximum number of recommendations.

        Returns:
            DataFrame slice of the loaded table, nearest item first, never
            containing the query row itself.

        Raises:
            RuntimeError: If ``fit()`` has not been called yet.
        """
        if self.encoder is None or self.bank is None:
            raise RuntimeError("call fit() before recommend()")

        # Read from the table owned by this instance, not from whatever a
        # module-level `df` happens to hold.
        content = self.df.loc[index, self.content_col]
        numeric = self.encoder.transform([content])
        dist = cosine_distances(numeric, self.bank)

        order = dist.argsort()[0]
        # Drop the query row explicitly: with ties at distance 0 it is not
        # guaranteed to be the first sorted entry.
        # Assumes the table index is unique, so get_loc returns one position.
        position = self.df.index.get_loc(index)
        order = order[order != position][:top]
        # argsort yields row positions, so select positionally.
        return self.df.iloc[order]

In [None]:
test_system = recommendation_system("data_content.csv", content_col = "overview")
test_system.fit()



In [None]:
test_system.recommend(0)

Unnamed: 0,title,overview
14706,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven..."
2945,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy..."
9984,The 40 Year Old Virgin,Andy Stitzer has a pleasant life with a nice a...
36827,Wabash Avenue,Andy Clark discovers he was cheated out of a h...
40606,Stasis,After a night out of partying and left behind ...
13404,The Gang's All Here,"Playboy Andy Mason, on leave from the army, ro..."
22084,The Pied Piper,"Greed, corruption, ignorance, and disease. Mid..."
14078,A Matter of Dignity,"During one of her parents many parties, Chloe ..."
6172,The Courtship of Eddie's Father,The film that started the classic TV series. A...
27006,Superdome,"It's Superbowl. And there's a lot of drama, on..."


**Multiple Information**

In [None]:
df = pd.read_csv("multiple_content.csv")
df.head()

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...


In [None]:
df.metadata[1]

"adventure fantasy family robin_williams jonathan_hyde kirsten_dunst board_game disappearance based_on_children's_book joe_johnston"

In [None]:
test_system = recommendation_system("multiple_content.csv", content_col = "metadata")
test_system.fit()



In [None]:
test_system.recommend(1) #Jumanji

Unnamed: 0,title,genres,cast,keywords,director,metadata
41600,The Kingdom of Fairies,adventure fantasy,,,,adventure fantasy
28394,The Rain Fairy,family fantasy,,,,family fantasy
39899,Tainá: An Amazon Adventure,family fantasy adventure,,comedy,kahane_cooperman,family fantasy adventure comedy kahane_cooperman
552,The Pagemaster,fantasy science_fiction family,macaulay_culkin christopher_lloyd patrick_stewart,library adventure part_animated,joe_johnston,fantasy science_fiction family macaulay_culkin...
40803,Princess Goldilocks,adventure family fantasy,charlie_durkin,woman_director,callie_t._wiser,adventure family fantasy charlie_durkin woman_...
14070,Playmobil: The Secret of Pirate Island,action adventure family,lee_tockar caitlin_williams,fantasy adventure cartoon,alexander_e._sokoloff,action adventure family lee_tockar caitlin_wil...
15781,Cirque du Soleil: Varekai,drama family fantasy,,,,drama family fantasy
21579,The Young and Prodigious T.S. Spivet,adventure drama family,,,,adventure drama family
12560,City of Ember,adventure family fantasy,saoirse_ronan harry_treadaway mary_kay_place,underground_world mayor adventure,gil_kenan,adventure family fantasy saoirse_ronan harry_t...
17504,G.I. Joe: The Revenge of Cobra,family fantasy action,,,,family fantasy action


# **Collaborative Filtering**

In [None]:
df = pd.read_csv("data_collaborative.csv")
df.head()

Unnamed: 0,userId,movie,rating
0,1,One Flew Over the Cuckoo's Nest (1975),5
1,1,James and the Giant Peach (1996),3
2,1,My Fair Lady (1964),3
3,1,Erin Brockovich (2000),4
4,1,"Bug's Life, A (1998)",5


In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
# Wrap the ratings frame for Surprise; load_from_df reads the columns in
# user, item, rating order (here userId, movie, rating).
# NOTE(review): Reader() is used with its defaults — confirm the default
# rating scale matches the range of the `rating` column.
data = Dataset.load_from_df(df, Reader())
# Train on every available rating (no hold-out split).
trainset = data.build_full_trainset()

In [None]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a751bf166e0>

In [None]:
model.predict(1, "My Fair Lady (1964)")

Prediction(uid=1, iid='My Fair Lady (1964)', r_ui=None, est=4.219098210211252, details={'was_impossible': False})

**1. Rating Predict**

In [None]:
user_id = 1

In [None]:
all_movies = df.movie.unique()
all_movies

array(["One Flew Over the Cuckoo's Nest (1975)",
       'James and the Giant Peach (1996)', 'My Fair Lady (1964)', ...,
       'White Boys (1999)', 'One Little Indian (1973)',
       'Five Wives, Three Secretaries and Me (1998)'], dtype=object)

In [None]:
# Movies already rated by the selected user (user_id is set above, not hard-coded).
Rated = df[df.userId == user_id].movie
Rated.head()

0    One Flew Over the Cuckoo's Nest (1975)
1          James and the Giant Peach (1996)
2                       My Fair Lady (1964)
3                    Erin Brockovich (2000)
4                      Bug's Life, A (1998)
Name: movie, dtype: object

In [None]:
# `x in Series` tests the index labels, not the values, so compare against a
# set of the rated titles instead (also O(1) per lookup).
rated_titles = set(Rated)
not_rated = [movie for movie in all_movies if movie not in rated_titles]
not_rated

["One Flew Over the Cuckoo's Nest (1975)",
 'James and the Giant Peach (1996)',
 'My Fair Lady (1964)',
 'Erin Brockovich (2000)',
 "Bug's Life, A (1998)",
 'Princess Bride, The (1987)',
 'Ben-Hur (1959)',
 'Christmas Story, A (1983)',
 'Snow White and the Seven Dwarfs (1937)',
 'Wizard of Oz, The (1939)',
 'Beauty and the Beast (1991)',
 'Gigi (1958)',
 'Miracle on 34th Street (1947)',
 "Ferris Bueller's Day Off (1986)",
 'Sound of Music, The (1965)',
 'Airplane! (1980)',
 'Tarzan (1999)',
 'Bambi (1942)',
 'Awakenings (1990)',
 'Big (1988)',
 'Pleasantville (1998)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Back to the Future (1985)',
 "Schindler's List (1993)",
 'Meet Joe Black (1998)',
 'Pocahontas (1995)',
 'E.T. the Extra-Terrestrial (1982)',
 'Titanic (1997)',
 'Ponette (1996)',
 'Close Shave, A (1995)',
 'Antz (1998)',
 'Girl, Interrupted (1999)',
 'Hercules (1997)',
 'Aladdin (1992)',
 'Mulan (1998)',
 'Hunchback of Notre Dame, The (1996)',
 'Last Days of Di

In [None]:
# Estimated rating (`est`) from the fitted model for each candidate movie.
score = [model.predict(user_id, movie).est for movie in not_rated]
score

[4.396793218901471,
 3.3410345341072345,
 4.219098210211252,
 4.138836489812511,
 4.177100259269077,
 4.283105238435281,
 4.375270395071203,
 4.625019872181361,
 4.3182497103230455,
 4.534108051992254,
 4.376923007172754,
 3.9357061555378103,
 4.231959629594494,
 4.332016404468273,
 4.503277741274786,
 4.134981971238271,
 3.8688911316767065,
 4.33613606096278,
 4.148449116335362,
 4.078229474270723,
 3.439368275802355,
 4.161193351401728,
 4.14269468916417,
 4.621184784698986,
 3.419324187442553,
 4.019656842994513,
 4.309765876203063,
 3.983838223194209,
 4.206469197106225,
 4.457893160343771,
 3.639724722098899,
 3.8588713157393433,
 3.7853902103666908,
 4.18595227254025,
 4.224874705664571,
 3.577275136974036,
 3.7344049945280995,
 4.522037987768739,
 4.413746211339402,
 4.398380799155862,
 4.4838565564435315,
 4.4949067731849865,
 4.322005644215469,
 4.164028399547564,
 4.2033342962806595,
 4.597376347952925,
 4.3524201742984285,
 4.673806075142873,
 4.482761700032714,
 3.749841177

In [None]:
class RecommendationSystem:
    """Collaborative-filtering recommender backed by Surprise's SVD model.

    The ratings CSV must provide ``userId``, ``movie`` and ``rating`` columns.
    """

    def __init__(self, data):
        """Load the ratings CSV at ``data`` and collect the distinct movies."""
        self.df = pd.read_csv(data)
        self.all_movies = self.df.movie.unique()
        self.model = None  # set by fit()

    def fit(self):
        """Train an SVD model on every rating in the table."""
        # Select the columns explicitly: load_from_df reads them positionally
        # as user, item, rating.
        ratings = self.df[["userId", "movie", "rating"]]
        # NOTE(review): Reader() defaults are used — confirm its rating scale
        # matches the data.
        data = Dataset.load_from_df(ratings, Reader())
        trainset = data.build_full_trainset()
        self.model = SVD()
        self.model.fit(trainset)

    def recommend(self, user_id, top=10):
        """Return the ``top`` unrated movies with the highest predicted rating.

        Args:
            user_id: Id of the user to recommend for.
            top: Maximum number of rows to return.

        Returns:
            DataFrame with ``movie`` and ``score`` columns, best score first.

        Raises:
            RuntimeError: If ``fit()`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("call fit() before recommend()")

        # Use this instance's table (not a module-level `df`) and a set of
        # titles: `x in Series` would test the index labels, not the values.
        rated = set(self.df.loc[self.df.userId == user_id, "movie"])
        not_rated = [movie for movie in self.all_movies if movie not in rated]
        score = [self.model.predict(user_id, movie).est for movie in not_rated]

        result = pd.DataFrame({"movie": not_rated, "score": score})
        return result.sort_values("score", ascending=False).head(top)


In [None]:
test_system = RecommendationSystem("data_collaborative.csv")
test_system.fit()

In [None]:
test_system.recommend(user_id = 2)

Unnamed: 0,movie,score
171,Forrest Gump (1994),4.87723
26,E.T. the Extra-Terrestrial (1982),4.771155
167,"Shawshank Redemption, The (1994)",4.757797
23,Schindler's List (1993),4.744289
128,"Silence of the Lambs, The (1991)",4.679535
2617,Sanjuro (1962),4.631534
1651,Cinema Paradiso (1988),4.609178
9,"Wizard of Oz, The (1939)",4.577412
27,Titanic (1997),4.566846
44,Star Wars: Episode IV - A New Hope (1977),4.56649
