# Content-Based Recommender

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from nltk.stem import SnowballStemmer

## Description Based Recommender
1. Computes similarity between movies using taglines and overview
2. Find the most similar movies to a particular movie
3. Suggest found movies to the user who liked the particular movie

### Load Data

In [2]:
md = pd.read_csv("../data/movies/movies_metadata.csv")
links = pd.read_csv("../data/movies/links_small.csv")
links = links[links["tmdbId"].notnull()]["tmdbId"].astype("int")

# Drop rows with bad id value, such as `1997-08-20`
md = md.drop([19730, 29503, 35587])

# Choose only useful columns
md = md[["id", "title", "tagline", "overview"]]

# Filter data excluding from the defined set
md["id"] = md["id"].astype("int")
md = md[md["id"].isin(links)]
md.reset_index(inplace=True, drop=True)
print(f"Total {md.shape[0]} data")

Total 9099 data


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
md.head()

Unnamed: 0,id,title,tagline,overview
0,862,Toy Story,,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,Still Yelling. Still Fighting. Still Ready for...,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,Friends are the people who let you be yourself...,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just When His World Is Back To Normal... He's ...,Just when George Banks has recovered from his ...


### Prepare Features

In [4]:
md["tagline"] = md["tagline"].fillna("")
md["overview"] = md["overview"].fillna("")

md["desc"] = md["tagline"] + md["overview"]

In [5]:
md["desc"].head()

0    Led by Woody, Andy's toys live happily in his ...
1    Roll the dice and unleash the excitement!When ...
2    Still Yelling. Still Fighting. Still Ready for...
3    Friends are the people who let you be yourself...
4    Just When His World Is Back To Normal... He's ...
Name: desc, dtype: object

### Transform Words in Tagline and Overview to TF-IDF Vectors

In [6]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
tfidf = vectorizer.fit_transform(md["desc"])

### Compute Cosine Similarities

In [7]:
dists = cosine_similarity(tfidf, tfidf)

### Make Recommendations

In [8]:
def getTopkRecommendations(title, metadataDf, similarities, topk=5):
    idx = metadataDf.index[metadataDf["title"] == title].tolist()
    if len(idx) == 0:
        raise ValueError("Title not found!")
    # Choose 1st item and its similarity arr
    idx = idx[0]
    sim = similarities[idx]
    # Set similarity of the given title to the minimum
    sim[idx] = sim.min()
    # Desc sort 
    indices = np.argpartition(-sim, 1+topk)[1:1+topk]
    return metadataDf.iloc[indices, :]["title"].tolist()

In [9]:
topkTitles = getTopkRecommendations("Interstellar", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("Inception", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Shawshank Redemption", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Truman Show", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Godfather", md, dists)
print(topkTitles)

['Gattaca', 'All Good Things', 'Stargate', 'Final Destination 2', 'Space Cowboys']
['Crumb', 'Lone Star', 'Renaissance', 'House', 'The Pink Panther']
['Les Misérables', 'The Blue Dahlia', 'Chasers', 'The Woodsman', 'Breakout']
['Delirious', 'Margin Call', 'Top Five', 'Nurse Betty', 'Anchorman 2: The Legend Continues']
['Made', 'The Godfather: Part II', 'The Family', 'Fury', 'Shanghai Triad']


In [10]:
np.save("contentSim/descSim", dists)
md.to_csv("contentSim/descFeatures.csv", index=False)

## Metadata Based Recommender
1. Computes similarity between movies using metadata including genres, crew, cast and keywords
2. Find the most similar movies to a particular movie
3. Suggest found movies to the user who liked the particular movie

### Load Data and Merge Three Dataframes by IDs

In [11]:
md = pd.read_csv("../data/movies/movies_metadata.csv")
links = pd.read_csv("../data/movies/links_small.csv")
links = links[links["tmdbId"].notnull()]["tmdbId"].astype("int")
credits = pd.read_csv("../data/movies/credits.csv")
keywords = pd.read_csv("../data/movies/keywords.csv")

# Drop rows with bad id value, such as `1997-08-20`
md = md.drop([19730, 29503, 35587])

# Choose only useful columns
md = md[["id", "genres", "title"]]

# Filter data excluding from the defined set
md["id"] = md["id"].astype("int")
md = md[md["id"].isin(links)]
md.reset_index(inplace=True, drop=True)
print(f"Total {md.shape[0]} data")

md["id"] = md["id"].astype("int")
keywords["id"] = keywords["id"].astype("int")
credits["id"] = credits["id"].astype("int")

md = md.merge(credits, on="id")
md = md.merge(keywords, on="id")

  interactivity=interactivity, compiler=compiler, result=result)


Total 9099 data


In [12]:
md.head()

Unnamed: 0,id,genres,title,cast,crew,keywords
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Waiting to Exhale,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### Transform Values Back to the Original Data Type

In [13]:
md["genres"] = md["genres"].apply(literal_eval)
md["cast"] = md["cast"].apply(literal_eval)
md["crew"] = md["crew"].apply(literal_eval)
md["keywords"] = md["keywords"].apply(literal_eval)

### Prepro Genres

In [14]:
def getGenres(genres):
    if len(genres) == 0:
        return []     
    genres = [str(genre["name"]) for genre in genres]
    return [re.sub(r"[^\w]", "", genre.lower()) for genre in genres]

md["genres"] = md["genres"].apply(getGenres)

### Prepro Crew

In [15]:
def getDirector(crew):
    if len(crew) == 0:
        return []
    return [str(person["name"]) for person in crew if person["job"] == "Director"]

md["director"] = md["crew"].apply(getDirector)
md = md.drop("crew", 1)

# Cast strings to lowercase and keep only letters
md["director"] = md["director"].apply(lambda x: [re.sub(r"[^\w]", "", name.lower()) for name in x])

### Prepro Cast

In [16]:
def getTopkCast(cast, topk=3):
    if len(cast) == 0:
        return []
    cast = cast[:topk]
    return [str(person["name"]) for person in cast]
  
md["cast"] = md["cast"].apply(getTopkCast)

# Cast strings to lowercase and keep only letters
md["cast"] = md["cast"].apply(lambda x: [re.sub(r"[^\w]", "", name.lower()) for name in x])

### Prepro Keywords

In [17]:
def getStemmedKeywords(keywords):
    if len(keywords) == 0:
        return []
    return [stemmer.stem(keyword["name"]) for keyword in keywords]

stemmer = SnowballStemmer("english")
md["keywords"] = md["keywords"].apply(getStemmedKeywords)

In [18]:
keywordCount = md["keywords"].explode().value_counts()
keywordCount

independent film      610
woman director        550
murder                414
duringcreditssting    327
based on novel        318
                     ... 
rhyme battl             1
floyd                   1
night creatur           1
jason voorhe            1
cyclist                 1
Name: keywords, Length: 12430, dtype: int64

In [19]:
useKeywords = keywordCount[keywordCount > 10].index.tolist()

In [20]:
def processKeywords(keywords):
    if len(keywords) == 0:
        return []    
    return [re.sub(r"[^\w]", "", keyword.lower()) for keyword in keywords if keyword in useKeywords]
    
md["keywords"] = md["keywords"].apply(processKeywords)

### Show the Final Dataframe

In [21]:
md.head()

Unnamed: 0,id,genres,title,cast,keywords,director
0,862,"[animation, comedy, family]",Toy Story,"[tomhanks, timallen, donrickles]","[jealousi, toy, boy, friendship, friend, rival...",[johnlasseter]
1,8844,"[adventure, fantasy, family]",Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]","[disappear, basedonchildrensbook]",[joejohnston]
2,15602,"[romance, comedy]",Grumpier Old Men,"[waltermatthau, jacklemmon, annmargret]","[fish, bestfriend, duringcreditssting]",[howarddeutch]
3,31357,"[comedy, drama, romance]",Waiting to Exhale,"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...",[forestwhitaker]
4,11862,[comedy],Father of the Bride Part II,"[stevemartin, dianekeaton, martinshort]","[babi, midlifecrisi, confid, age, daughter, mo...",[charlesshyer]


### Gather All Features

In [22]:
md["desc"] = md["genres"] + md["cast"] + md["keywords"] + md["director"]
md["desc"] = md["desc"].apply(lambda x: " ".join(x))

### Transform Genres, Cast, Directors, and Keywords to Count Vectors

In [23]:
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words="english")
counter = vectorizer.fit_transform(md["desc"])

### Compute Cosine Similarities

In [24]:
dists = cosine_similarity(counter, counter)

### Make Recommendations

In [25]:
topkTitles = getTopkRecommendations("Interstellar", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("Inception", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Shawshank Redemption", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Truman Show", md, dists)
print(topkTitles)

topkTitles = getTopkRecommendations("The Godfather", md, dists)
print(topkTitles)

['The Martian', 'Apollo 13', 'Midnight Special', 'Planet of the Apes', 'Approaching the Unknown']
['Sky Captain and the World of Tomorrow', 'The Core', 'I Am Number Four', 'Jurassic Park III', 'Green Lantern']
['Murder in the First', 'Mitchell', 'The Bad Lieutenant: Port of Call - New Orleans', 'Ariel', 'Le Cercle Rouge']
['Swimming to Cambodia', 'Hyde Park on Hudson', 'Beautiful Girls', 'The Squid and the Whale', 'Green Card']
['The Godfather: Part II', 'The Gambler', 'The Rainmaker', 'Mitchell', 'The Cotton Club']


In [26]:
np.save("contentSim/metaSim", dists)
md.to_csv("contentSim/metaFeatures.csv", index=False)