[View in Colaboratory](https://colab.research.google.com/github/ruxandraburtica/recommender-systems/blob/master/3_content_based.ipynb)


# 3. Content-based recommender systems




## Imports

In [0]:
from io import BytesIO
import os
from ast import literal_eval

import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import seaborn as sns

%matplotlib inline

## Read the data

In [0]:
def download_file(url, file_name):
    """Download `url` and store the response body in `file_name`.

    An existing `file_name` is overwritten, but only once the download has
    succeeded, so a failed request no longer destroys a previously
    downloaded copy.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the content is written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check the error page would be saved to disk and
            later fail to parse as CSV.
    """
    response = requests.get(url)
    response.raise_for_status()
    # Mode "wb" truncates an existing file, so no explicit removal is needed.
    with open(file_name, "wb") as fd:
        fd.write(response.content)


# Fetch every CSV used in this notebook from the same public location.
BASE_URL = 'https://s3-eu-west-1.amazonaws.com/machine-learning-public/workshop/'
for csv_name in ('movies_metadata.csv', 'keywords.csv', 'credits.csv', 'links_small.csv'):
    download_file(BASE_URL + csv_name, csv_name)

In [0]:
# Load the four downloaded CSV files, in this order:
#   movies      - movie metadata
#   keywords    - tags for movies
#   credits     - details about cast and crew of the movie
#   links_small - ids for a smaller dataset of movies
movies, keywords, credits, links_small = [
    pd.read_csv(csv_path)
    for csv_path in ('movies_metadata.csv', 'keywords.csv', 'credits.csv', 'links_small.csv')
]

In [0]:
# Peek at the first five rows of the raw movie metadata.
movies.head(n=5)

In [0]:
# 'genres' is stored as the string form of a list of dicts; keep only the
# genre names. Missing values become an empty list.
movies['genres'] = (
    movies['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# Release year as a string, NaN when the date is missing or unparseable.
# The previous test `x != np.nan` is always True (NaN never compares equal to
# anything), so unparseable dates ended up as the literal string 'NaT'.
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notna(x) else np.nan
)

Convert the `id` columns to integers and remove three movies by row label.

In [0]:
# Drop rows 19730, 29503 and 35587 *before* casting the ids. The drop used to
# run after the cast, so it could not protect astype('int') from those rows.
# NOTE(review): presumably these three rows hold non-numeric 'id' values in
# movies_metadata.csv - confirm against the data.
movies = movies.drop([19730, 29503, 35587])

# Convert id from all datasets to integer so the frames can be matched on it.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies['id'] = movies['id'].astype('int')

In [0]:
# Reduce links_small to the integer TMDB ids of the small dataset
# (rows without a tmdbId are skipped).
links_small = links_small['tmdbId'].dropna().astype('int')

# .copy() makes small_movies an independent frame: later cells add columns to
# it, which on a plain boolean-mask slice of `movies` triggers
# SettingWithCopyWarning and may write to a temporary object.
small_movies = movies[movies['id'].isin(links_small)].copy()
small_movies.shape

In [0]:
# Build one free-text field per movie from its overview and tagline.
# Both parts are filled before concatenating: NaN + str is NaN, so a movie
# with a tagline but no overview used to end up with an empty description.
# The space keeps the last word of the overview from fusing with the first
# word of the tagline; strip() removes it again when either part is empty.
small_movies['tagline'] = small_movies['tagline'].fillna('')
small_movies['description'] = (
    small_movies['overview'].fillna('') + ' ' + small_movies['tagline']
).str.strip()

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF over word unigrams and bigrams of the descriptions, English stop
# words removed.
# min_df=1 keeps every term, exactly like the former min_df=0, but an integer
# 0 is rejected by the parameter validation of recent scikit-learn releases
# (an int must be >= 1, a float must lie in [0.0, 1.0]).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(small_movies['description'])

In [0]:
# (rows, columns) of the TF-IDF matrix fitted on the movie descriptions.
tfidf_matrix.shape

In [0]:
# Attach the credits and keywords columns to the movie metadata, matching
# rows on 'id' (pandas' default inner join).
movies = movies.merge(credits, on='id').merge(keywords, on='id')

In [0]:
# Re-select the small dataset from the merged frame so it carries the cast,
# crew and keywords columns.
# .copy() makes it an independent frame: the following cells assign columns to
# it, which on a plain slice of `movies` triggers SettingWithCopyWarning.
small_movies = movies[movies['id'].isin(links_small)].copy()
small_movies.shape

In [0]:
# cast, crew and keywords are stored as string representations of Python
# literals; turn them back into Python objects.
for literal_column in ('cast', 'crew', 'keywords'):
    small_movies[literal_column] = small_movies[literal_column].apply(literal_eval)

# Number of entries in each movie's cast and crew.
small_movies['cast_size'] = small_movies['cast'].apply(len)
small_movies['crew_size'] = small_movies['crew'].apply(len)

In [0]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: Iterable of crew entries, each indexable by 'job' and 'name'.

    Returns:
        The 'name' of the first matching entry, or np.nan when no entry has
        the job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [0]:
# Name of each movie's director (NaN when the crew lists none).
small_movies['director'] = small_movies['crew'].map(get_director)

In [0]:
# Keep only the names of the first three cast entries of every movie
# (slicing a shorter list simply returns all of it).
small_movies['cast'] = small_movies['cast'].apply(
    lambda members: [member['name'] for member in members][:3] if isinstance(members, list) else []
)

In [0]:
# Reduce each keyword entry to its name; anything that is not a list becomes
# an empty list.
small_movies['keywords'] = small_movies['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [0]:
# Collapse every cast name into a single lowercase token without spaces.
small_movies['cast'] = small_movies['cast'].apply(
    lambda names: [name.replace(" ", "").lower() for name in names]
)

In [0]:
# Turn the director name into a single lowercase token without spaces and
# repeat it three times so it counts three times in the soup.
# Movies without a director get an empty list. Casting the column to str
# first turned the missing value into the token 'nan', which made all
# director-less movies look as if they shared a director.
small_movies['director'] = small_movies['director'].apply(
    lambda name: [name.replace(" ", "").lower()] * 3 if isinstance(name, str) else []
)

In [0]:
# Peek at the first five rows after the cast / director clean-up.
small_movies.head(n=5)

In [0]:
# Count how often each keyword occurs across all movies.
# The keyword lists are flattened directly. The previous version built one
# pd.Series per row, then a wide mostly-NaN frame, then stacked it, which is
# far slower and gives the same counts.
s = pd.Series(
    [keyword for keyword_list in small_movies['keywords'] for keyword in keyword_list],
    name='keyword',
    dtype=object,
).value_counts()
s[:5]

In [0]:
# Keep only the keywords that occur more than once.
s = s.loc[s > 1]

In [0]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, applied to the keywords in a later cell.
stemmer = SnowballStemmer('english')

In [0]:
def filter_keywords(x):
    """Return the items of `x` that are present in the global `s`, in order.

    Membership is tested with `in s`; for a pandas Series that checks the
    index labels.

    Args:
        x: Iterable of keywords.

    Returns:
        A new list with the keywords found in `s`.
    """
    return [keyword for keyword in x if keyword in s]

In [0]:
# Keyword clean-up in one pass: keep the keywords accepted by
# filter_keywords, stem each one, then strip spaces and lowercase it.
small_movies['keywords'] = small_movies['keywords'].apply(
    lambda kept: [stemmer.stem(keyword).replace(" ", "").lower() for keyword in filter_keywords(kept)]
)

In [0]:
# Concatenate the token lists of every movie (keywords, cast, director,
# genres) and join them into one space-separated string.
soup_tokens = (
    small_movies['keywords']
    + small_movies['cast']
    + small_movies['director']
    + small_movies['genres']
)
small_movies['soup'] = soup_tokens.map(' '.join)

In [0]:
# Preview the frame with the new 'soup' column. The name `smd` used here
# before is never defined in this notebook and raised NameError.
small_movies.head()

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Raw counts of word unigrams and bigrams in the soup, English stop words
# removed.
# min_df=1 keeps every term, exactly like the former min_df=0, but an integer
# 0 is rejected by the parameter validation of recent scikit-learn releases
# (an int must be >= 1, a float must lie in [0.0, 1.0]).
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(small_movies['soup'])

In [0]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Cosine similarity between every pair of rows of the count matrix; with a
# single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [0]:
# Renumber the rows 0..n-1 (the old index is kept as a column).
small_movies = small_movies.reset_index()

# titles: row position -> title; indices: title -> row position.
titles = small_movies['title']
indices = pd.Series(small_movies.index, index=titles)

## Get recommendations


In [0]:
# Show the first ten entries returned for 'The Dark Knight'.
# NOTE(review): get_recommendations is not defined anywhere in the visible
# part of this notebook, so on a fresh kernel this call raises NameError
# unless the function is defined elsewhere - confirm.
get_recommendations('The Dark Knight').head(10)