In [8]:
import pandas as pd

metadata = pd.read_csv('app/exports/metadata.csv')

In [9]:
metadata = metadata[['movieId', 'title', 'genres', 'original_title', 'production_companies', 'production_countries', 'overview']]
metadata

Unnamed: 0,movieId,title,genres,original_title,production_companies,production_countries,overview
0,1,Toy Story,"Animation, Comedy, Family",Toy Story,Pixar Animation Studios,United States of America,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,"Adventure, Fantasy, Family",Jumanji,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,"Romance, Comedy",Grumpier Old Men,"Warner Bros., Lancaster Gate",United States of America,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,"Comedy, Drama, Romance",Waiting to Exhale,Twentieth Century Fox Film Corporation,United States of America,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II,Comedy,Father of the Bride Part II,"Sandollar Productions, Touchstone Pictures",United States of America,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...
21740,176223,The Man with the Rubber Head,"Comedy, Fantasy, Science Fiction",L'Homme à la tête de caoutchouc,Star Film Company,France,A chemist in his laboratory places upon a tabl...
21741,176229,The Devilish Tenant,"Fantasy, Comedy",Le locataire diabolique,Star Film Company,France,A man rents an apartment and furnishes it in r...
21742,176237,The One-Man Band,"Fantasy, Action, Thriller",L'Homme orchestre,Star Film Company,France,A band-leader has arranged seven chairs for th...
21743,176249,Mom,"Crime, Drama, Thriller",Maa,"Mad Films, Third Eye Pictures",India,The bliss of a biology teacher’s family life i...


In [10]:
metadata.dropna(inplace=True, subset=['title'])
metadata.fillna('', inplace=True)

In [11]:
metadata['tags'] = metadata['title'] + ' ' + metadata['genres'] + ' ' + metadata['production_companies'] + ' ' + metadata['production_countries'] + ' ' + metadata['overview']
metadata['tags'] = metadata['tags'].apply(lambda x: x.lower())
metadata = metadata[['movieId', 'title', 'tags']]
metadata

Unnamed: 0,movieId,title,tags
0,1,Toy Story,"toy story animation, comedy, family pixar anim..."
1,2,Jumanji,"jumanji adventure, fantasy, family tristar pic..."
2,3,Grumpier Old Men,"grumpier old men romance, comedy warner bros.,..."
3,4,Waiting to Exhale,"waiting to exhale comedy, drama, romance twent..."
4,5,Father of the Bride Part II,father of the bride part ii comedy sandollar p...
...,...,...,...
21740,176223,The Man with the Rubber Head,"the man with the rubber head comedy, fantasy, ..."
21741,176229,The Devilish Tenant,"the devilish tenant fantasy, comedy star film ..."
21742,176237,The One-Man Band,"the one-man band fantasy, action, thriller sta..."
21743,176249,Mom,"mom crime, drama, thriller mad films, third ey..."


In [12]:
# cleaning text
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

metadata['tags'] = metadata['tags'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['tags'] = metadata['tags'].apply(lambda x: clean_text(x))


In [13]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',)
tfidf_matrix = tfidf.fit_transform(metadata['tags'])

In [14]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# calculate cosine similarity in batches to fit in memory
batch_size = 1000
num_samples = tfidf_matrix.shape[0]

content_similarity = np.zeros((num_samples, num_samples))

for i in range(0, num_samples, batch_size):
    start = i
    end = min(i + batch_size, num_samples)
    content_similarity[start:end, :] = cosine_similarity(tfidf_matrix[start:end], tfidf_matrix)


In [15]:
content_similarity.shape

(21745, 21745)

In [16]:
# to a pandas dataframe
content_similarity = pd.DataFrame(content_similarity, index=metadata['movieId'], columns=metadata['movieId'])

In [17]:
import pickle

with open('app/exports/content_similarity.pkl', 'wb') as f:
    pickle.dump(content_similarity, f)

In [18]:
content_similarity.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,176207,176211,176213,176217,176219,176223,176229,176237,176249,176267
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.02428,0.008608,0.005484,0.006439,0.003386,0.005713,0.008716,0.002446,0.005959,...,0.0,0.053102,0.004697,0.011063,0.006482,0.007059,0.003923,0.0,0.003772,0.026258
2,0.02428,1.0,0.021898,0.006072,0.008262,0.045007,0.007331,0.021587,0.068476,0.013495,...,0.00257,0.029418,0.005856,0.036922,0.004715,0.014573,0.017878,0.006073,0.009414,0.02079
3,0.008608,0.021898,1.0,0.011258,0.028191,0.024673,0.011728,0.01054,0.002957,0.007206,...,0.0,0.024154,0.0,0.0033,0.007838,0.010477,0.004743,0.0,0.00456,0.009533
4,0.005484,0.006072,0.011258,1.0,0.008532,0.0132,0.012853,0.010451,0.003241,0.007897,...,0.03601,0.018586,0.001745,0.001379,0.00859,0.006816,0.021395,0.007004,0.044876,0.066648
5,0.006439,0.008262,0.028191,0.008532,1.0,0.005269,0.096776,0.007097,0.054225,0.014026,...,0.010478,0.012854,0.0,0.050355,0.015087,0.001945,0.006104,0.0,0.016074,0.006504


In [19]:
"""
movies : [(movieId, rating), (movieId, rating), (movieId, rating)...]
"""
def get_content_similarity(movies):
    movie_ids = [movie[0] for movie in movies]
    ratings = [movie[1] - 2.5 for movie in movies]

    movie_similarities = content_similarity[movie_ids] * ratings
    movie_similarities = movie_similarities.sum(axis=1)

    return movie_similarities

get_content_similarity([(149406,5), (87876, 5)])

movieId
1         0.317118
2         0.145432
3         0.062347
4         0.186961
5         0.087966
            ...   
176223    0.238061
176229    0.104885
176237    0.034016
176249    0.066355
176267    0.225359
Length: 21745, dtype: float64

In [20]:
def get_recommendations(movies):
    movie_similarities = get_content_similarity(movies)
    movie_similarities = sorted(list(zip(movie_similarities.index, movie_similarities)), key=lambda x: x[1], reverse=True)

    return movie_similarities

for movie, val in get_recommendations([(149406,5), (87876, 5)]):
    print(movie, '\t', val, '\t', metadata[metadata['movieId'] == movie]['title'].values[0])

149406 	 2.7187796161121502 	 Kung Fu Panda 3
87876 	 2.718779616112149 	 Cars 2
164929 	 1.2606243739537433 	 Air Mater
87222 	 1.2258612573985261 	 Kung Fu Panda 2
45517 	 1.0932229805797016 	 Cars
109420 	 1.0805343538159309 	 Mater and the Ghostlight
59784 	 1.0752502635306853 	 Kung Fu Panda
67295 	 1.0647917965960887 	 Kung Fu Panda: Secrets of the Furious Five
136556 	 0.9911900494821402 	 Kung Fu Panda: Secrets of the Masters
170957 	 0.9536578483270386 	 Cars 3
136016 	 0.850395782090977 	 The Good Dinosaur
5109 	 0.8257319818584523 	 Return to Never Land
91823 	 0.8156100047259917 	 Kung Fu Panda Holiday
115664 	 0.8002904945923024 	 The Book of Life
48 	 0.7891813732215541 	 Pocahontas
33615 	 0.775242059402497 	 Madagascar
102720 	 0.7667260624821494 	 Epic
6158 	 0.7648374400586406 	 The Jungle Book 2
168418 	 0.7550169430568684 	 The Boss Baby
3745 	 0.74716157144604 	 Titan A.E.
5389 	 0.7449923816184689 	 Spirit: Stallion of the Cimarron
141676 	 0.7233728275164794 	 Jo