# Content based movie recommendation

## Data preprocessing

In [1]:
import pandas as pd
import numpy as np

# Directory that holds the CSV files read below.
DATA_DIR = 'data/the_movies'

credits_df = pd.read_csv(f'{DATA_DIR}/credits.csv')
# The ratings file is currently not loaded:
#ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')
keywords_df = pd.read_csv(f'{DATA_DIR}/keywords.csv')
# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# accompanying DtypeWarning) on large CSV files.
metadata_df = pd.read_csv(f'{DATA_DIR}/movies_metadata.csv', low_memory=False)

  metadata_df = pd.read_csv('data/the_movies/movies_metadata.csv')


In [2]:
from ast import literal_eval

# --- description -------------------------------------------------------------
# Missing tagline/overview values become empty strings so the concatenation
# below never produces NaN.
metadata_df['tagline'] = metadata_df['tagline'].fillna("")
metadata_df['overview'] = metadata_df['overview'].fillna("")
# Merge overview and tagline into one text field for bag-of-words vectorization.
metadata_df['desc'] = metadata_df['overview'] + " " + metadata_df['tagline']

# --- keywords and cast -------------------------------------------------------
keywords_df['id'] = keywords_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')
# Keep only the rows whose id consists of digits. Going through astype(str)
# makes the check work for ids that were parsed as numbers as well as strings.
# .copy() detaches the filtered frame so the assignment below writes to a real
# frame rather than to a view of the old one (no SettingWithCopyWarning).
has_numeric_id = metadata_df['id'].astype(str).str.isdigit()
metadata_df = metadata_df[has_numeric_id].copy()
metadata_df['id'] = metadata_df['id'].astype('int')

# NOTE(review): an id that occurs several times in credits/keywords multiplies
# the matching rows in these merges — presumably worth a drop_duplicates; verify.
df = pd.merge(metadata_df, credits_df, on='id')
df = pd.merge(df, keywords_df, on='id')

# The nested columns are stored as Python-literal strings; parse them into
# Python objects.
for col in ('cast', 'crew', 'keywords'):
    df[col] = df[col].apply(literal_eval)
df['cast_size'] = df['cast'].apply(len)
df['crew_size'] = df['crew'].apply(len)


In [3]:
# Stem the description: replace every word by its Porter stem.
from nltk.stem import PorterStemmer  # explicit name instead of `import *`

stemmer = PorterStemmer()

# NOTE(review): only the first N_STEM_ROWS descriptions are stemmed, which would
# explain the earlier observation that stemming "does not seem to affect
# anything". Use len(df) to stem the whole corpus — TODO confirm the intent.
N_STEM_ROWS = 10

# Iterate over index labels so that reading and writing address the same row.
for row_label in df.index[:N_STEM_ROWS]:
    words = df.at[row_label, 'desc'].split(' ')
    df.at[row_label, 'desc'] = ' '.join(stemmer.stem(word) for word in words)

In [4]:
def get_director(x):
    """Return the name of the first crew record whose job is 'Director'.

    x is an iterable of crew records (mappings with 'job' and 'name' keys).
    np.nan is returned when no record has the job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

# New 'director' column: name of the first crew record with job 'Director',
# NaN when the crew list has none.
df['director'] = df['crew'].apply(get_director)


In [5]:
def _names_of(records):
    """Return the 'name' of every record; anything that is not a list gives []."""
    if not isinstance(records, list):
        return []
    return [record['name'] for record in records]


# Cast: names only, at most the first three entries.
df['cast'] = df['cast'].apply(lambda records: _names_of(records)[:3])
# Keywords: names only, no limit.
df['keywords'] = df['keywords'].apply(_names_of)

In [6]:
# Collapse every cast name into one lower-case word ("Tom Hanks" -> "tomhanks")
# so that first and last name stay together in the castkey text.
df['cast'] = df['cast'].apply(lambda names: [name.replace(" ", "").lower() for name in names])

# Same normalization for the director, wrapped in a list so it can be
# concatenated with the keyword and cast lists. A missing director (NaN) becomes
# an empty list: casting it to str would create a literal "nan" token shared by
# every movie without a director, making those movies look alike.
df['director'] = df['director'].apply(
    lambda name: [name.replace(" ", "").lower()] if isinstance(name, str) else []
)
# Repeating the director three times to give it more weight was considered and rejected.

In [7]:
# Number of rows in the merged frame.
len(df)

46628

In [8]:
# Row-wise list concatenation: keyword tokens + cast tokens + director token.
df['castkey'] =  df['keywords'] + df['cast'] + df['director'] 
# Genre is ignored for now.
# TODO: include genre? It needs preprocessing, and its weight relative to the
# keywords has to be decided.

In [9]:
# Turn every token list into one space-separated string for the vectorizer.
# apply() replaces the per-row df.at loop, which assumed that the index labels
# are exactly 0..len(df)-1. Values that already are strings are left untouched,
# so re-running the cell does not split them into single characters.
df['castkey'] = df['castkey'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

## Keywords-Cast Vectorization

Keep only keywords with occurence > 1, using only stems of each word. Those are calculated using NLTK stemmer implementation based on Porter stemming algorithm. We shal

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

def vectorize(target, method='count', max_features=5000):
    """Turn a collection of text documents into a document-term feature matrix.

    Parameters
    ----------
    target : iterable of str
        One document per element.
    method : {'count', 'tfidf'}
        'tfidf' - TF-IDF weights over word uni- and bigrams with English stop
                  words removed and the vocabulary capped at `max_features`.
        'count' - uni-/bigram counts (case preserved, English stop words
                  removed) where every row is L1-normalized, i.e. the entries
                  of one document sum to 1.
    max_features : int
        Vocabulary size limit. Only used by the 'tfidf' method.

    Returns
    -------
    The matrix produced by the vectorizer's fit_transform (row-normalized for
    'count'), one row per document.

    Raises
    ------
    ValueError
        If `method` is neither 'count' nor 'tfidf'.
    """
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                     stop_words='english', max_features=max_features,
                                     min_df=1)
        return vectorizer.fit_transform(target)

    if method == 'count':
        # min_df=1 keeps every term, exactly like the former integer 0 (each
        # vocabulary term occurs in at least one document), but it is a value
        # that recent scikit-learn releases accept for an integer min_df.
        vectorizer = CountVectorizer(analyzer='word', lowercase=False, ngram_range=(1, 2),
                                     min_df=1, stop_words='english')
        counts = vectorizer.fit_transform(target)
        # axis=1 normalizes per row (document), not per column.
        return normalize(counts, norm='l1', axis=1)

    raise ValueError("method must be 'count' or 'tfidf', got {!r}".format(method))

In [11]:
# Default method ('count'): row-normalized uni-/bigram counts of the castkey text.
feat_castkey = vectorize(df['castkey']) #count matrix

In [12]:
# Pairwise cosine similarity between all rows of the castkey feature matrix.
# The result has one row and one column per movie (see the printed shape), so
# it is large.
sim_castkey = cosine_similarity(feat_castkey, feat_castkey)
print("similarity matrix computed of shape", sim_castkey.shape)

similarity matrix computed of shape (46628, 46628)


## Description Vectorization

In [None]:
# TF-IDF features of the description text and their pairwise cosine similarity.
feat_desc = vectorize(df['desc'], method='tfidf')
sim_desc = cosine_similarity(feat_desc, feat_desc)

## Results

### Description based recommendation


In [None]:
# Maps a movie's original_title to its row position in df. The similarity
# matrices are computed from df in row order, so this position is also the
# movie's row/column in them.
name2id = pd.Series(np.arange(len(df)), index=df['original_title'])

def cosine_recommend(name, cosine_sim, n=10, beta=0):
    '''Print and return the movies most similar to the movie titled `name`.

    Parameters
    ----------
    name : str
        original_title of the query movie; an unknown title raises KeyError.
        If several rows share the title, the first one is used.
    cosine_sim : 2-D array
        Similarity matrix indexed by row position in df.
    n : int
        Number of recommendations. The query movie itself is included in the
        result, so n + 1 titles are returned. n=0 selects the "details" mode,
        which returns the whole df row of the best match.
    beta : float
        Exponent of the rating prior: score = similarity * (vote_average/10)**beta.
        The default 0 ignores the rating. Missing ratings count as 0.

    Returns
    -------
    list of original_title values ordered from most to least similar, or a
    single df row (Series) when n == 0.

    Side effects
    ------------
    Overwrites the column df['score'] with the scores of this query.
    '''
    idx = name2id[name]
    if isinstance(idx, pd.Series):
        # Several rows carry this title: use the first one instead of indexing
        # the similarity matrix with a whole Series.
        idx = idx.iloc[0]
    n += 1  # one extra slot because the query movie is part of the ranking
    print(idx)  # row position of the query movie

    # Ratings scaled by 1/10; NaN -> 0 so that a non-zero beta cannot
    # produce NaN scores.
    rating = np.nan_to_num(df['vote_average'].to_numpy(dtype=float)) / 10

    score = cosine_sim[idx] * rating ** beta  # rating acts as a kind of prior
    sort = np.argsort(score)  # ascending: best matches are at the end

    df['score'] = score

    if n == 1:  # details mode: return the whole row of the best match
        recom_id = sort[-1]
        recom = df.iloc[recom_id]
        print("Recommendation:", recom['original_title'], "| score:", score[recom_id])
    else:
        recom_id = sort[-n:][::-1]  # top n, best first
        recom = []
        for rank, movie_pos in enumerate(recom_id):
            row = df.iloc[movie_pos]
            recom.append(row['original_title'])
            print(rank, "recommendation:", row['original_title'], "| score:", score[movie_pos],
                  "| rating:", row['vote_average'])
    return recom

cosine_recommend("The Lord of the Rings: The Two Towers", sim_desc)

## Results
As seen above, we compared the similarities and retrieved the 10 closest elements (plus the query movie itself) for the given movie, "The Lord of the Rings: The Two Towers". Naturally, the closest movie in the dataset to the input one is the movie itself. Below we can see the description of the input movie and of the best recommendation of our algorithm, i.e. the texts on which the recommendation was based. Just by reading them, we can see the similarities between the two texts. However, judging by the average rating of the recommended movie, this might be a questionable recommendation. We will export the computed similarity matrix for later use and continue with a different set of features.

In [None]:
# Description text of the query movie ...
print(df.loc[df['original_title'] == 'The Lord of the Rings: The Two Towers']['desc'].item())

# ... and of the recommended movie. .item() only succeeds when exactly one row
# matches the title.
df.loc[df['original_title'] == 'Wizards of the Lost Kingdom']['desc'].item()

## Keyword-cast Based Results

In [16]:
# Same query as before, ranked with the keyword/cast/director similarity matrix.
# NOTE(review): the "Merged similarity" cell reassigns sim_castkey and has a
# lower execution count than this cell, so this output may actually come from
# the merged matrix — confirm.
cosine_recommend("The Lord of the Rings: The Two Towers", sim_castkey)

5876
0 recommendation: The Lord of the Rings: The Two Towers | score: 1.0000000000000002 | rating: 8.0
1 recommendation: The Lord of the Rings: The Return of the King | score: 0.5001181893409723 | rating: 8.1
2 recommendation: The Lord of the Rings: The Fellowship of the Ring | score: 0.48980536649995376 | rating: 8.0
3 recommendation: The Hobbit: The Battle of the Five Armies | score: 0.34921514788478925 | rating: 7.1
4 recommendation: The Hobbit: The Desolation of Smaug | score: 0.279751442472094 | rating: 7.6
5 recommendation: The Hobbit: An Unexpected Journey | score: 0.2764294762875762 | rating: 7.0
6 recommendation: Warcraft | score: 0.17466675292187456 | rating: 6.3
7 recommendation: The Hunt for Gollum | score: 0.17099639201419226 | rating: 6.3
8 recommendation: L'homme sans ombre | score: 0.16903085094570325 | rating: 8.5
9 recommendation: 神様のパズル | score: 0.16903085094570325 | rating: 6.2
10 recommendation: Ator l'invincibile 2 | score: 0.16064386578049974 | rating: 1.9


['The Lord of the Rings: The Two Towers',
 'The Lord of the Rings: The Return of the King',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Hobbit: The Battle of the Five Armies',
 'The Hobbit: The Desolation of Smaug',
 'The Hobbit: An Unexpected Journey',
 'Warcraft',
 'The Hunt for Gollum',
 "L'homme sans ombre",
 '神様のパズル',
 "Ator l'invincibile 2"]

In [17]:
# Show the full row(s) of the query movie.
df.loc[df['original_title'] == "The Lord of the Rings: The Two Towers"]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,desc,cast,crew,keywords,cast_size,crew_size,director,castkey
5876,False,"{'id': 119, 'name': 'The Lord of the Rings Col...",79000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.lordoftherings.net/,121,tt0167261,en,The Lord of the Rings: The Two Towers,Frodo and Sam are trekking to Mordor to destro...,...,8.0,7641.0,Frodo and Sam are trekking to Mordor to destro...,"[elijahwood, ianmckellen, viggomortensen]","[{'credit_id': '52fe421ac3a36847f800454f', 'de...","[elves, orcs, middle-earth (tolkien), hobbit, ...",34,36,[peterjackson],elves orcs middle-earth (tolkien) hobbit based...


In [44]:
# Sanity check: a matrix equals its strictly-lower triangle plus its upper
# triangle (diagonal included), so splitting it into the two parts loses nothing.
x = np.array([
    [1, 2, 3],
    [4, 1, 5],
    [6, 7, 1],
])
low = np.tril(x, k=-1)  # entries below the diagonal
up = np.triu(x, k=0)    # diagonal and everything above it
low + up

array([[1, 2, 3],
       [4, 1, 5],
       [6, 7, 1]])

## N-movies based recommendation

For our final recommender system we want to use several input movies (criteria) and find the element of the dataset that is closest to all of them. Given a set of inputs $I = \{i_1, i_2, \dots, i_n\}$, we can formulate this as follows:  
$m^* = \arg\max_{m \in M} \, \left[ s(i_1,m) + s(i_2,m) + \dots + s(i_n,m) \right]$, where $M$ is the set of known movies and $s$ is the similarity of the two given movies.

In [18]:
#TODO: beware of indexing, and id correspondence from the dist matrix

# Row positions (in df and in the similarity matrices) of the input movies.
in_mov = [5876]#[23076, 911] #= interstellar, space odyssey

BETA = 0  # weight of the additive rating term; 0 switches it off

def movie_rating(idx):
    """Return vote_average of the movie at row position idx.

    0 is returned when the rating is missing (NaN) or not positive.
    """
    rat = df.iloc[idx]['vote_average']
    #count = df.iloc[idx]['vote_count'] TODO weighted vote
    if rat > 0:
        return rat
    else:
        return 0

# One weight per similarity matrix, in the same order as `similarity`.
w = [0.7, 0.3]
similarity = [sim_castkey, sim_desc]

# Vectorized form of movie_rating for all movies at once (NaN > 0 is False -> 0).
vote_average = df['vote_average'].to_numpy(dtype=float)
ratings = np.where(vote_average > 0, vote_average, 0.0)

# S[m] = sum over the input movies i and matrices j of w[j] * similarity[j][m, i].
# Whole columns are added at once instead of looping over every movie in Python.
S = np.zeros(sim_castkey.shape[0])
for i in in_mov:
    for weight, sim in zip(w, similarity):
        S += weight * sim[:, i]
# The rating term is added once per candidate; it used to be added once for
# every (input movie, matrix) pair.
S += BETA * ratings
# Input movies must not be recommended back to the user.
S[in_mov] = 0

# Rank the candidates and show the ten best ones, best first.
print("max score:", np.max(S), " for id:", np.argmax(S))
sort = np.argsort(S)
df.iloc[np.flip(sort[-10:])]

max score: 0.4048488061163992  for id: 7069


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,desc,cast,crew,keywords,cast_size,crew_size,director,castkey
7069,False,"{'id': 119, 'name': 'The Lord of the Rings Col...",94000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.lordoftherings.net,122,tt0167260,en,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,...,8.1,8226.0,Aragorn is revealed as the heir to the ancient...,"[elijahwood, ianmckellen, viggomortensen]","[{'credit_id': '52fe421bc3a36847f80046c3', 'de...","[elves, orcs, middle-earth (tolkien), based on...",29,29,[peterjackson],elves orcs middle-earth (tolkien) based on nov...
4904,False,"{'id': 119, 'name': 'The Lord of the Rings Col...",93000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.lordoftherings.net/,120,tt0120737,en,The Lord of the Rings: The Fellowship of the Ring,"Young hobbit Frodo Baggins, after inheriting a...",...,8.0,8892.0,"Young hobbit Frodo Baggins, after inheriting a...","[elijahwood, ianmckellen, cateblanchett]","[{'credit_id': '52fe421ac3a36847f80043ef', 'de...","[elves, dwarves, orcs, middle-earth (tolkien),...",26,30,[peterjackson],elves dwarves orcs middle-earth (tolkien) hobb...
25591,False,"{'id': 121938, 'name': 'The Hobbit Collection'...",250000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.thehobbit.com/,122917,tt2310332,en,The Hobbit: The Battle of the Five Armies,Immediately after the events of The Desolation...,...,7.1,4884.0,Immediately after the events of The Desolation...,"[martinfreeman, ianmckellen, richardarmitage]","[{'credit_id': '548ad49a9251414fa20011ab', 'de...","[corruption, elves, dwarves, orcs, middle-eart...",42,127,[peterjackson],corruption elves dwarves orcs middle-earth (to...
22257,False,"{'id': 121938, 'name': 'The Hobbit Collection'...",250000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.thehobbit.com/,57158,tt1170358,en,The Hobbit: The Desolation of Smaug,"The Dwarves, Bilbo and Gandalf have successful...",...,7.6,4633.0,"The Dwarves, Bilbo and Gandalf have successful...","[martinfreeman, ianmckellen, richardarmitage]","[{'credit_id': '5350e7b0c3a3681d93000e5d', 'de...","[elves, dwarves, orcs, hobbit, dragon, wizard,...",32,108,[peterjackson],elves dwarves orcs hobbit dragon wizard sword ...
20174,False,"{'id': 121938, 'name': 'The Hobbit Collection'...",250000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.thehobbit.com/,49051,tt0903624,en,The Hobbit: An Unexpected Journey,"Bilbo Baggins, a hobbit enjoying his quiet lif...",...,7.0,8427.0,"Bilbo Baggins, a hobbit enjoying his quiet lif...","[ianmckellen, martinfreeman, richardarmitage]","[{'credit_id': '52fe4783c3a36847f8139f7f', 'de...","[riddle, elves, dwarves, orcs, middle-earth (t...",30,50,[peterjackson],riddle elves dwarves orcs middle-earth (tolkie...
16535,False,"{'id': 141290, 'name': 'The Lord of the Rings ...",0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",,1361,tt0079802,en,The Return of the King,Two Hobbits struggle to destroy the Ring in Mo...,...,5.1,9.0,Two Hobbits struggle to destroy the Ring in Mo...,"[orsonbean, theodorebikel, williamconrad]","[{'credit_id': '52fe42eec3a36847f802dc9f', 'de...","[elves, orcs, hobbit, shire, tower]",10,8,[julesbass],elves orcs hobbit shire tower orsonbean theodo...
2027,False,"{'id': 141290, 'name': 'The Lord of the Rings ...",4000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,123,tt0077869,en,The Lord of the Rings,The Fellowship of the Ring embark on a journey...,...,6.1,187.0,The Fellowship of the Ring embark on a journey...,"[christopherguard, williamsquire, michaelscholes]","[{'credit_id': '52fe421bc3a36847f8004833', 'de...","[elves, dwarves, hobbit, mission]",18,11,[ralphbakshi],elves dwarves hobbit mission christopherguard ...
14315,False,,5000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.thehuntforgollum.com/,17632,tt1323925,en,The Hunt for Gollum,A British fan film based on the appendices of ...,...,6.3,30.0,A British fan film based on the appendices of ...,"[adrianwebster, patricko'connor, arinalldridge]","[{'credit_id': '52fe47399251416c75091f47', 'de...","[middle-earth (tolkien), the lord of the rings]",27,24,[chrisbouchard],middle-earth (tolkien) the lord of the rings a...
44513,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,7234,tt0090333,en,Wizards of the Lost Kingdom,"Simon, son of the king, must flee when the emp...",...,2.3,8.0,"Simon, son of the king, must flee when the emp...","[bosvenson, vidalpeterson, thomchristopher]","[{'credit_id': '52fe4475c3a36847f809731d', 'de...","[sword, magic, prince, kingdom, shurka, magica...",14,7,[héctorolivera],sword magic prince kingdom shurka magical obje...
46177,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 37, 'nam...",http://thedarktower-movie.com,353491,tt1648190,en,The Dark Tower,"The last Gunslinger, Roland Deschain, has been...",...,5.7,688.0,"The last Gunslinger, Roland Deschain, has been...","[idriselba, matthewmcconaughey, tomtaylor]","[{'credit_id': '5912cf71c3a36864d40533b7', 'de...","[gunslinger, based on novel]",50,199,[nikolajarcel],gunslinger based on novel idriselba matthewmcc...


Since we have two matrices of dimensions $46000 \times 46000$, the size of each is $46000^2 \cdot 8 \approx 16.9$ GB (8 bytes per float64). Because $s_{i,j} = s_{j,i}$, the necessary part is only the lower (or upper) triangular matrix containing $\frac{n(n-1)}{2}$ elements. Omitting the redundant information, we can approximately halve the memory requirements, making the distance matrix comfortably loadable into a PC with 16 GB of RAM. In this fashion we will compose the similarity matrix from the two triangular matrices based on our two current distance matrices.

For our final model we will also add the similarities before saving the matrices, which results in only one matrix, although with a fixed weighting ratio.

### Merged similarity

In [13]:
# Fixed weights of the merged similarity: W1 for the keyword/cast matrix,
# W2 for the description matrix.
W1 = 0.7
W2 = 0.3

feat_desc = vectorize(df['desc'], method='tfidf')
# NOTE(review): this overwrites sim_castkey with the weighted sum, so every later
# use of sim_castkey sees the merged matrix, and re-running this cell applies
# the weights a second time.
sim_castkey = W1 * sim_castkey + W2 * cosine_similarity(feat_desc, feat_desc)

In [15]:
# Save the lower triangle (diagonal included) of sim_castkey, which holds the
# merged matrix once the cell above has run.
# NOTE(review): np.tril keeps the full n x n shape and only zeroes the upper
# triangle, so the saved array is presumably as large as the full matrix — verify.
# NOTE(review): absolute, user-specific output path.
np.save("/home/tomas/Downloads/similarity_lower", np.tril(sim_castkey))

## Other

In [None]:
# export matrix to numpy binary TODO: sql!!!!
# np.save("/home/tomas/Downloads/sim_castkey", sim_castkey)

### Storing df as a SQLite DB


In [39]:
import sqlite3
from sqlalchemy import create_engine

# 'crew' and 'keywords' are dropped because they cause problems when storing.
# `columns=` replaces the positional axis argument, which newer pandas versions
# no longer accept.
df2store = df.drop(columns=['crew', 'keywords'])
# Store every value as its string representation (element-wise str()).
df2store = df2store.apply(lambda column: column.map(str))

con = sqlite3.connect('movies.db')
try:
    # if_exists='replace' lets the cell be re-run: an existing 'movies' table
    # is overwritten instead of raising an error.
    df2store.to_sql('movies', con=con, if_exists='replace')
finally:
    # Close the connection even when the export fails.
    con.close()

  df2store = df.drop(['crew', 'keywords'], 1) #remove, makes problems for storing


In [38]:
# Inspect a stored column by name; the positional index 26 silently pointed to
# a different column whenever the column layout changed.
df2store['cast_size']

0        13
1        26
2         7
3        10
4        12
         ..
46623     3
46624    11
46625    15
46626     5
46627     0
Name: cast_size, Length: 46628, dtype: object

In [31]:
# List the columns of the working frame.
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'desc', 'cast', 'crew', 'keywords',
       'cast_size', 'crew_size', 'director', 'castkey', 'score'],
      dtype='object')

In [32]:
# Inspect the normalized cast lists.
df['cast']

0                        [tomhanks, timallen, donrickles]
1             [robinwilliams, jonathanhyde, kirstendunst]
2                [waltermatthau, jacklemmon, ann-margret]
3          [whitneyhouston, angelabassett, lorettadevine]
4                 [stevemartin, dianekeaton, martinshort]
                               ...                       
46623            [leilahatami, kouroshtahami, elhamkorda]
46624             [angelaquino, perrydizon, hazelorencio]
46625            [erikaeleniak, adambaldwin, juliedupage]
46626    [iwanmosschuchin, nathalielissenko, pavelpavlov]
46627                                                  []
Name: cast, Length: 46628, dtype: object

In [49]:
# Smoke test: reopen the database and list the columns of the stored table.
con = sqlite3.connect('movies.db')
try:
    cursor = con.execute("SELECT * FROM movies")
    # cursor.description has one entry per result column; item 0 is its name.
    names = [column[0] for column in cursor.description]
finally:
    # Close the connection even when the query fails.
    con.close()
print(names)

['index', 'adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'desc', 'cast', 'cast_size', 'crew_size', 'director', 'castkey', 'score']


In [46]:
# Close whatever connection `con` currently refers to — presumably manual
# cleanup after an interrupted run of one of the SQLite cells.
con.close()