In [11]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.compose import ColumnTransformer

In [12]:
# Load the cleaned IMDB dataset: the first CSV column becomes the index, "," is
# read as the decimal separator, and RELEASE_YEAR is kept as an object column.
df = pd.read_csv(
    "../Data/imdb_cleaned_data.csv",
    index_col=0,
    decimal=",",
    dtype={"RELEASE_YEAR": "O"},
)

In [13]:
# Preview the first rows to check how the file was parsed.
df.head()

Unnamed: 0,ORIGINAL_TITLE,CAST,DIRECTOR,OVERVIEW,GENRES,PRODUCTION_COMPANIES,RUNTIME,RELEASE_DATE,RELEASE_YEAR,RELEASE_MONTH,RELEASE_DAY,BUDGET,REVENUE,PROFIT,VOTE_COUNT,VOTE_AVERAGE,POPULARITY
0,Jurassic World,"Chris Pratt, Bryce Dallas Howard, Irrfan Khan,...",Colin Trevorrow,Twenty-two years after the events of Jurassic ...,"Action, Adventure, Science Fiction",Universal Studios,124.0,2015-06-09,2015.0,June,Tuesday,137999939.3,1392446000.0,1254446000.0,5562.0,6.5,32.985763
1,Mad Max: Fury Road,"Tom Hardy, Charlize Theron, Hugh Keays-Byrne, ...",George Miller,An apocalyptic story set in the furthest reach...,"Action, Adventure, Science Fiction",Village Roadshow Pictures,120.0,2015-05-13,2015.0,May,Wednesday,137999939.3,348161300.0,210161400.0,6185.0,7.1,28.419936
2,Insurgent,"Shailene Woodley, Theo James, Kate Winslet, An...",Robert Schwentke,Beatrice Prior must confront her inner demons ...,"Adventure, Science Fiction, Thriller",Summit Entertainment,119.0,2015-03-18,2015.0,March,Wednesday,101199955.5,271619000.0,170419100.0,2480.0,6.3,13.112507
3,Star Wars: The Force Awakens,"Harrison Ford, Mark Hamill, Carrie Fisher, Ada...",J.J. Abrams,Thirty years after defeating the Galactic Empi...,"Action, Adventure, Science Fiction",Lucasfilm,136.0,2015-12-15,2015.0,December,Tuesday,183999919.0,1902723000.0,1718723000.0,5292.0,7.5,11.173104
4,Furious 7,"Vin Diesel, Paul Walker, Jason Statham, Michel...",James Wan,Deckard Shaw seeks revenge against Dominic Tor...,"Action, Crime, Thriller",Universal Pictures,137.0,2015-04-01,2015.0,April,Wednesday,174799923.1,1385749000.0,1210949000.0,2947.0,7.3,9.335014


In [4]:
# Convert every text column to lowercase. ORIGINAL_TITLE is left untouched so
# titles keep their original casing.
for col in df.columns.difference(['ORIGINAL_TITLE']):
    try:
        df[col] = df[col].str.lower()
    except AttributeError:
        # The `.str` accessor raises AttributeError on non-text (e.g. numeric)
        # columns: there is nothing to lowercase, so skip them. Any other
        # error is unexpected and is allowed to surface instead of being hidden.
        continue

In [5]:
# Feature table: the five text columns used to describe a movie's content.
X = df.loc[:, ['CAST', 'DIRECTOR', 'OVERVIEW', 'GENRES', 'PRODUCTION_COMPANIES']]

In [6]:
# English stop words for the OVERVIEW vectorizer. CountVectorizer documents
# `stop_words` as 'english', a list or None, and recent scikit-learn releases
# validate the type and reject a set, so a (sorted, deterministic) list is built.
SW = sorted(ENGLISH_STOP_WORDS)

In [7]:
# One bag-of-words encoder per text column. The order of the entries fixes the
# order of the feature blocks in the transformed matrix.
ct = ColumnTransformer(
    transformers=[
        # Cast and director: word bigrams only.
        ('cast_countvectorizer', CountVectorizer(ngram_range=(2, 2)), 'CAST'),
        ('director_countvectorizer', CountVectorizer(ngram_range=(2, 2)), 'DIRECTOR'),
        # Overview: single words, with the stop words in SW removed.
        ('overview_countvectorizer', CountVectorizer(stop_words=SW), 'OVERVIEW'),
        # Genres: single words.
        ('genres_countvectorizer', CountVectorizer(), 'GENRES'),
        # Production companies: unigrams, bigrams and trigrams.
        (
            'production_countvectorizer',
            CountVectorizer(ngram_range=(1, 3)),
            'PRODUCTION_COMPANIES',
        ),
    ],
    remainder='passthrough',
)

In [8]:
# Fit every vectorizer on its column and encode X in a single step.
X_trans = ct.fit_transform(X)

In [9]:
# Display the encoded matrix's summary (shape, dtype, number of stored elements).
X_trans

<9466x96385 sparse matrix of type '<class 'numpy.int64'>'
	with 406059 stored elements in Compressed Sparse Row format>

## Content Based Recommender

To customize the recommendations, we create an engine that calculates the similarity between movies based on certain metrics and suggests the movies that are most similar to a particular movie that a user liked. Since we use film metadata (or content) to build this engine, this approach is also called content-based filtering.

We use the cast, the director, the overview, the genres and the production company of each movie to calculate the similarity between movies.
To do that, we use the cosine similarity, whose formula is given by:

$\cos(x, y) = \frac{x \, y^{T}}{\lVert x \rVert \, \lVert y \rVert}$

In [10]:
# Generate the cosine similarity matrix. With a single argument the rows of
# X_trans are compared with each other, so entry [i, j] is the similarity
# between row i and row j.
cosine_sim = cosine_similarity(X_trans)
cosine_sim 

array([[1.        , 0.08951436, 0.08660254, ..., 0.        , 0.02357023,
        0.        ],
       [0.08951436, 1.        , 0.06201737, ..., 0.        , 0.08439495,
        0.        ],
       [0.08660254, 0.06201737, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.0238705 ,
        0.        ],
       [0.02357023, 0.08439495, 0.        , ..., 0.0238705 , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [11]:
# Start the similarity table as a one-column frame holding the movie titles.
Similarity = df['ORIGINAL_TITLE'].to_frame()

In [12]:
# Attach each movie's row of similarity scores to its title.
# The table is rebuilt from `df` rather than merged into `Similarity` itself, so
# re-running this cell does not merge the scores a second time. The score rows
# are labelled with `df.index` because row i of `cosine_sim` belongs to the
# i-th row of `df`; a default 0..n-1 index would only line up by coincidence.
Similarity = df[['ORIGINAL_TITLE']].merge(
    pd.DataFrame(cosine_sim, index=df.index),
    left_index=True,
    right_index=True,
)

In [13]:
# Display the table: one row per movie, its title followed by its similarity scores.
Similarity

Unnamed: 0,ORIGINAL_TITLE,0,1,2,3,4,5,6,7,8,...,9456,9457,9458,9459,9460,9461,9462,9463,9464,9465
0,Jurassic World,1.000000,0.089514,0.086603,0.144338,0.063246,0.029881,0.062828,0.081349,0.000000,...,0.000000,0.022140,0.000000,0.038633,0.000000,0.022140,0.044281,0.00000,0.023570,0.000000
1,Mad Max: Fury Road,0.089514,1.000000,0.062017,0.082690,0.067937,0.064194,0.059990,0.058255,0.049090,...,0.015553,0.031710,0.000000,0.055332,0.000000,0.015855,0.047565,0.00000,0.084395,0.000000
2,Insurgent,0.086603,0.062017,1.000000,0.100000,0.036515,0.034503,0.072548,0.031311,0.000000,...,0.000000,0.000000,0.000000,0.066915,0.000000,0.000000,0.025565,0.00000,0.000000,0.000000
3,Star Wars: The Force Awakens,0.144338,0.082690,0.100000,1.000000,0.036515,0.034503,0.072548,0.031311,0.047494,...,0.000000,0.025565,0.031311,0.066915,0.022473,0.000000,0.051131,0.00000,0.027217,0.000000
4,Furious 7,0.063246,0.067937,0.036515,0.036515,1.000000,0.000000,0.000000,0.102899,0.017342,...,0.027472,0.056011,0.000000,0.024434,0.000000,0.000000,0.028006,0.00000,0.059628,0.037139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9461,The Endless Summer,0.022140,0.015855,0.000000,0.000000,0.000000,0.000000,0.000000,0.024015,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.017236,1.000000,0.000000,0.00000,0.041748,0.000000
9462,Grand Prix,0.044281,0.047565,0.025565,0.051131,0.028006,0.052926,0.037094,0.024015,0.012142,...,0.019234,0.019608,0.048029,0.034214,0.017236,0.000000,1.000000,0.00000,0.020874,0.000000
9463,Beregis Avtomobilya,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027770,...,0.021995,0.000000,0.027462,0.039126,0.019710,0.000000,0.000000,1.00000,0.023870,0.000000
9464,"What's Up, Tiger Lily?",0.023570,0.084395,0.000000,0.027217,0.059628,0.000000,0.000000,0.051131,0.025852,...,0.040953,0.041748,0.025565,0.036424,0.018349,0.041748,0.020874,0.02387,1.000000,0.000000


In [14]:
# Split the table back into the title column and the plain score matrix.
y = Similarity['ORIGINAL_TITLE']
cosine_sim = Similarity.drop(columns=['ORIGINAL_TITLE']).to_numpy()

In [15]:
#We create a function that returns the 10 most similar movies from the one the user liked
def recommend(title, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarities are read from the module-level ``cosine_sim`` matrix and
    titles from the module-level ``y`` series; row ``i`` of ``cosine_sim``
    is taken to describe the ``i``-th entry of ``y``.

    Parameters
    ----------
    title : str
        Exact title of a movie in ``y``. If several movies share the title,
        the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The
        queried movie itself is never part of the result.

    Raises
    ------
    IndexError
        If no movie in ``y`` has this title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Work with row positions rather than index labels: the rows of
    # `cosine_sim` are positional, and the labels of `y` only coincide with
    # positions when its index happens to be 0..n-1.
    positions = np.flatnonzero((y == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No movie titled {title!r} was found")
    movie_pos = int(positions[0])

    # Remove the movie's own score explicitly instead of assuming it sorts
    # first: an exact duplicate also scores 1.0 and could take the first slot.
    scores = pd.Series(cosine_sim[movie_pos]).drop(movie_pos)
    # A stable sort keeps tied movies in their original order, so repeated
    # calls return the same list.
    ranked = scores.sort_values(ascending=False, kind="mergesort")

    return y.iloc[ranked.index[:top_n].tolist()].tolist()

### Examples

In [16]:
# Example query: movies most similar to 'Ant-Man'.
recommend('Ant-Man')

['Iron Man 2',
 'Iron Man 3',
 'Captain America: The First Avenger',
 'Iron Man',
 'The Avengers',
 'Outlander',
 'The Quiet Earth',
 'Marvel One-Shot: Agent Carter',
 'Marvel One-Shot: Item 47',
 'Captain America: The Winter Soldier']

In [17]:
# Example query: movies most similar to 'Star Trek'.
recommend('Star Trek')

['Star Trek V: The Final Frontier',
 'Star Trek III: The Search for Spock',
 'Star Trek Into Darkness',
 'Star Trek VI: The Undiscovered Country',
 'Star Trek IV: The Voyage Home',
 'Star Trek: The Motion Picture',
 'Star Trek: First Contact',
 'Star Trek: Insurrection',
 'Congo',
 'Transformers: Revenge of the Fallen']

In [18]:
# Example query: movies most similar to 'Fast & Furious'.
recommend('Fast & Furious')

['Fast Five',
 'Furious 7',
 'The Fast and the Furious',
 'Fast & Furious 6',
 'Backdraft',
 'Mobsters',
 'The Sting',
 'The Real McCoy',
 "Carlito's Way",
 'The Border']

In [19]:
# Example query: movies most similar to 'X-Men: First Class'.
recommend('X-Men: First Class')

['4: Rise of the Silver Surfer',
 'Live Free or Die Hard',
 'Flyboys',
 'X-Men: The Last Stand',
 'Dragonball Evolution',
 'The Last Legion',
 'X-Men Origins: Wolverine',
 'Avatar',
 'X2',
 'Virgin Territory']

In [20]:
# Example query: movies most similar to 'Star Wars'.
recommend('Star Wars')

['The Empire Strikes Back',
 'Star Wars: The Force Awakens',
 'Return of the Jedi',
 'The Star Wars Holiday Special',
 'Star Wars: Episode I - The Phantom Menace',
 "Family Guy Presents: It's a Trap!",
 'Star Wars: Episode III - Revenge of the Sith',
 'Iron Man',
 'Shanghai Noon',
 'Hot Shots! Part Deux']