In [1]:
import numpy as np
import pandas as pd

# Data Loading
### Dataset : https://github.com/dainantonio/Investigating-a-TMDB-5000-Movie-Dataset/blob/main/tmdb-movies.csv

In [3]:
movies=pd.read_csv("tmdb_5000_movies.csv")

In [4]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0


In [5]:
print(movies.columns)

Index(['id', 'imdb_id', 'popularity', 'budget', 'revenue', 'original_title',
       'cast', 'homepage', 'director', 'tagline', 'keywords', 'overview',
       'runtime', 'genres', 'production_companies', 'release_date',
       'vote_count', 'vote_average', 'release_year', 'budget_adj',
       'revenue_adj'],
      dtype='object')


In [6]:
movies.shape

(10866, 21)

In [7]:
movies.iloc[0]['cast']

"Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vincent D'Onofrio|Nick Robinson"

# Data Preprocessing

##### dropping irrelevant columns

In [10]:
movies = movies[['id', 'imdb_id', 'popularity', 'original_title', 'cast', 'director', 'tagline', 'keywords', 'overview',
       'runtime', 'genres', 'release_date',
       'vote_count', 'vote_average', 'release_year']]

In [11]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,cast,director,tagline,keywords,overview,runtime,genres,release_date,vote_count,vote_average,release_year
0,135397,tt0369610,32.985763,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,6/9/15,5562,6.5,2015
1,76341,tt1392190,28.419936,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,5/13/15,6185,7.1,2015


##### find missing values

In [13]:
movies.isnull().sum()

id                   0
imdb_id             10
popularity           0
original_title       0
cast                76
director            44
tagline           2824
keywords          1493
overview             4
runtime              0
genres              23
release_date         0
vote_count           0
vote_average         0
release_year         0
dtype: int64

In [14]:
movies.fillna(0, inplace=True)

##### check for duplicates and drop it

In [16]:
movies.duplicated().sum()

1

In [17]:
movies.drop_duplicates(inplace=True)

##### filtering from cast, keywords, genres 

In [19]:
def filter(txt):
    if(txt!=0):
        return txt.split('|')
    return []

In [20]:
movies['cast'] = movies['cast'].apply(filter)
movies['keywords'] = movies['keywords'].apply(filter)
movies['genres'] = movies['genres'].apply(filter)


In [21]:
movies.head(1)

Unnamed: 0,id,imdb_id,popularity,original_title,cast,director,tagline,keywords,overview,runtime,genres,release_date,vote_count,vote_average,release_year
0,135397,tt0369610,32.985763,Jurassic World,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",Colin Trevorrow,The park is open.,"[monster, dna, tyrannosaurus rex, velociraptor...",Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",6/9/15,5562,6.5,2015


##### concatenate director, cast names

In [23]:
movies['dir_cast'] = movies.apply(lambda m: [m['director']] + m['cast'], axis=1)

In [24]:
movies = movies[['id','imdb_id', 'popularity', 'original_title', 'tagline',
       'keywords', 'overview', 'runtime', 'genres', 'release_date',
       'vote_count', 'vote_average', 'release_year', 'dir_cast']]

In [25]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,tagline,keywords,overview,runtime,genres,release_date,vote_count,vote_average,release_year,dir_cast
0,135397,tt0369610,32.985763,Jurassic World,The park is open.,"[monster, dna, tyrannosaurus rex, velociraptor...",Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]",6/9/15,5562,6.5,2015,"[Colin Trevorrow, Chris Pratt, Bryce Dallas Ho..."
1,76341,tt1392190,28.419936,Mad Max: Fury Road,What a Lovely Day.,"[future, chase, post-apocalyptic, dystopia, au...",An apocalyptic story set in the furthest reach...,120,"[Action, Adventure, Science Fiction, Thriller]",5/13/15,6185,7.1,2015,"[George Miller, Tom Hardy, Charlize Theron, Hu..."


In [26]:
def filter2(l):
    res=[]
    for name in l:
        if name!=0:
            res.append(name.replace(" ",""))
    return res

In [27]:
movies['dir_cast']=movies['dir_cast'].apply(filter2)

##### filter more keywords from overview, tagline

In [29]:
def filter3(txt):
    if txt!=0:
        return txt.split()
    else:
        return []

In [30]:
movies['overview'] = movies['overview'].apply(filter3)
movies['tagline'] = movies['tagline'].apply(filter3)
movies['keywords'] = movies['keywords'].apply(filter2)

In [31]:
movies['keywords'] = movies['keywords'].apply(filter2)

In [32]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,tagline,keywords,overview,runtime,genres,release_date,vote_count,vote_average,release_year,dir_cast
0,135397,tt0369610,32.985763,Jurassic World,"[The, park, is, open.]","[monster, dna, tyrannosaurusrex, velociraptor,...","[Twenty-two, years, after, the, events, of, Ju...",124,"[Action, Adventure, Science Fiction, Thriller]",6/9/15,5562,6.5,2015,"[ColinTrevorrow, ChrisPratt, BryceDallasHoward..."
1,76341,tt1392190,28.419936,Mad Max: Fury Road,"[What, a, Lovely, Day.]","[future, chase, post-apocalyptic, dystopia, au...","[An, apocalyptic, story, set, in, the, furthes...",120,"[Action, Adventure, Science Fiction, Thriller]",5/13/15,6185,7.1,2015,"[GeorgeMiller, TomHardy, CharlizeTheron, HughK..."


##### make new column : tags

In [34]:
movies['tags'] = movies['tagline'] + movies['keywords'] + movies['overview'] + movies['genres'] + movies['dir_cast']
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [35]:
movies = movies.drop(columns=['overview','genres','keywords','dir_cast','tagline'])

In [36]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,runtime,release_date,vote_count,vote_average,release_year,tags
0,135397,tt0369610,32.985763,Jurassic World,124,6/9/15,5562,6.5,2015,The park is open. monster dna tyrannosaurusrex...
1,76341,tt1392190,28.419936,Mad Max: Fury Road,120,5/13/15,6185,7.1,2015,What a Lovely Day. future chase post-apocalypt...


##### Make a column with lowercase titles without special characters

In [38]:
import re
def clean_string(text):
    text = text.lower()
    # Remove special characters, keeping only letters, numbers, and spaces
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [39]:
movies['title_clean'] = movies['original_title'].apply(clean_string)

In [40]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,runtime,release_date,vote_count,vote_average,release_year,tags,title_clean
0,135397,tt0369610,32.985763,Jurassic World,124,6/9/15,5562,6.5,2015,The park is open. monster dna tyrannosaurusrex...,jurassic world
1,76341,tt1392190,28.419936,Mad Max: Fury Road,120,5/13/15,6185,7.1,2015,What a Lovely Day. future chase post-apocalypt...,mad max fury road


##### edit the tags, update the words to the stem word (e.g.: <font color='brown'>_lov_</font> is the stem word for __love, loved, loving__)

In [42]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [43]:
def getStems(text):
    st=[]
    for i in text.split():
        st.append(ps.stem(i))      
    return(" ".join(st))

In [44]:
movies['tags'] = movies['tags'].apply(getStems)

In [45]:
movies.iloc[0]['tags']

"the park is open. monster dna tyrannosaurusrex velociraptor island twenty-two year after the event of jurass park, isla nublar now featur a fulli function dinosaur theme park, jurass world, as origin envis by john hammond. action adventur scienc fiction thriller colintrevorrow chrispratt brycedallashoward irrfankhan vincentd'onofrio nickrobinson"

##### remove stop words from tags

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')  #to remove the words like the, is, in, a, on etc.
vector = cv.fit_transform(movies['tags']).toarray()

In [48]:
vector.shape

(10865, 5000)

### Calculate Ratings according to IMDB Formula

W=(R*v+C*m)/(v+m)
where:

    W is the weighted rating;
    R is the mean rating for the movie, from 1 to 10;
    v is the number of votes for the movie;
    m is the minimum votes required to be listed in the Top 250 (25,000 as of 2013)
    C is the mean vote across the whole report (7.0 as of 2013).

In [51]:
C=7.0
m=25000
movies['rating'] = (movies["vote_average"]*movies["vote_count"]+C*m)/(movies["vote_count"]+m)

In [52]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,original_title,runtime,release_date,vote_count,vote_average,release_year,tags,title_clean,rating
0,135397,tt0369610,32.985763,Jurassic World,124,6/9/15,5562,6.5,2015,the park is open. monster dna tyrannosaurusrex...,jurassic world,6.909005
1,76341,tt1392190,28.419936,Mad Max: Fury Road,120,5/13/15,6185,7.1,2015,what a love day. futur chase post-apocalypt dy...,mad max fury road,7.019833


In [53]:
movies.drop(["vote_count", "vote_average"], axis=1)

Unnamed: 0,id,imdb_id,popularity,original_title,runtime,release_date,release_year,tags,title_clean,rating
0,135397,tt0369610,32.985763,Jurassic World,124,6/9/15,2015,the park is open. monster dna tyrannosaurusrex...,jurassic world,6.909005
1,76341,tt1392190,28.419936,Mad Max: Fury Road,120,5/13/15,2015,what a love day. futur chase post-apocalypt dy...,mad max fury road,7.019833
2,262500,tt2908446,13.112507,Insurgent,119,3/18/15,2015,one choic can destroy you basedonnovel revolut...,insurgent,6.936827
3,140607,tt2488496,11.173104,Star Wars: The Force Awakens,136,12/15/15,2015,everi gener ha a story. android spaceship jedi...,star wars the force awakens,7.087350
4,168259,tt2820852,9.335014,Furious 7,137,4/1/15,2015,vengeanc hit home carrac speed reveng suspens ...,furious 7,7.031635
...,...,...,...,...,...,...,...,...,...,...
10861,21,tt0060371,0.080598,The Endless Summer,95,6/15/66,1966,"surfer surfboard surf the endless summer, by b...",the endless summer,7.000176
10862,20379,tt0060472,0.065543,Grand Prix,176,12/21/66,1966,cinerama sweep you into a drama of speed and s...,grand prix,6.998961
10863,39768,tt0060161,0.065141,Beregis Avtomobilya,94,1/1/66,1966,car trolley stealingcar an insur agent who moo...,beregis avtomobilya,6.999780
10864,21449,tt0061177,0.064317,"What's Up, Tiger Lily?",80,11/2/66,1966,woodi allen strike back! spoof in comic woodi ...,whats up tiger lily,6.998593


## RECOMMENDATION ALGORITHM

###### 13. finding similarity through cosine_similarity 
###### which will give us similarity between each and every movies,
###### i.e. a square matrix

In [56]:
from sklearn.metrics.pairwise import cosine_similarity
smlr = cosine_similarity(vector)

In [57]:
smlr.shape

(10865, 10865)

In [58]:
print(smlr)

[[1.         0.16820685 0.15488062 ... 0.         0.06189845 0.        ]
 [0.16820685 1.         0.14165765 ... 0.04003204 0.13209899 0.02360961]
 [0.15488062 0.14165765 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.04003204 0.         ... 1.         0.02946278 0.03686049]
 [0.06189845 0.13209899 0.         ... 0.02946278 1.         0.        ]
 [0.         0.02360961 0.         ... 0.03686049 0.         1.        ]]


#### RECOMMEND FUNCTION

In [60]:
def recommend(movie_name, n=5):
    cleanedMovieName = clean_string(movie_name)
    idx = movies[movies['title_clean'] == cleanedMovieName].index[0]  #evaluate index where the given movie lies
    movieidx_smlr = [(i,smlr[idx][i]) for i in range(len(smlr[idx]))]  #list of tuple: (another movie index, similarity)
    sortSmlr = sorted(movieidx_smlr, reverse = True, key = lambda x:x[1])  #sort the list according to similarity, in descending order

    if n>len(sortSmlr)-1: n=len(sortSmlr)-2
    for i in sortSmlr[1:n+1]:                                #sortSmlr[0] will contain itself and it's value will be highest
        print(movies.iloc[i[0]]['original_title'])

In [61]:
recommend("Mad Max: Fury Road")

Steel Dawn
Mad Max
Kites
Turbo Kid
The Maze Runner


##### Serializing and saving the object to file 

In [63]:
import pickle

In [64]:
pickle.dump(movies,open('movie_list.pkl','wb'))
pickle.dump(smlr,open('similarity.pkl','wb'))