In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
import pickle

In [2]:
movies = pd.read_csv('./dataset/movies_metadata.csv', low_memory=False)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [3]:
credits = pd.read_csv('./dataset/credits.csv')
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [4]:
def temp(x):
    try:
        x = int(x)
    except:
        x = 0
    return x

movies['id'] = movies['id'].apply(temp)
movies = movies[movies['id']!=0]
movies = movies.merge(credits, on='id')

In [5]:
keywords = pd.read_csv('./dataset/keywords.csv')
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [6]:
movies = movies.merge(keywords, on='id')

In [7]:
movies.duplicated().sum()

1172

In [8]:
movies.drop_duplicates(inplace=True)

In [9]:
movies.shape

(45456, 27)

In [10]:
movies.isnull().sum()

adult                        0
belongs_to_collection    40965
budget                       0
genres                       0
homepage                 37679
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25046
title                        3
video                        3
vote_average                 3
vote_count                   3
cast                         0
crew                         0
keywords                     0
dtype: int64

# Data Cleaning

In [11]:
movies = movies[movies.title.notnull()]

In [12]:
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in literal_eval(x)])
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in literal_eval(x)])

In [13]:
movies[['genres','keywords']].head()

Unnamed: 0,genres,keywords
0,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva..."
1,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'..."
2,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o..."
3,"[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin..."
4,[Comedy],"[baby, midlife crisis, confidence, aging, daug..."


In [14]:
def fetch_cast(obj):
    L = []
    counter = 0
    for i in literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

def fetch_director(x):
    L = []
    for i in literal_eval(x):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [15]:
movies['cast'] = movies['cast'].apply(fetch_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

In [16]:
movies[['cast','crew']].head()

Unnamed: 0,cast,crew
0,"[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston]
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,"[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer]


In [17]:
# production companies
movies['production_companies'] = movies['production_companies'].apply(lambda x: [i['name'] for i in literal_eval(x)])
movies['production_countries'] = movies['production_countries'].apply(lambda x: [i['name'] for i in literal_eval(x)])

In [18]:
movies[['production_companies','production_countries']].head()

Unnamed: 0,production_companies,production_countries
0,[Pixar Animation Studios],[United States of America]
1,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America]
2,"[Warner Bros., Lancaster Gate]",[United States of America]
3,[Twentieth Century Fox Film Corporation],[United States of America]
4,"[Sandollar Productions, Touchstone Pictures]",[United States of America]


In [19]:
movies['original_language'] = movies['original_language'].fillna('')

In [20]:
movies['genres'] = movies['genres'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['production_companies'] = movies['production_companies'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])
movies['production_countries'] = movies['production_countries'].apply(lambda x: [str.lower(i.replace(" ","_")) for i in x])

In [21]:
movies['original_language'] = movies['original_language'].apply(lambda x: [x])

In [22]:
movies['overview'] = movies['overview'].fillna('')

In [23]:
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
from nltk.corpus import stopwords
stwords = stopwords.words('english')

def stem(text):
    y = []
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    for word in text:
        if not word in stwords:
            y.append(ps.stem(word))
    
    return  " ".join(y)

In [24]:
movies['overview'] = movies['overview'].apply(stem)

In [25]:
movies['overview']

0        led woodi andi toy live happili room andi birt...
1        sibl judi peter discov enchant board game open...
2        famili wed reignit ancient feud next door neig...
3        cheat mistreat step women hold breath wait elu...
4        georg bank recov daughter wed receiv news preg...
                               ...                        
46623                                  rise fall man woman
46624    artist struggl finish work storylin cult play ...
46625    one hit goe wrong profession assassin end suit...
46626    small town live two brother one minist one hun...
46627    year decriminalis homosexu uk director daisi a...
Name: overview, Length: 45453, dtype: object

In [27]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [28]:
movies[['genres','keywords','cast','crew','original_language','production_companies','production_countries']].head()

Unnamed: 0,genres,keywords,cast,crew,original_language,production_companies,production_countries
0,"[animation, comedy, family]","[jealousy, toy, boy, friendship, friends, riva...","[tom_hanks, tim_allen, don_rickles]",[john_lasseter],[en],[pixar_animation_studios],[united_states_of_america]
1,"[adventure, fantasy, family]","[board_game, disappearance, based_on_children'...","[robin_williams, jonathan_hyde, kirsten_dunst]",[joe_johnston],[en],"[tristar_pictures, teitler_film, interscope_co...",[united_states_of_america]
2,"[romance, comedy]","[fishing, best_friend, duringcreditsstinger, o...","[walter_matthau, jack_lemmon, ann-margret]",[howard_deutch],[en],"[warner_bros., lancaster_gate]",[united_states_of_america]
3,"[comedy, drama, romance]","[based_on_novel, interracial_relationship, sin...","[whitney_houston, angela_bassett, loretta_devine]",[forest_whitaker],[en],[twentieth_century_fox_film_corporation],[united_states_of_america]
4,[comedy],"[baby, midlife_crisis, confidence, aging, daug...","[steve_martin, diane_keaton, martin_short]",[charles_shyer],[en],"[sandollar_productions, touchstone_pictures]",[united_states_of_america]


In [49]:
movies[['id','title','genres','original_language','cast','crew','keywords','production_companies','production_countries']]

Unnamed: 0,id,title,genres,original_language,cast,crew,keywords,production_companies,production_countries
0,862,Toy Story,"[animation, comedy, family]",[en],"[tom_hanks, tim_allen, don_rickles]",[john_lasseter],"[jealousy, toy, boy, friendship, friends, riva...",[pixar_animation_studios],[united_states_of_america]
1,8844,Jumanji,"[adventure, fantasy, family]",[en],"[robin_williams, jonathan_hyde, kirsten_dunst]",[joe_johnston],"[board_game, disappearance, based_on_children'...","[tristar_pictures, teitler_film, interscope_co...",[united_states_of_america]
2,15602,Grumpier Old Men,"[romance, comedy]",[en],"[walter_matthau, jack_lemmon, ann-margret]",[howard_deutch],"[fishing, best_friend, duringcreditsstinger, o...","[warner_bros., lancaster_gate]",[united_states_of_america]
3,31357,Waiting to Exhale,"[comedy, drama, romance]",[en],"[whitney_houston, angela_bassett, loretta_devine]",[forest_whitaker],"[based_on_novel, interracial_relationship, sin...",[twentieth_century_fox_film_corporation],[united_states_of_america]
4,11862,Father of the Bride Part II,[comedy],[en],"[steve_martin, diane_keaton, martin_short]",[charles_shyer],"[baby, midlife_crisis, confidence, aging, daug...","[sandollar_productions, touchstone_pictures]",[united_states_of_america]
...,...,...,...,...,...,...,...,...,...
46623,439050,Subdue,"[drama, family]",[fa],"[leila_hatami, kourosh_tahami, elham_korda]",[hamid_nematollah],[tragic_love],[],[iran]
46624,111109,Century of Birthing,[drama],[tl],"[angel_aquino, perry_dizon, hazel_orencio]",[lav_diaz],"[artist, play, pinoy]",[sine_olivia],[philippines]
46625,67758,Betrayal,"[action, drama, thriller]",[en],"[erika_eleniak, adam_baldwin, julie_du_page]",[mark_l._lester],[],[american_world_pictures],[united_states_of_america]
46626,227506,Satan Triumphant,[],[en],"[iwan_mosschuchin, nathalie_lissenko, pavel_pa...",[yakov_protazanov],[],[yermoliev],[russia]


In [29]:
feature_list = ['keywords','cast','crew','original_language','production_companies','production_countries']
movies['tags'] = movies['genres']
for i in feature_list:
    movies['tags'] = movies['tags'] + movies[i]

In [30]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [48]:
movies['tags']

0        animation comedy family jealousy toy boy frien...
1        adventure fantasy family board_game disappeara...
2        romance comedy fishing best_friend duringcredi...
3        comedy drama romance based_on_novel interracia...
4        comedy baby midlife_crisis confidence aging da...
                               ...                        
46623    drama family tragic_love leila_hatami kourosh_...
46624    drama artist play pinoy angel_aquino perry_diz...
46625    action drama thriller erika_eleniak adam_baldw...
46626    iwan_mosschuchin nathalie_lissenko pavel_pavlo...
46627                      daisy_asquith en united_kingdom
Name: tags, Length: 45453, dtype: object

In [32]:
df = movies[['id','title','tags']]
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,animation comedy family jealousy toy boy frien...
1,8844,Jumanji,adventure fantasy family board_game disappeara...
2,15602,Grumpier Old Men,romance comedy fishing best_friend duringcredi...
3,31357,Waiting to Exhale,comedy drama romance based_on_novel interracia...
4,11862,Father of the Bride Part II,comedy baby midlife_crisis confidence aging da...


In [140]:
movies['original_language'].apply(lambda x: x[0]).value_counts()

en    32263
fr     2439
it     1529
ja     1351
de     1079
      ...  
la        1
fy        1
rw        1
tg        1
si        1
Name: original_language, Length: 90, dtype: int64

In [134]:
lan = [['hi']]

In [80]:
movies[movies['original_language'].isin(lan)].shape

(32796, 28)

In [57]:
movies[movies['original_language'].apply(lambda x: x[0])== 'uz'].shape

(1, 28)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(df['tags']).toarray()

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
def get_similarity(index):
    similarity = cosine_similarity([vectors[index]], vectors)
    return similarity[0]

def get_recommendations(iid):
    idx = df[df['id']==iid].index[0]
    print(idx)
    sim_scores = list(enumerate(get_similarity(idx)))
    print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    print(sim_scores)
    sim_scores = sim_scores[1:7]
    print(sim_scores)

In [53]:
get_recommendations(19404)

10397
[(0, 0.09805806756909201), (1, 0.10206207261596575), (2, 0.10206207261596575), (3, 0.21320071635561041), (4, 0.10206207261596575), (5, 0.19695964928958382), (6, 0.0944911182523068), (7, 0.24999999999999994), (8, 0.11180339887498948), (9, 0.18257418583505533), (10, 0.17677669529663687), (11, 0.0944911182523068), (12, 0.08838834764831843), (13, 0.21320071635561041), (14, 0.08333333333333333), (15, 0.294174202707276), (16, 0.28347335475692037), (17, 0.18257418583505533), (18, 0.22360679774997896), (19, 0.2041241452319315), (20, 0.19364916731037082), (21, 0.1889822365046136), (22, 0.1386750490563073), (23, 0.21320071635561041), (24, 0.15811388300841894), (25, 0.22360679774997896), (26, 0.2041241452319315), (27, 0.40089186286863654), (28, 0.0), (29, 0.22360679774997896), (30, 0.294174202707276), (31, 0.08838834764831843), (32, 0.0), (33, 0.22360679774997896), (34, 0.3779644730092272), (35, 0.2261335084333227), (36, 0.2886751345948129), (37, 0.1178511301977579), (38, 0.2132007163556104

[(10397, 0.9999999999999998), (14270, 0.7905694150420948), (24961, 0.7216878364870323), (1490, 0.7071067811865475), (17503, 0.7071067811865475), (21541, 0.7071067811865475), (24863, 0.7071067811865475), (32753, 0.7071067811865475), (43798, 0.7071067811865475), (11174, 0.6681531047810609), (15325, 0.6681531047810609), (14015, 0.6396021490668312), (4996, 0.6324555320336758), (12430, 0.6324555320336758), (12926, 0.6324555320336758), (13973, 0.6324555320336758), (18297, 0.6324555320336758), (21714, 0.6324555320336758), (24666, 0.6324555320336758), (25229, 0.6324555320336758), (26848, 0.6324555320336758), (27295, 0.6324555320336758), (27445, 0.6324555320336758), (31472, 0.6324555320336758), (35206, 0.6324555320336758), (40688, 0.6324555320336758), (45020, 0.6324555320336758), (45135, 0.6324555320336758), (7147, 0.6249999999999998), (2195, 0.6123724356957945), (3767, 0.6123724356957945), (6698, 0.6123724356957945), (9574, 0.6123724356957945), (13325, 0.6123724356957945), (13756, 0.6123724356

In [1]:
import sklearn
sklearn.__version__

'1.1.1'

In [89]:
movies['release_date'] = pd.to_datetime(movies['release_date'])

In [91]:
movies['release_date'] = movies['release_date'].dt.year

In [92]:
movies['release_date'].value_counts()

2014.0    1973
2015.0    1905
2013.0    1890
2012.0    1722
2011.0    1667
          ... 
2020.0       1
1887.0       1
1878.0       1
1874.0       1
1883.0       1
Name: release_date, Length: 135, dtype: int64

In [144]:
movies.shape

(45453, 28)