In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read both raw TMDB tables; the relative paths resolve against the working directory.
movies, credits = map(pd.read_csv, ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv'))

In [3]:
# Join the two tables on their shared 'title' column (pandas' default inner join).
# NOTE(review): if titles are not unique this emits one row per matching pair,
# duplicating movies — confirm, or join on the movie id instead.
movies = pd.merge(movies, credits, on='title')

In [4]:
# Keep only the identifier, the title and the text-bearing columns that feed the tags.
# .copy() makes the selection an independent frame, so later column assignments
# act on this frame alone and cannot raise SettingWithCopyWarning.
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

In [5]:
import ast

def convert(text):
    """Return the 'name' of every entry in a stringified list of dicts.

    `text` is parsed with ast.literal_eval, so it must be a Python-literal
    string such as "[{'id': 28, 'name': 'Action'}]". Names are returned in
    their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [6]:
# Drop rows with any missing value, then renumber the rows 0..n-1.
# Without the reset, the index labels keep gaps where rows were removed, while
# later steps (rows of the similarity array, .iloc lookups) are positional, so
# a label read from .index would point at the wrong movie or past the end.
movies = movies.dropna().reset_index(drop=True)

In [7]:
# Turn the stringified list-of-dict columns into plain lists of names.
movies['genres'] = movies['genres'].map(convert)
movies['keywords'] = movies['keywords'].map(convert)
# For the cast, keep only the first three listed names.
movies['cast'] = movies['cast'].map(lambda raw: convert(raw)[0:3])

In [8]:
def fetch_director(text):
    """Return the names of all crew entries whose job is exactly 'Director'.

    `text` is a Python-literal string holding a list of dicts, each of which
    must provide 'job' and 'name' keys; it is parsed with ast.literal_eval.
    The result is a list because one title may have several directors.
    """
    crew = ast.literal_eval(text)
    return [member['name'] for member in crew if member['job'] == 'Director']

# Reduce each crew listing to the list of its directors' names.
movies['crew'] = movies['crew'].map(fetch_director)

In [9]:
# Spot-check five random rows; no random_state is passed, so the rows differ between runs.
movies.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
1600,16871,Drag Me to Hell,After denying a woman the extension she needs ...,"[Horror, Thriller]","[gypsy, work, gore, curse, psychologist, psych...","[Alison Lohman, Justin Long, Lorna Raver]",[Sam Raimi]
1564,16340,Rugrats in Paris: The Movie,The Rugrats are back! There's Tommy the brave ...,"[Adventure, Animation, Comedy, Family]","[paris, invention]","[E.G. Daily, Susan Sarandon, John Lithgow]",[Stig Bergqvist]
3664,12486,Farce of the Penguins,"In this spoof of ""March of the Penguins,"" natu...","[Comedy, Documentary]","[penguin, balzen, antarctic]","[Samuel L. Jackson, Jim Belushi, Whoopi Goldberg]",[Bob Saget]
2855,1948,Crank,Professional assassin Chev Chelios learns his ...,"[Action, Thriller, Crime]","[poison, helicopter, assassin, nudity, hitman,...","[Jason Statham, Amy Smart, Jose Pablo Cantillo]","[Brian Taylor, Mark Neveldine]"
636,9021,The Santa Clause 2,Better watch out! The big guy in red is coming...,"[Fantasy, Comedy, Family]","[holiday, christmas party, home, santa claus, ...","[Tim Allen, Elizabeth Mitchell, David Krumholtz]",[Michael Lembeck]


In [10]:
def collapse(L):
    """Return a new list with every space character removed from each string in `L`.

    This turns multi-word names into single tokens,
    e.g. 'Sam Worthington' -> 'SamWorthington'.
    """
    return [item.replace(" ", "") for item in L]

# Strip spaces from the items of every list-valued column so each name becomes one token.
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)

In [11]:
# Preview the first rows to confirm the names were collapsed into single tokens.
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [12]:
# Split each overview on whitespace so it becomes a list of words, like the other tag columns.
movies['overview'] = movies['overview'].map(str.split)

In [13]:
# Every operand column holds a Python list per row, so `+` concatenates them row by row
# into one token list: overview words, then genres, keywords, cast and director names.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [14]:
# The five source columns are now folded into 'tags'; drop them and keep the rest.
new = movies.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis=1)

In [15]:
# Flatten every token list into a single space-separated string for the vectorizer.
new['tags'] = new['tags'].map(" ".join)
new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms (max_features),
# with scikit-learn's built-in English stop-word list filtered out.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [17]:
# Learn the vocabulary from the tags and count term occurrences per movie;
# .toarray() turns the result into a dense NumPy array (rows = movies, columns = terms).
# NOTE(review): cosine_similarity also accepts sparse input, so the dense copy may be
# avoidable if memory becomes a concern — confirm nothing else needs an ndarray.
vector = cv.fit_transform(new['tags']).toarray()

In [18]:
# Sanity check: (number of movies, vocabulary size).
vector.shape

(4806, 5000)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Pairwise cosine similarity between all movie vectors; entry [i, j] compares the
# movie at row position i with the movie at row position j.
similarity = cosine_similarity(vector)

In [21]:
# Inspect the matrix; the diagonal holds each movie compared with itself.
similarity

array([[1.        , 0.08964215, 0.06071767, ..., 0.02519763, 0.0277885 ,
        0.        ],
       [0.08964215, 1.        , 0.06350006, ..., 0.02635231, 0.        ,
        0.        ],
       [0.06071767, 0.06350006, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02677398, ..., 1.        , 0.07352146,
        0.04774099],
       [0.0277885 , 0.        , 0.        , ..., 0.07352146, 1.        ,
        0.05264981],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05264981,
        1.        ]])

In [22]:
# Look up a movie by title; .index[0] is the index *label* of the first match,
# which equals the row position only when the index is a gap-free 0..n-1 range.
new[new['title'] == 'The Lego Movie'].index[0]

744

In [23]:

def recommend(movie, top_n=5):
    """Print the titles of the `top_n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in `new['title']`; the first match is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of `new` carries the given title.
    """
    # Work with row *positions*: `similarity` is a plain array and the lookup
    # below uses .iloc, whereas `new.index` holds labels that need not be
    # contiguous (rows may have been dropped without resetting the index).
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("movie {!r} not found in the title column".format(movie))
    position = int(matches[0])

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded explicitly rather than assumed to sort first, because another
    # row can tie with it at the maximum score.
    ranked = sorted(
        ((other, score) for other, score in enumerate(similarity[position]) if other != position),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for other, _ in ranked[:top_n]:
        print(new.iloc[other].title)

In [24]:

# Example query: print the five titles most similar to 'Gandhi'.
recommend('Gandhi')

Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Ramanujan


In [25]:
import pickle
# Persist the movie lookup table and the similarity matrix to disk.
# The `with` blocks ensure each file is flushed and closed, even if dump raises.
with open('movie_list.pkl', 'wb') as movie_file:
    pickle.dump(new, movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)