In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the pre-built movie table; the path is relative to the notebook's working directory.
# The loaded frame has four columns: two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0')
# plus id, title and tags.
data = pd.read_csv('movie_data_with_tags.csv')

In [3]:
data['tags']

0       ['Action', 'Adventure', 'Fantasy', 'ScienceFic...
1       ['Adventure', 'Fantasy', 'Action', 'ocean', 'd...
2       ['Action', 'Adventure', 'Crime', 'spy', 'based...
3       ['Action', 'Crime', 'Drama', 'Thriller', 'dcco...
4       ['Action', 'Adventure', 'ScienceFiction', 'bas...
                              ...                        
4801    ['Action', 'Crime', 'Thriller', 'unitedstates–...
4802    ['Comedy', 'Romance', 'A', 'newlywed', "couple...
4803    ['Comedy', 'Drama', 'Romance', 'TVMovie', 'dat...
4804    ['When', 'ambitious', 'New', 'York', 'attorney...
4805    ['Documentary', 'obsession', 'camcorder', 'cru...
Name: tags, Length: 4806, dtype: object

In [4]:
data['tags'].apply(lambda x : ' '.join(ast.literal_eval(x))) 

0       Action Adventure Fantasy ScienceFiction cultur...
1       Adventure Fantasy Action ocean drugabuse exoti...
2       Action Adventure Crime spy basedonnovel secret...
3       Action Crime Drama Thriller dccomics crimefigh...
4       Action Adventure ScienceFiction basedonnovel m...
                              ...                        
4801    Action Crime Thriller unitedstates–mexicobarri...
4802    Comedy Romance A newlywed couple's honeymoon i...
4803    Comedy Drama Romance TVMovie date loveatfirsts...
4804    When ambitious New York attorney Sam is sent t...
4805    Documentary obsession camcorder crush dreamgir...
Name: tags, Length: 4806, dtype: object

In [5]:
# Every raw cell is the string form of a Python list of tokens: parse it back
# into a list, then flatten it into one space-separated string.
# This cell is not re-runnable on its own: a second run would hand the
# already-joined text to ast.literal_eval, which cannot parse it.
data['tags'] = data['tags'].map(lambda raw: ' '.join(ast.literal_eval(raw)))

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,tags
0,0,19995,Avatar,Action Adventure Fantasy ScienceFiction cultur...
1,1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drugabuse exoti...
2,2,206647,Spectre,Action Adventure Crime spy basedonnovel secret...
3,3,49026,The Dark Knight Rises,Action Crime Drama Thriller dccomics crimefigh...
4,4,49529,John Carter,Action Adventure ScienceFiction basedonnovel m...


In [7]:
data['tags'][1]

"Adventure Fantasy Action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. WaltDisneyPictures JerryBruckheimerFilms SecondMateProductions JohnnyDepp OrlandoBloom KeiraKnightley StellanSkarsgård ChowYun-fat BillNighy GeoffreyRush JackDavenport KevinMcNally TomHollander NaomieHarris JonathanPryce KeithRichards LeeArenberg MackenzieCrook GregEllis DavidBailie MartinKlebba DavidSchofield LaurenMaher VanessaBranch AngusBarnett GilesNew ReggieLee DominicScottKay TakayoFischer DavidMeunier Ho-KwanTse AndyBeckwith PeterDonaldBadalamentiII ChristopherS.Capp KeithRichards HakeemKae-Kazim GhassanMassoud"

In [8]:
# Normalise case so that e.g. 'Action' and 'action' count as the same token.
data['tags'] = data['tags'].map(str.lower)

In [9]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,id,title,tags
0,0,19995,Avatar,action adventure fantasy sciencefiction cultur...


In [10]:
data.shape

(4806, 4)

In [11]:
# Single shared stemmer instance; stem() calls ps.stem on every token.
ps = PorterStemmer()
def stem(text):
    """Return ``text`` with every whitespace-separated token stemmed.

    Tokens come from ``str.split()`` (so runs of whitespace collapse), are
    passed one at a time through the notebook-level ``ps`` stemmer, and are
    glued back together with single spaces.
    """
    return ' '.join(ps.stem(token) for token in text.split())

In [12]:
# Replace every tag string with its stemmed form (see stem()).
data['tags'] = data['tags'].map(stem)

In [13]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,tags
0,0,19995,Avatar,action adventur fantasi sciencefict culturecla...
1,1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exotici...
2,2,206647,Spectre,action adventur crime spi basedonnovel secreta...
3,3,49026,The Dark Knight Rises,action crime drama thriller dccomic crimefight...
4,4,49529,John Carter,action adventur sciencefict basedonnovel mar m...


In [14]:
# Bag-of-words vectorizer: English stop words are dropped and the vocabulary
# is capped at 5000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [15]:
# Fit the vocabulary on the tag strings and build the count matrix, one row per
# movie; .toarray() turns the result into a plain (dense) NumPy array.
vectors = cv.fit_transform(data['tags']).toarray()

In [16]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Pairwise cosine similarity between all rows of `vectors`: a square
# movies-by-movies matrix (4806 x 4806 for this dataset) in which row i holds
# movie i's similarity to every movie.
similarity = cosine_similarity(vectors)

In [18]:
similarity.shape

(4806, 4806)

In [19]:
similarity[1]

array([0.06521739, 1.        , 0.07372098, ..., 0.02106314, 0.        ,
       0.02360961])

In [20]:
data[data['title'] == 'Batman Begins'].index[0]

119

In [21]:


def recommend(movies, top_n=9):
    """Print the titles of the movies most similar to ``movies``.

    Parameters
    ----------
    movies : str
        Exact title to look up in the ``title`` column of the notebook-level
        ``data`` frame. If several rows share the title, the first is used.
    top_n : int, default 9
        Number of recommendations to print. The default keeps the nine
        titles this function has always printed.

    Raises
    ------
    IndexError
        If no row of ``data`` has the requested title.
    """
    # Use the row *position*, not the index label: rows of `similarity` are
    # positional, so a label is only correct while `data` has a default index.
    hits = (data['title'] == movies).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f"no movie titled {movies!r} found in data['title']")
    movies_index = int(hits[0])

    distances = similarity[movies_index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])

    # Exclude the queried movie by position instead of assuming it sorts first:
    # with ties at the top (an exact duplicate, or an all-zero tag vector whose
    # similarities are all 0) the stable sort can put another row ahead of it.
    movies_list = [pair for pair in ranked if pair[0] != movies_index][:top_n]

    for i in movies_list:
        print(data['title'].iloc[i[0]])

In [22]:
recommend('Avatar')

Aliens vs Predator: Requiem
Predator
Titan A.E.
Independence Day
Aliens
Meet Dave
Battle: Los Angeles
Falcon Rising
Lifeforce


In [23]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)

In [24]:
# Reload the matrix from disk; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load a similarity.pkl
# that this notebook (or another trusted source) produced.
with open('similarity.pkl', 'rb') as fh:
    similarity = pickle.load(fh)

In [25]:
recommend('Batman Begins')

The Dark Knight
Batman
The Dark Knight Rises
Batman
Batman v Superman: Dawn of Justice
Rockaway
10th & Wolf
RED
City By The Sea
