In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process



In [2]:
# Load the movie metadata; reset_index() then exposes the default row index as
# a regular "index" column.
# NOTE(review): the absolute Windows path only resolves on the author's machine.
df = pd.read_csv(r"E:\nlp\data\movies.csv").reset_index()

In [3]:
# Give the "index" column produced by reset_index() an explicit identifier name.
df = df.rename({"index": "movie_id"}, axis="columns")

In [4]:
# Count the missing values in each column.
df.isna().sum()

movie_id         0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
dtype: int64

This dataset didn't have any null values.

In [5]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
# Last expression of the cell: displays (rows, columns).
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movie_id       5043 non-null   int64 
 1   director_name  5043 non-null   object
 2   actor_1_name   5043 non-null   object
 3   actor_2_name   5043 non-null   object
 4   actor_3_name   5043 non-null   object
 5   genres         5043 non-null   object
 6   movie_title    5043 non-null   object
dtypes: int64(1), object(6)
memory usage: 275.9+ KB
None


(5043, 7)

Combine the important features into a single text field, which is then used to compute the cosine similarity scores.

In [6]:
def importent_features(data):
    """Build the combined text used for similarity scoring, one string per row.

    Joins, in this order and separated by single spaces, the values of the
    ``movie_title``, ``director_name``, ``actor_1_name``, ``actor_2_name``,
    ``actor_3_name`` and ``genres`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the six columns above; every value must be a string.

    Returns
    -------
    list of str
        One combined string per row, in row order.

    Raises
    ------
    TypeError
        If any of the joined values is not a string (for example NaN).
    """
    feature_columns = (
        "movie_title",
        "director_name",
        "actor_1_name",
        "actor_2_name",
        "actor_3_name",
        "genres",
    )
    # Walk the columns in parallel by position. Looking values up with
    # data[col][i] is label-based, so it fails (or picks the wrong rows) as
    # soon as the frame's index is not exactly 0..n-1, e.g. after filtering.
    return [
        " ".join(parts)
        for parts in zip(*(data[col] for col in feature_columns))
    ]

In [7]:
# Store the combined text as a new column (the column name keeps the function's spelling).
df["importent_feature"] = importent_features(df)

In [8]:
# Display the DataFrame for a visual check.
df

Unnamed: 0,movie_id,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,importent_feature
0,0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,avatar James Cameron CCH Pounder Joel David Mo...
1,1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,pirates of the caribbean: at world's end Gore ...
2,2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,spectre Sam Mendes Christoph Waltz Rory Kinnea...
3,3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,the dark knight rises Christopher Nolan Tom Ha...
4,4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,star wars: episode vii - the force awakens ...
...,...,...,...,...,...,...,...,...
5038,5038,Scott Smith,Eric Mabius,Daphne Zuniga,Crystal Lowe,Comedy Drama,signed sealed delivered,signed sealed delivered Scott Smith Eric Mabiu...
5039,5039,unknown,Natalie Zea,Valorie Curry,Sam Underwood,Crime Drama Mystery Thriller,the following,the following unknown Natalie Zea ...
5040,5040,Benjamin Roberds,Eva Boehnke,Maxwell Moody,David Chandler,Drama Horror Thriller,a plague so pleasant,a plague so pleasant Benjamin Roberds Eva Boeh...
5041,5041,Daniel Hsia,Alan Ruck,Daniel Henney,Eliza Coupe,Comedy Drama Romance,shanghai calling,shanghai calling Daniel Hsia Alan Ruck Daniel ...


In [32]:
# Save the frame to the working directory. With to_csv's defaults the row index
# is written as well, as an extra unnamed first column.
df.to_csv(r'movies_final.csv')

In [9]:
# Convert the combined text into a bag-of-words count matrix.
# The fitted vectorizer gets its own name instead of being overwritten by the
# matrix, so its vocabulary stays available for inspection or for transforming
# new text.
vectorizer = CountVectorizer()
# `cv` is the sparse document-term matrix; later cells use it under this name.
cv = vectorizer.fit_transform(df["importent_feature"])

In [10]:
# Inspect the vectorised feature matrix.
cv

<5043x12652 sparse matrix of type '<class 'numpy.int64'>'
	with 69301 stored elements in Compressed Sparse Row format>

In [11]:
# Pairwise cosine similarity between the rows of the count matrix:
# cs[i][j] is the similarity between row i and row j.
cs = cosine_similarity(cv)

In [12]:
# Inspect the pairwise similarity matrix.
cs

array([[1.        , 0.18257419, 0.1490712 , ..., 0.06900656, 0.        ,
        0.        ],
       [0.18257419, 1.        , 0.13608276, ..., 0.        , 0.        ,
        0.        ],
       [0.1490712 , 0.13608276, 1.        , ..., 0.07715167, 0.        ,
        0.        ],
       ...,
       [0.06900656, 0.        , 0.07715167, ..., 1.        , 0.06900656,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06900656, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [30]:
import pickle

# Persist the similarity matrix to disk. The with-block guarantees the file is
# closed even if pickle.dump() raises part-way through.
with open("cosine.pkl", "wb") as pickle_out:
    pickle.dump(cs, pickle_out)

In [13]:
# Check the matrix dimensions (rows, columns).
cs.shape

(5043, 5043)

In [14]:
def top_10(movie_title, min_score=80):
    """Print the ten movies most similar to the closest title match.

    The query is fuzzy-matched against ``df["movie_title"]``. When the best
    match scores above ``min_score``, the ten highest cosine-similarity entries
    of ``cs`` for that movie are printed as a numbered list; otherwise an
    apology message is printed.

    Relies on the notebook-level ``df`` and ``cs``, and on ``df`` keeping its
    default 0..n-1 index so that the index label returned by the fuzzy match is
    also the movie's row in ``cs``.

    Parameters
    ----------
    movie_title : str
        Title typed by the user; it does not have to be an exact match.
    min_score : int, optional
        Fuzzy-match score the best candidate must exceed (default 80).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    match = process.extractOne(movie_title, df["movie_title"])
    # extractOne returns None when there is nothing to match against; treat
    # that like a poor match instead of failing on match[1].
    if match is None or match[1] <= min_score:
        print("Sorry! The movie you requested is not in our database. Please check the spelling or try with other movies!")
        return

    # For a Series of choices the match is (title, score, index label).
    movie_id = match[2]

    # Exclude the query movie by id rather than dropping whatever sorts first:
    # duplicate entries can tie with it at the top similarity, and then the
    # first sorted element is not guaranteed to be the query itself.
    candidates = [
        (row, score)
        for row, score in enumerate(cs[movie_id])
        if row != movie_id
    ]
    # Stable sort, so equally similar movies keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    for rank, (row, _score) in enumerate(candidates[:10], start=1):
        recommended_title = df["movie_title"].iloc[row]
        print(f"{rank}.{recommended_title}")
        print("")

In [15]:
# Example query: print the recommendations for the best match to "iron man".
top_10("iron man")

1.iron man 2

2.iron man 3

3.zathura: a space adventure

4.deep impact

5.made

6.the avengers

7.the avengers

8.avengers: age of ultron

9.captain america: civil war

10.tron: legacy

