# Importing essential libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Storing contents of csv file in a dataframe

In [2]:
movies = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv')

In [3]:
movies.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


# Dropping unnecessary columns

In [4]:
movies = movies.drop(['Unnamed: 0','ID','Netflix','Hulu','Prime Video','Disney+','Type'],axis=1)

In [5]:
movies.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Directors,Genres,Country,Language,Runtime
0,Inception,2010,13+,8.8,87%,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,The Matrix,1999,18+,8.7,87%,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,Avengers: Infinity War,2018,13+,8.5,84%,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,Back to the Future,1985,7+,8.5,96%,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16744 entries, 0 to 16743
Data columns (total 10 columns):
Title              16744 non-null object
Year               16744 non-null int64
Age                7354 non-null object
IMDb               16173 non-null float64
Rotten Tomatoes    5158 non-null object
Directors          16018 non-null object
Genres             16469 non-null object
Country            16309 non-null object
Language           16145 non-null object
Runtime            16152 non-null float64
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB


# Filling missing data

In [7]:
movies['Age'] = movies['Age'].fillna(' ')
movies['Rotten Tomatoes'] = movies['Rotten Tomatoes'].fillna(' ')
movies['Directors'] = movies['Directors'].fillna(' ')
movies['Genres'] = movies['Genres'].fillna(' ')
movies['Country'] = movies['Country'].fillna(' ')
movies['Language'] = movies['Language'].fillna(' ')

# Combinig all the columns into one column named 'Key Notes' 

In [8]:
movies['Key Notes'] = ''
for i in range(len(movies['Title'])):
    year = str(movies['Year'][i])
    age = str(movies['Age'][i])
    IMDb = str(movies['IMDb'][i])
    rt = str(movies['Rotten Tomatoes'][i])
    genre = ''.join(movies['Genres'][i].replace(',',' ')).lower()
    director = ''.join(movies['Directors'][i].replace(',',' ')).lower()
    country = ''.join(movies['Country'][i].replace(',', ' ')).lower()
    language = ''.join(movies['Language'][i].replace(',',' ')).lower()
    run = str(movies['Runtime'][i])
    movies['Key Notes'][i] = year+ ' '+age+' '+IMDb+' '+rt+' '+genre+' '+director+' '+country+' '+language+' '+run 

recommend = movies[['Title','Key Notes']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [9]:
recommend.head()

Unnamed: 0,Title,Key Notes
0,Inception,2010 13+ 8.8 87% action adventure sci-fi thril...
1,The Matrix,1999 18+ 8.7 87% action sci-fi lana wachowski ...
2,Avengers: Infinity War,2018 13+ 8.5 84% action adventure sci-fi antho...
3,Back to the Future,1985 7+ 8.5 96% adventure comedy sci-fi robert...
4,"The Good, the Bad and the Ugly",1966 18+ 8.8 97% western sergio leone italy sp...


# Creating cosine similarity matrix from tokenized 'Key Notes' column

In [10]:
cv = CountVectorizer()
count_mat = cv.fit_transform(recommend['Key Notes'])
cosine_sim = cosine_similarity(count_mat,count_mat)
print(cosine_sim)

[[1.         0.4472136  0.48809353 ... 0.31622777 0.2981424  0.26967994]
 [0.4472136  1.         0.36380344 ... 0.26516504 0.25       0.22613351]
 [0.48809353 0.36380344 1.         ... 0.25724788 0.24253563 0.21938173]
 ...
 [0.31622777 0.26516504 0.25724788 ... 1.         0.58925565 0.53300179]
 [0.2981424  0.25       0.24253563 ... 0.58925565 1.         0.50251891]
 [0.26967994 0.22613351 0.21938173 ... 0.53300179 0.50251891 1.        ]]


# Function to find similar movies based on cosine similarity

In [11]:
indices = pd.Series(recommend['Title'])
def recommend_movie(name):
    movie=[]
    idx = indices[indices == name].index[0]
    sort_index = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    recommendation_5= sort_index.iloc[1:5]
    for i in recommendation_5.index:
        movie.append(indices[i])
    return movie

In [12]:
print(recommend_movie('Zindagi Na Milegi Dobara'))

['Luck by Chance', 'Love Breakups Zindagi', 'My Friend Pinto', 'Dil Dhadakne Do']


# Function for user to input a movie name and find similar movies

In [13]:
def rec():
    try:
        i = 1
        while(i > 0):
            name = input("Enter The Name of a Movie or Tv Show: ")
            if name.lower() == 'quit':
                break
            else:
                print(recommend_movie(name))

    except KeyboardInterrupt:
        print("The movie or Tv Show does not exist\n")
        rec()

    except IndexError:
        print("The movie or Tv Show does not exist\n")
        rec()
        

print("To exit Enter \"quit\" \n")
rec()

To exit Enter "quit" 

Enter The Name of a Movie or Tv Show: Avengers: Infinity War
['Captain America: The Winter Soldier', 'Avengers: Endgame', 'Captain America: Civil War', 'Solo: A Star Wars Story']
Enter The Name of a Movie or Tv Show: quit
