In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv('movies.csv')
df['avg_vote'] = df['avg_vote'].astype('float16')
df['country'] = df['country'].astype(str)
df['description'] = df['description'].astype(str)
df['genre'] = df['genre'].astype(str)
df['actors'] = df['actors'].astype(str)
df.head()

Unnamed: 0,imdb_title_id,title,genre,country,actors,description,avg_vote
0,tt7822474,Suvarna Sundari,"Action, Thriller",India,"Shamna Kasim, Sakshi Chaudhary, Jaya Prada, Av...","The movie revolves around an idol, Suvarna Sun...",9.898438
1,tt11207902,Lejos de Casa pelicula Venezolana,"Drama, History",Venezuela,"Angibell, Gabriel Buitrago, Darwing, Dariana J...","Samuel, a young Venezuelan, emigrates from his...",9.796875
2,tt11976170,Jeeudo,"Drama, Romance",India,"Prashant Baraili, Davina, Pranisha Gahatraj, P...","""A TALE OF LOVE, MYTH, AND FATE"" ""JEEUDO"" (mea...",9.796875
3,tt12492650,Ek,"Action, Drama",India,"Bishnu Adhikari, Himanshi Khurana, Aparna Shar...",EK is a love story set in the context of ever ...,9.796875
4,tt9008642,Notuku Potu,"Horror, Mystery",India,"Mumait Khan, Manisha Koirala, Arjun Sarja",Notuku Potu is a crime story laced with elemen...,9.796875


In [3]:
tfidf = TfidfVectorizer(stop_words='english')
df['description'] = df['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['description'][0:10000])
tfidf_matrix.shape

(10000, 22521)

In [4]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [5]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [6]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [7]:
movie = input('Enter Movie Name: ')
recommended1 = get_recommendations(movie)

Enter Movie Name: The Avengers


In [8]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [9]:
# Apply clean_data function to your features.
features = ['country', 'actors', 'genre']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [10]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]


In [11]:
def create_soup(x):
    return ' ' + x['genre'] + ' ' + x['country'] + ' ' + x['actors']
df['soup'] = df.apply(create_soup, axis=1)
df.head()

Unnamed: 0,imdb_title_id,title,genre,country,actors,description,avg_vote,soup
0,tt7822474,Suvarna Sundari,"action,thriller",india,"shamnakasim,sakshichaudhary,jayaprada,avantika...","The movie revolves around an idol, Suvarna Sun...",9.898438,"action,thriller india shamnakasim,sakshichaud..."
1,tt11207902,Lejos de Casa pelicula Venezolana,"drama,history",venezuela,"angibell,gabrielbuitrago,darwing,darianajozh,m...","Samuel, a young Venezuelan, emigrates from his...",9.796875,"drama,history venezuela angibell,gabrielbuitr..."
2,tt11976170,Jeeudo,"drama,romance",india,"prashantbaraili,davina,pranishagahatraj,pujang...","""A TALE OF LOVE, MYTH, AND FATE"" ""JEEUDO"" (mea...",9.796875,"drama,romance india prashantbaraili,davina,pr..."
3,tt12492650,Ek,"action,drama",india,"bishnuadhikari,himanshikhurana,aparnasharma,pr...",EK is a love story set in the context of ever ...,9.796875,"action,drama india bishnuadhikari,himanshikhu..."
4,tt9008642,Notuku Potu,"horror,mystery",india,"mumaitkhan,manishakoirala,arjunsarja",Notuku Potu is a crime story laced with elemen...,9.796875,"horror,mystery india mumaitkhan,manishakoiral..."


In [12]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'][0:15000])

In [13]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [14]:
indices = pd.Series(df.index, index=df['title'])

In [15]:
recommended2 = get_recommendations(movie, cosine_sim2)

In [16]:
print('Recommendation Based On [Country, Genre, Actors]: ')
print(recommended1)
print('Recommendation Based On Story: ')
print(recommended2)

Recommendation Based On [Country, Genre, Actors]: 
8336                      Avengers: Age of Ultron
2408                     E.T. - L'extra-terrestre
7806                                 Galaxy Quest
7450                                     Superman
2885                         Ultimatum alla Terra
218                                  Interstellar
2906                           Maksim Perepelitsa
4490                                     Loveless
9269    Queen Millennia - La regina dei 1000 anni
1050                                      La cosa
Name: title, dtype: object
Recommendation Based On Story: 
8336                 Avengers: Age of Ultron
459                   Avengers: Infinity War
2721              Captain America: Civil War
3425     Captain America: The Winter Soldier
460                        Avengers: Endgame
2032                                Iron Man
10155                             Iron Man 3
5593               Spider-Man: Far from Home
6765                  Spider-M