In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("movies.csv")
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [5]:
df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [6]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [7]:
# dropping unnecessary values
df = df[['id','title','genre','overview','popularity']]
df.head()

Unnamed: 0,id,title,genre,overview,popularity
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,94.075
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",25.408
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",90.585
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,44.761
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,57.749


In [8]:
# merging important tags
df['tags'] = df['genre'] +" "+ df['overview']
df.head()

Unnamed: 0,id,title,genre,overview,popularity,tags
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,94.075,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",25.408,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",90.585,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,44.761,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,57.749,"Drama,Crime In the continuing saga of the Corl..."


In [9]:
df = df.drop(columns=['genre','overview'])
df.head()

Unnamed: 0,id,title,popularity,tags
0,278,The Shawshank Redemption,94.075,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,25.408,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,90.585,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,44.761,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,57.749,"Drama,Crime In the continuing saga of the Corl..."


In [10]:
 from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(max_features=10000,stop_words='english')
cv

In [12]:
vector = cv.fit_transform(df['tags'].values.astype('U')).toarray()
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
vector.shape

(10000, 10000)

In [14]:
# COSINE SIMILARITY
# 10000x10000
# max_features = 10000
# fit it using count vectorizer
# fits dimensions of 10000 vectors
# vectors have 10000 vectors
# example - there are two ACTIONS movie avatar and avenger
#           texts are similar here, so recommendations are similar
#           therefore, the angle is lowest here, which is the similarity
#           distance isn't used (not eucledian distance), only theita (cosine angle)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
sim = cosine_similarity(vector)
sim

array([[1.        , 0.05634362, 0.13041013, ..., 0.07559289, 0.11065667,
        0.06900656],
       [0.05634362, 1.        , 0.07715167, ..., 0.        , 0.03636965,
        0.        ],
       [0.13041013, 0.07715167, 1.        , ..., 0.02300219, 0.0673435 ,
        0.09449112],
       ...,
       [0.07559289, 0.        , 0.02300219, ..., 1.        , 0.03253   ,
        0.03042903],
       [0.11065667, 0.03636965, 0.0673435 , ..., 0.03253   , 1.        ,
        0.04454354],
       [0.06900656, 0.        , 0.09449112, ..., 0.03042903, 0.04454354,
        1.        ]])

In [17]:
# we need index for movie for recommendations
# search for title and then index
df[df['title']=='The Godfather'].index[0]

2

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          10000 non-null  int64  
 1   title       10000 non-null  object 
 2   popularity  10000 non-null  float64
 3   tags        9985 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 312.6+ KB


In [19]:
# calc distance based on similarity
# enumerate all distances into a list and arrange to top 10 -> descending
# 2 -> index of the movie searched
# key -> accessing values based on vectors
# vector[1] -> vector used
distance = sorted(list(enumerate(sim[2])), reverse=True, key = lambda vector:vector[1])
distance
# higher value -> higher similarity

[(2, 1.0000000000000004),
 (4, 0.4763305116224667),
 (7419, 0.35634832254989923),
 (153, 0.33946736991660215),
 (2624, 0.32732683535398854),
 (9520, 0.31497039417435607),
 (2412, 0.3118047822311618),
 (330, 0.30860669992418377),
 (5010, 0.30304576336566325),
 (779, 0.29957234475763905),
 (7049, 0.29957234475763905),
 (9362, 0.2969569354582493),
 (4569, 0.29607826273189597),
 (3670, 0.29277002188455997),
 (4872, 0.29277002188455997),
 (1816, 0.2891776156271279),
 (4811, 0.28867513459481287),
 (6788, 0.28867513459481287),
 (6964, 0.28368325730679006),
 (4380, 0.28319693016191544),
 (734, 0.27914526311954124),
 (5605, 0.27914526311954124),
 (1223, 0.2788866755113585),
 (6565, 0.2788866755113585),
 (9245, 0.2788866755113585),
 (8555, 0.27774602993176545),
 (709, 0.2760262237369417),
 (519, 0.2700308624336608),
 (821, 0.2700308624336608),
 (250, 0.26937401188058957),
 (8503, 0.2693740118805895),
 (747, 0.26837252006084666),
 (3742, 0.2683725200608466),
 (233, 0.2672612419124244),
 (7866, 0.

In [20]:
# accessing top 5 movie recommendations
# i[0] -> 0th index is id
# using dis[1:6] since dis[0] is the movie searched itself

for i in distance[1:6]:
    print(df.iloc[i[0]].title)

The Godfather: Part II
Blood Ties
Joker
Bomb City
Gotti


In [21]:
# creating a func to access recommendations
# shifting all code here
def recommend(movie):
    #movie -> title of the movie searched
    index = df[df['title']==movie].index[0]
    distance = sorted(list(enumerate(sim[index])), reverse=True, key = lambda vector:vector[1])
    for i in distance[1:6]:
        print(df.iloc[i[0]].title)

In [22]:
# calling function
recommend('Iron Man')

Iron Man 3
Guardians of the Galaxy Vol. 2
Avengers: Age of Ultron
Star Wars: Episode III - Revenge of the Sith
Iron Man 2


In [23]:
# using that to access it in web applications
import pickle
pickle.dump(df,open('movies_list.pk1','wb'))
pickle.dump(sim,open('similarity.pk1','wb'))

In [24]:
# view the dataset using pickle using the name
pickle.load(open('movies_list.pk1','rb'))

Unnamed: 0,id,title,popularity,tags
0,278,The Shawshank Redemption,94.075,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,25.408,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,90.585,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,44.761,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,57.749,"Drama,Crime In the continuing saga of the Corl..."
...,...,...,...,...
9995,10196,The Last Airbender,98.322,"Action,Adventure,Fantasy The story follows the..."
9996,331446,Sharknado 3: Oh Hell No!,12.490,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,18.333,"Action,Science Fiction,War During World War II..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,15.159,"Adventure,Fantasy,Action,Drama A man named Far..."
