# **Movie Recommendation System**


# **Objective**
This project is used to make recommendations on movies based on previously watched movies data

# **Data Source**
The sample dataset used for this project is taken from kaggle.com


# **Import Library**

In [44]:
import pandas as pd
import numpy as np

# **Import data**

In [2]:
movies = pd.read_csv('dataset.csv')

# **Describe Data**

In [3]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [4]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [6]:
movies['Tags']=movies['genre']+movies['overview']

In [48]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,Tags
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,"Drama,CrimeIn the continuing saga of the Corle..."


In [8]:
new_df = movies[['id','title','genre','overview','Tags']]


In [9]:
new_df=new_df.drop(columns=['genre','overview'])

In [10]:
new_df.head()

Unnamed: 0,id,title,Tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


# **Data Preprocessing**


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cv=CountVectorizer(max_features=1000,stop_words='english')

In [13]:
cv

In [14]:
vec = cv.fit_transform(new_df['Tags'].values.astype('U')).toarray()

In [15]:
vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
vec.shape

(10000, 1000)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
sim = cosine_similarity(vec)

In [19]:
sim

array([[1.        , 0.11009638, 0.09534626, ..., 0.1254363 , 0.11396058,
        0.05025189],
       [0.11009638, 1.        , 0.17320508, ..., 0.        , 0.        ,
        0.        ],
       [0.09534626, 0.17320508, 1.        , ..., 0.0438529 , 0.05976143,
        0.        ],
       ...,
       [0.1254363 , 0.        , 0.0438529 , ..., 1.        , 0.05241424,
        0.04622502],
       [0.11396058, 0.        , 0.05976143, ..., 0.05241424, 1.        ,
        0.06299408],
       [0.05025189, 0.        , 0.        , ..., 0.04622502, 0.06299408,
        1.        ]])

In [22]:
new_df[new_df['title']=='Dilwale Dulhania Le Jayenge']

Unnamed: 0,id,title,Tags
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."


In [23]:
dist = sorted(list(enumerate(sim[0])),reverse=True, key = lambda vec:vec[1])

In [24]:
dist

[(0, 1.0),
 (3709, 0.3636363636363636),
 (9006, 0.3344968040028363),
 (698, 0.3046358979224712),
 (1175, 0.30151134457776363),
 (1480, 0.3015113445777636),
 (715, 0.2984810028978546),
 (7271, 0.2956561979945413),
 (1646, 0.29346959282671103),
 (4201, 0.29346959282671103),
 (1009, 0.28603877677367767),
 (1264, 0.28603877677367767),
 (3159, 0.28603877677367767),
 (2605, 0.28426762180748055),
 (6758, 0.28426762180748055),
 (7478, 0.28426762180748055),
 (2963, 0.2771326538627135),
 (3649, 0.2696799449852968),
 (4068, 0.2696799449852968),
 (4632, 0.2696799449852968),
 (6294, 0.2696799449852968),
 (5428, 0.2667325346846322),
 (4273, 0.266500895444513),
 (9718, 0.266500895444513),
 (6093, 0.26637086328481074),
 (7324, 0.2638224265055431),
 (4490, 0.2611164839335468),
 (4987, 0.2611164839335468),
 (5027, 0.2611164839335468),
 (4811, 0.26111648393354675),
 (1507, 0.25854384499750954),
 (4234, 0.25854384499750954),
 (9520, 0.25854384499750954),
 (1038, 0.25712973861328997),
 (2925, 0.25712973861

In [39]:
for i in dist[0:5]:
  print(new_df.iloc[i[0]].title)

The Shawshank Redemption
Anything for Her
The Getaway
Cool Hand Luke
Undisputed III: Redemption


# **Prediction**

In [40]:
def recommend(movies):
  index = new_df[new_df['title']==movies].index[0]
  distance = sorted(list(enumerate(sim[0])),reverse=True, key = lambda vec:vec[1])
  for i in distance[0:5]:
    print(new_df.iloc[i[0]].title)