# Data preparation

### Import Libararies

In [None]:
# Importing libararies and packages
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Import Dataset

In [None]:
# Importing the dataset
movies_dataset = pd.read_csv('./movies_dataset.csv')

In [None]:
# Print the first 5 rows in the dataset
movies_dataset.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [None]:
# Print the first 5 rows in the dataset
movies_dataset.tail()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
9995,10196,The Last Airbender,"Action,Adventure,Fantasy",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",en,The sharks take bite out of the East Coast whe...,12.49,2015-07-22,4.7,417
9997,13995,Captain America,"Action,Science Fiction,War",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",en,A man named Farmer sets out to rescue his kidn...,15.159,2007-11-29,4.7,668
9999,455957,Domino,"Thriller,Action,Crime",en,Seeking justice for his partner’s murder by an...,16.482,2019-05-31,4.6,221


In [None]:
# Get the columns of our dataset
movies_dataset.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [None]:
movies_dataset.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [None]:
movies_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


### Feature Selection

In [None]:
# Here we chose the desired features that we will use in our model
movies_dataset = movies_dataset [ [ 'id', 'title', 'overview', 'genre'] ]

In [None]:
# Print the last 5 rows in the dataset
movies_dataset.tail()

Unnamed: 0,id,title,overview,genre
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo...","Action,Adventure,Fantasy"
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...,"Action,TV Movie,Science Fiction,Comedy,Adventure"
9997,13995,Captain America,"During World War II, a brave, patriotic Americ...","Action,Science Fiction,War"
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...,"Adventure,Fantasy,Action,Drama"
9999,455957,Domino,Seeking justice for his partner’s murder by an...,"Thriller,Action,Crime"


In [None]:
# Merge the overview and genre columns to gather and add it in a new column called tags
movies_dataset['tags'] = movies_dataset['overview']+movies_dataset['genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_dataset['tags'] = movies_dataset['overview']+movies_dataset['genre']


In [None]:
# Print the first 5 rows of the dataset
movies_dataset.head()

Unnamed: 0,id,title,overview,genre,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [None]:
# Drop the overview and genre columns
movies_dataset.drop(columns=['overview','genre'],inplace = True)

In [None]:
# The shape of the dataset
movies_dataset.shape

(10000, 3)

In [None]:
# Print the first 5 rows of the dataset
movies_dataset.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...


In [None]:
# Print the last 5 rows of the dataset
movies_dataset.tail()

Unnamed: 0,id,title,tags
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...
9999,455957,Domino,Seeking justice for his partner’s murder by an...


### Convert Text To Numberical Using NLP (CountVectorizer)


In [None]:
# Intialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=10000,stop_words='english')

# Fit and transform the tags column
vector = count_vectorizer.fit_transform(movies_dataset['tags'].values.astype('U')).toarray()

In [None]:
# The vector shape
vector.shape

(10000, 10000)

### Data Cleaning

In [None]:
# Check is there null or nots
movies_dataset.isnull().sum()

id        0
title     0
tags     15
dtype: int64

# Building The Movie Recommender Model

### Consine Similarity Object

In [None]:
# Creating the similarity object
similarity = cosine_similarity(vector)

In [None]:
similarity

array([[1.        , 0.05634362, 0.12888482, ..., 0.07559289, 0.11065667,
        0.06388766],
       [0.05634362, 1.        , 0.07624929, ..., 0.        , 0.03636965,
        0.        ],
       [0.12888482, 0.07624929, 1.        , ..., 0.02273314, 0.06655583,
        0.08645856],
       ...,
       [0.07559289, 0.        , 0.02273314, ..., 1.        , 0.03253   ,
        0.02817181],
       [0.11065667, 0.03636965, 0.06655583, ..., 0.03253   , 1.        ,
        0.0412393 ],
       [0.06388766, 0.        , 0.08645856, ..., 0.02817181, 0.0412393 ,
        1.        ]])

### Testing Example

In [None]:
# Our movie's ID is 13995 (Captain America), We are trying to print the index of it
movies_dataset[movies_dataset['id'] == 13995].index[0]

9997

In [None]:
# Here we are printing the dimentions of move of index 2
sorted(list(enumerate(similarity[9997])),reverse=True, key=lambda vector: vector[1])

[(9997, 1.0),
 (3329, 0.4222222222222223),
 (1788, 0.37712361663282534),
 (862, 0.32338083338177737),
 (1441, 0.31622776601683794),
 (1004, 0.3108349360801046),
 (4268, 0.29408584883752314),
 (5576, 0.2892406104248749),
 (2072, 0.2854496128592251),
 (2031, 0.26943012562182533),
 (4996, 0.2683281572999748),
 (4896, 0.267739776300833),
 (7655, 0.26024001945294223),
 (1813, 0.2592724864350674),
 (2630, 0.25609110844884536),
 (6179, 0.25425669046549126),
 (6409, 0.25425669046549126),
 (7256, 0.25425669046549126),
 (9529, 0.25425669046549126),
 (1353, 0.25308553412176554),
 (2743, 0.24806946917841693),
 (1633, 0.2459549291242073),
 (1593, 0.23870495801314429),
 (4520, 0.23851391759997756),
 (968, 0.23354968324845693),
 (2093, 0.23333333333333334),
 (1251, 0.23094010767585033),
 (2809, 0.2309401076758503),
 (1558, 0.2295101242196988),
 (2859, 0.2295101242196988),
 (5757, 0.2295101242196988),
 (7026, 0.22771001702132446),
 (1032, 0.2253744679276044),
 (4444, 0.2253744679276044),
 (295, 0.2236

In [None]:
# The result of the disered movie
movies_dataset.loc[3329,'title']

'Captain America: The First Avenger'

In [None]:
# A function for recommend moevies and printing movies titles
def recommend_movies_titles(movie_id):

  # Get the movie index via movie_id
  movie_index = movies_dataset.index[movies_dataset['id'] == movie_id].tolist()[0]

  # The movies distances from the user movie and the other movies
  distances =  sorted(list(enumerate(similarity[movie_index])),reverse=True, key= lambda vector:vector[1])

  # Here we will get the top 10 movies that similar to the user's moive
  for i in distances[1:10]:
    # The recommended movie
    recommended_movie = movies_dataset.iloc[i[0]].title
    # return the recommended movies IDs
    print('> {}'.format(recommended_movie))

In [None]:
# Here, We're trying to get the related movies to a desired movie ID
recommend_movies_titles(13995)

> Captain America: The First Avenger
> Team Thor
> Captain America: The Winter Soldier
> Letters from Iwo Jima
> Justice Society: World War II
> Ultimate Avengers: The Movie
> Red Tails
> When the Wind Blows
> The Counterfeiters


# Recommend Movies Function

In [None]:
# A function for recommend moevies and returning the movies IDs
def recommend_movies(movie_id):

  # Get the movie index via movie_id
  movie_index = movies_dataset.index[movies_dataset['id'] == movie_id].tolist()[0]

  # The movies distances from the user movie and the other movies
  distances =  sorted(list(enumerate(similarity[movie_index])),reverse=True, key= lambda vector:vector[1])


  # The list of movies IDs, That will fill
  recommended_movies_ids = []

  # Here we will get the top 10 movies that similar to the user's moive
  for i in distances[1:10]:
    # The recommended movie
    rec_movie_id = movies_dataset.iloc[i[0]].id
    # Add the recommended movies IDs to recommended_movies_ids
    recommended_movies_ids.append(rec_movie_id)

  # Returning the movies IDs
  return recommended_movies_ids

In [None]:
recommend_movies(238)

[240, 190955, 475557, 396774, 339103, 13012, 1580, 382322, 14580]