As always, we first import the **libraries** that we are going to use



In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import string
pd.set_option('display.max_rows', None)
np.set_printoptions(edgeitems=100)
np.core.arrayprint._line_width = 200


## Content-based filtering ##

We will first develop the **content-based filtering system**.
1. Data **exploration**
2. Data **cleaning**
3. Features **plotting**
4. System builder

### Data exploration ###

Uploading data set with **+100k  movies** and info

In [2]:
movies=pd.read_csv("Datasets/IMDb movies.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [4]:
movies.dtypes

imdb_title_id             object
title                     object
original_title            object
year                      object
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
writer                    object
production_company        object
actors                    object
description               object
avg_vote                 float64
votes                      int64
budget                    object
usa_gross_income          object
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

We first create a new df with the features we are going to use to match similar movies 

In [5]:
movies_def= movies[["title","director","genre","country","description"]].copy()
movies_def.head()

Unnamed: 0,title,director,genre,country,description
0,Miss Jerry,Alexander Black,Romance,USA,The adventures of a female reporter in the 1890s.
1,The Story of the Kelly Gang,Charles Tait,"Biography, Crime, Drama",Australia,True story of notorious Australian outlaw Ned ...
2,Den sorte drøm,Urban Gad,Drama,"Germany, Denmark",Two men of high rank are both wooing the beaut...
3,Cleopatra,Charles L. Gaskill,"Drama, History",USA,The fabled queen of Egypt's affair with Roman ...
4,L'Inferno,"Francesco Bertolini, Adolfo Padovan","Adventure, Drama, Fantasy",Italy,Loosely adapted from Dante's Divine Comedy and...


We check for **NAN** and **fill** them with blank space

In [6]:
movies_def.isnull().sum()

title             0
director         87
genre             0
country          64
description    2115
dtype: int64

In [7]:
columns=["title","director","genre","country","description"]
for column in columns:
    movies_def[column]=movies_def[column].fillna(" ")
movies_def.head()

Unnamed: 0,title,director,genre,country,description
0,Miss Jerry,Alexander Black,Romance,USA,The adventures of a female reporter in the 1890s.
1,The Story of the Kelly Gang,Charles Tait,"Biography, Crime, Drama",Australia,True story of notorious Australian outlaw Ned ...
2,Den sorte drøm,Urban Gad,Drama,"Germany, Denmark",Two men of high rank are both wooing the beaut...
3,Cleopatra,Charles L. Gaskill,"Drama, History",USA,The fabled queen of Egypt's affair with Roman ...
4,L'Inferno,"Francesco Bertolini, Adolfo Padovan","Adventure, Drama, Fantasy",Italy,Loosely adapted from Dante's Divine Comedy and...


Define a **function** to combine all the features in one single row

In [8]:
def combined_features (row):
    try:
        return row["title"]+" "+row["director"]+" "+row["genre"]+" "+row["country"]+" "+row["description"]
    except:
        return "Error",row



Then we **apply** the function to the dataset so it combines all the feature columns into one containing a string with all the features

In [9]:
movies_def["combined_features"]=movies_def.apply(combined_features, axis =1)

In [10]:
movies_def["combined_features"]=movies_def["combined_features"].str.lower()# As the matrix doesn't take upper cases

Double check that the dataframe looks like we wanted and there is no missing values

In [11]:
movies_def.head()


Unnamed: 0,title,director,genre,country,description,combined_features
0,Miss Jerry,Alexander Black,Romance,USA,The adventures of a female reporter in the 1890s.,miss jerry alexander black romance usa the adv...
1,The Story of the Kelly Gang,Charles Tait,"Biography, Crime, Drama",Australia,True story of notorious Australian outlaw Ned ...,the story of the kelly gang charles tait biogr...
2,Den sorte drøm,Urban Gad,Drama,"Germany, Denmark",Two men of high rank are both wooing the beaut...,"den sorte drøm urban gad drama germany, denmar..."
3,Cleopatra,Charles L. Gaskill,"Drama, History",USA,The fabled queen of Egypt's affair with Roman ...,"cleopatra charles l. gaskill drama, history us..."
4,L'Inferno,"Francesco Bertolini, Adolfo Padovan","Adventure, Drama, Fantasy",Italy,Loosely adapted from Dante's Divine Comedy and...,"l'inferno francesco bertolini, adolfo padovan ..."


In [12]:
moviestest=movies_def.sample(10000)


In [13]:
movies_def.isnull().sum()

title                0
director             0
genre                0
country              0
description          0
combined_features    0
dtype: int64

Now we have to create the **count matrix** and compute **cosine similarity** for this new column with all the features values

In [14]:
# We import the model and fit the column into the matrix 
cv=CountVectorizer()
#count_matrix=cv.fit_transform(movies_def["combined_features"])

count_matrixtest=cv.fit_transform(moviestest["combined_features"]) # Test as the full one is to big 

In [15]:
# Compute the cosine similarity in the count matrix
#cos_sim=cosine_similarity(count_matrix)


cos_simtest=cosine_similarity(count_matrixtest)# Test as the full one is to big 
cos_simtest

array([[1.        , 0.10269923, 0.06512896, 0.02567481, 0.15996801,
        0.13815939, 0.14256649, 0.1301448 , 0.1450037 , 0.08924215,
        0.        , 0.1136331 , 0.13968606, 0.07502345, 0.09877296,
        0.14354634, 0.07317073, 0.15160183, 0.02302656, 0.08700222,
        0.06907969, 0.05609927, 0.05902813, 0.10726058, 0.05205792,
        0.15263729, 0.16868694, 0.16480856, 0.        , 0.09988907,
        0.09756098, 0.12097168, 0.17325923, 0.0378777 , 0.0208696 ,
        0.06625892, 0.12503908, 0.13121221, 0.04211693, 0.09838022,
        0.12251278, 0.04462107, 0.03107974, 0.13251783, 0.08580846,
        0.06435635, 0.09756098, 0.12837404, 0.09639254, 0.16311817,
        0.16671393, 0.03062819, 0.10269923, 0.11043153, 0.01779765,
        0.08553989, 0.06011131, 0.08132837, 0.08662962, 0.24502556,
        0.05702659, 0.08553989, 0.13251783, 0.05902813, 0.12503908,
        0.08202648, 0.09312404, 0.04938648, 0.12493901, 0.14354634,
        0.06011131, 0.07808688, 0.04708816, 0.06

Defining functions to get **index from title and title from index**

In [16]:
def get_index(title):
    return movies[movies.title==title].index.values[0]
def get_title(index):
    return movies[movies.index==index]["title"].values[0]


Defining the function to get  top 5 **recommended movies** based on similarity

In [17]:
def movie_recomendation():
    movie_user_likes=input("Please write your choice here...")
    movie_user_index=get_index(movie_user_likes)
    sim_movies=list(enumerate(cos_simtest[movie_user_index]))
    sorted_sim_movies=sorted(sim_movies,key=lambda x: x[1],reverse= True)
    i=0
    for movie in sorted_sim_movies:
        print(get_title(movie[0]))
        i=i+1
        if i>10:
            break

In [25]:
movie_recomendation()

Please write your choice here...Cinderella
Cinderella
Shadow in the Sky
The Peterville Diamond
La tragedia del 'Silver Queen'
La strada proibita
Flowing Gold
Peccatori
Quegli anni selvaggi
Redskin
Sensation Hunters
The Mark of the Whistler
