# Developing a Movie Recommendation System using spaCy and NLP

## Getting Started
To begin, ensure you have spaCy installed in your environment:

In [89]:
# !pip install spacy
!pip3 install spacy



In [90]:
# !python -m spacy download en_core_web_md
# !python3 -m spacy download en_core_web_md
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


# Phase 1: Data Preparation

First, download the dataset from Kaggle, specifically the `movies_metadata.csv` file.

## Loading the Dataset

In [91]:
import pandas as pd
# Load the dataset
movies_metadata = pd.read_csv('./dataset/movies_metadata.csv', low_memory=False)


In [92]:
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [93]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [94]:
movies_metadata.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

## Initial Data Cleaning

In [95]:
# Selecting relevant columns and removing missing values
movies_metadata = movies_metadata[['title', 'overview']].dropna()

In [96]:
movies_metadata.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


# Phase 2: NLP Preprocessing

We'll use spaCy for text processing to prepare our movie descriptions.

## Preprocessing Function

In [97]:
import spacy
nlp = spacy.load('en_core_web_lg')

def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])


## Applying Preprocessing

In [129]:
# This takes many minutes depending on internet speed (rather run the next cell)
movies_metadata['processed_overview'] = movies_metadata['overview'].apply(preprocess)


In [130]:
# Only use a small subset of the data

# movies_metadata = movies_metadata.sample(5000, random_state=42)

# movies_metadata.head()

In [100]:
# A bit quicker with less accuracy

# movies_metadata['processed_overview'] = movies_metadata['overview'].apply(preprocess)

In [131]:
movies_metadata.head()

Unnamed: 0,title,overview,processed_overview,vector
18072,Undertow,An unusual ghost story set on the Peruvian sea...,unusual ghost story set peruvian seaside marri...,"[-1.1456169, -0.036510006, -1.3001771, -1.2833..."
5076,Burial Ground,Professor Ayres discovers a secret in an ancie...,professor ayres discover secret ancient stone ...,"[-0.09990842, -0.24466726, -0.99048996, -0.693..."
40871,Diggers,A couple hires a professional digger (undergro...,couple hire professional digger underground st...,"[-0.036168355, 0.7710308, -2.230075, -0.929048..."
7695,College,"To reconcile with his girlfriend, a bookish co...",reconcile girlfriend bookish college student t...,"[1.2536714, 1.7119963, -2.3417928, -1.2373142,..."
24381,Charlie Chan at Treasure Island,Charlie's investigation of a phony psychic dur...,charlie investigation phony psychic 1939 world...,"[-1.408762, -0.22870494, -1.0439839, -0.309689..."


# Phase 3: Feature Extraction

We'll transform movie descriptions into numerical vectors using spaCy's word vectors.

## Extracting Word Vectors

In [132]:
def get_vector(text):
    doc = nlp(text)
    return doc.vector

movies_metadata['vector'] = movies_metadata['processed_overview'].apply(get_vector)


In [133]:
# Vector dimensions for an example
movies_metadata['vector'].iloc[0].shape

# This is a 300-dimensional vector

(300,)

In [134]:
movies_metadata.head()

Unnamed: 0,title,overview,processed_overview,vector
18072,Undertow,An unusual ghost story set on the Peruvian sea...,unusual ghost story set peruvian seaside marri...,"[-1.1456169, -0.036510006, -1.3001771, -1.2833..."
5076,Burial Ground,Professor Ayres discovers a secret in an ancie...,professor ayres discover secret ancient stone ...,"[-0.09990842, -0.24466726, -0.99048996, -0.693..."
40871,Diggers,A couple hires a professional digger (undergro...,couple hire professional digger underground st...,"[-0.036168355, 0.7710308, -2.230075, -0.929048..."
7695,College,"To reconcile with his girlfriend, a bookish co...",reconcile girlfriend bookish college student t...,"[1.2536714, 1.7119963, -2.3417928, -1.2373142,..."
24381,Charlie Chan at Treasure Island,Charlie's investigation of a phony psychic dur...,charlie investigation phony psychic 1939 world...,"[-1.408762, -0.22870494, -1.0439839, -0.309689..."


# Phase 4: Building the Recommendation System

In [135]:
from sklearn.metrics.pairwise import cosine_similarity


## Recommendation Function

In [136]:
def recommend(input_description, n_recommendations=5):
    # Process the input description to get its vector
    input_vector = get_vector(preprocess(input_description))
    input_vector = input_vector.reshape(1, -1)  # Reshape for compatibility with cosine_similarity

    # Compute similarity with all movies
    similarity_scores = cosine_similarity(input_vector, list(movies_metadata['vector']))
    
    # Get top N similarities
    sim_scores_indices = similarity_scores[0].argsort()[-n_recommendations-1:-1][::-1]  # Exclude the input movie itself
    
    # Get movie titles based on indices
    recommended_titles = movies_metadata['title'].iloc[sim_scores_indices].tolist()
    
    # Get the corresponding similarity scores
    recommended_scores = similarity_scores[0][sim_scores_indices].tolist()
    
    # Combine titles and scores into a list of tuples
    recommendations = list(zip(recommended_titles, recommended_scores))
    
    return recommendations

# Phase 5: Evaluation and Refinement

In [137]:
# Get recommendations based on a few movie descriptions
recommend("A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world.")

[('8 Minutes Idle', 0.8856506943702698),
 ('Refuge', 0.8841311931610107),
 ('A Few Dollars for Django', 0.8793590664863586),
 ('Tiny Times', 0.8778293132781982),
 ('The Twilight Saga: Eclipse', 0.8769623041152954)]

In [138]:
# Get another recommendation
recommend("A young lion prince is cast out of his pride by his cruel uncle, who claims he killed his father. While the uncle rules with an iron paw, the prince grows up beyond the Savannah, living by a philosophy: No worries for the rest of your days.")

[('Behind the Sun', 0.8700199723243713),
 ('Red Riding Hood', 0.8589057326316833),
 ('The Salt Prince', 0.8579161763191223),
 ('Robin Hood', 0.8540676236152649),
 ('Alluda Majaka', 0.8536922335624695)]

In [139]:
# Get another recommendation
recommend("A superhero from Krypton is sent to Earth to protect it from evil forces.")

[('Clash of the Titans', 0.821895956993103),
 ('Captain Kronos: Vampire Hunter', 0.8041558265686035),
 ('Deathstalker II', 0.8011834621429443),
 ('Atomic Rulers', 0.7917302250862122),
 ('Yu-Gi-Oh! 3D: Bonds Beyond Time', 0.7916430830955505)]