## Import All Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.stem.porter import PorterStemmer
import pickle
import re

## Read the movie dataset

In [2]:
df = pd.read_csv('movies.csv', encoding='latin-1')

## Exploratory Data Analysis

In [3]:
df.head(10)

Unnamed: 0,Name,Year of Release,Genres,Rating,Actors,Director,Description,Image
0,Road House,2024,Action Thriller,6.2,JakeGyllenhaal DanielaMelchior ConorMcGregor,DougLiman,Ex-UFC fighter Dalton takes a job as a bouncer...,https://m.media-amazon.com/images/M/MV5BNTFiNT...
1,Dune: Part Two,2024,Action Adventure Drama,8.8,TimothéeChalamet Zendaya RebeccaFerguson,DenisVilleneuve,Paul Atreides unites with Chani and the Fremen...,https://m.media-amazon.com/images/M/MV5BN2QyZG...
2,Godzilla x Kong: The New Empire,2024,Action Adventure Sci-Fi,6.6,RebeccaHall BrianTyreeHenry DanStevens,AdamWingard,"Two ancient titans, Godzilla and Kong, clash i...",https://m.media-amazon.com/images/M/MV5BY2QwOG...
3,Ghostbusters: Frozen Empire,2024,Adventure Comedy Fantasy,6.5,PaulRudd CarrieCoon FinnWolfhard,GilKenan,When the discovery of an ancient artifact unle...,https://m.media-amazon.com/images/M/MV5BNGE5MW...
4,Poor Things,2023,Comedy Drama Romance,8.0,EmmaStone MarkRuffalo WillemDafoe,YorgosLanthimos,An account of the fantastical evolution of Bel...,https://m.media-amazon.com/images/M/MV5BNGIyYW...
5,Road House,1989,Action Thriller,6.7,PatrickSwayze KellyLynch SamElliott,RowdyHerrington,A bouncer hired to clean up the baddest honkyt...,https://m.media-amazon.com/images/M/MV5BMTU1MT...
6,Beetlejuice Beetlejuice,2024,Comedy Fantasy Horror,,,,This is a follow-up to the comedy Beetlejuice ...,https://m.media-amazon.com/images/M/MV5BYTk2Yj...
7,Dune: Part One,2021,Action Adventure Drama,8.0,TimothéeChalamet RebeccaFerguson Zendaya,DenisVilleneuve,A noble family becomes embroiled in a war for ...,https://m.media-amazon.com/images/M/MV5BMDQ0Nj...
8,Damsel,2024,Action Adventure Fantasy,6.1,MillieBobbyBrown RayWinstone AngelaBassett,JuanCarlosFresnadillo,A dutiful damsel agrees to marry a handsome pr...,https://m.media-amazon.com/images/M/MV5BODRiMT...
9,Oppenheimer,2023,Biography Drama History,8.3,CillianMurphy EmilyBlunt MattDamon,ChristopherNolan,The story of American scientist J. Robert Oppe...,https://m.media-amazon.com/images/M/MV5BMDBmYT...


In [4]:
df.tail(10)

Unnamed: 0,Name,Year of Release,Genres,Rating,Actors,Director,Description,Image
1040,Us,2019,Horror Mystery Thriller,6.8,LupitaNyong'o WinstonDuke ElisabethMoss,JordanPeele,Adelaide Wilson and her family are attacked by...,https://m.media-amazon.com/images/M/MV5BZTliNW...
1041,Shaun of the Dead,2004,Comedy Horror,7.9,SimonPegg NickFrost KateAshfield,EdgarWright,"The uneventful, aimless lives of a London elec...",https://m.media-amazon.com/images/M/MV5BMTg5Mj...
1042,Room,2015,Drama Thriller,8.1,BrieLarson JacobTremblay SeanBridgers,LennyAbrahamson,A little boy is held captive in a room with hi...,https://m.media-amazon.com/images/M/MV5BMjE4Nz...
1043,Superhero Movie,2008,Action Comedy Sci-Fi,4.7,DrakeBell LeslieNielsen SaraPaxton,CraigMazin,Orphaned high school student Rick Riker is bit...,https://m.media-amazon.com/images/M/MV5BMTc0Nj...
1044,Speed,1994,Action Adventure Thriller,7.3,KeanuReeves DennisHopper SandraBullock,JandeBont,A young police officer must prevent a bomb exp...,https://m.media-amazon.com/images/M/MV5BYjc0Mj...
1045,Persuasion,2022,Drama Romance,5.8,RichardE.Grant HenryGolding BenBaileySmith,CarrieCracknell,Eight years after Anne Elliot was persuaded no...,https://m.media-amazon.com/images/M/MV5BZDg3Mz...
1046,The French Dispatch,2021,Comedy Drama Romance,7.1,BenicioDelToro AdrienBrody TildaSwinton,WesAnderson,A love letter to journalists set in an outpost...,https://m.media-amazon.com/images/M/MV5BNmQxZT...
1047,The Rules of Attraction,2002,Comedy Drama Romance,6.6,JamesVanDerBeek IanSomerhalder ShannynSossamon,RogerAvary,The incredibly spoiled and overprivileged stud...,https://m.media-amazon.com/images/M/MV5BOTc4OT...
1048,A Beautiful Day in the Neighborhood,2019,Biography Drama,7.2,MatthewRhys TomHanks ChrisCooper,MarielleHeller,Based on the true story of a real-life friends...,https://m.media-amazon.com/images/M/MV5BZWE3ZD...
1049,Life of Brian,1979,Comedy,8.0,GrahamChapman JohnCleese MichaelPalin,TerryJones,Born on the original Christmas in the stable n...,https://m.media-amazon.com/images/M/MV5BMDA1ZW...


In [5]:
df.isna().sum()

Name                0
Year of Release     0
Genres              8
Rating             58
Actors             58
Director           58
Description         2
Image               4
dtype: int64

In [6]:
df.columns

Index(['Name', 'Year of Release', 'Genres', 'Rating', 'Actors', 'Director',
       'Description', 'Image'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             1050 non-null   object 
 1   Year of Release  1050 non-null   object 
 2   Genres           1042 non-null   object 
 3   Rating           992 non-null    float64
 4   Actors           992 non-null    object 
 5   Director         992 non-null    object 
 6   Description      1048 non-null   object 
 7   Image            1046 non-null   object 
dtypes: float64(1), object(7)
memory usage: 65.8+ KB


## Preprocess and clean the data

### 1. Drop all the rows with no values (given as NaN, Na, N/A and so on)

In [8]:
df = df.dropna(axis=0).reset_index().drop(columns='index')
df

Unnamed: 0,Name,Year of Release,Genres,Rating,Actors,Director,Description,Image
0,Road House,2024,Action Thriller,6.2,JakeGyllenhaal DanielaMelchior ConorMcGregor,DougLiman,Ex-UFC fighter Dalton takes a job as a bouncer...,https://m.media-amazon.com/images/M/MV5BNTFiNT...
1,Dune: Part Two,2024,Action Adventure Drama,8.8,TimothéeChalamet Zendaya RebeccaFerguson,DenisVilleneuve,Paul Atreides unites with Chani and the Fremen...,https://m.media-amazon.com/images/M/MV5BN2QyZG...
2,Godzilla x Kong: The New Empire,2024,Action Adventure Sci-Fi,6.6,RebeccaHall BrianTyreeHenry DanStevens,AdamWingard,"Two ancient titans, Godzilla and Kong, clash i...",https://m.media-amazon.com/images/M/MV5BY2QwOG...
3,Ghostbusters: Frozen Empire,2024,Adventure Comedy Fantasy,6.5,PaulRudd CarrieCoon FinnWolfhard,GilKenan,When the discovery of an ancient artifact unle...,https://m.media-amazon.com/images/M/MV5BNGE5MW...
4,Poor Things,2023,Comedy Drama Romance,8.0,EmmaStone MarkRuffalo WillemDafoe,YorgosLanthimos,An account of the fantastical evolution of Bel...,https://m.media-amazon.com/images/M/MV5BNGIyYW...
...,...,...,...,...,...,...,...,...
985,Persuasion,2022,Drama Romance,5.8,RichardE.Grant HenryGolding BenBaileySmith,CarrieCracknell,Eight years after Anne Elliot was persuaded no...,https://m.media-amazon.com/images/M/MV5BZDg3Mz...
986,The French Dispatch,2021,Comedy Drama Romance,7.1,BenicioDelToro AdrienBrody TildaSwinton,WesAnderson,A love letter to journalists set in an outpost...,https://m.media-amazon.com/images/M/MV5BNmQxZT...
987,The Rules of Attraction,2002,Comedy Drama Romance,6.6,JamesVanDerBeek IanSomerhalder ShannynSossamon,RogerAvary,The incredibly spoiled and overprivileged stud...,https://m.media-amazon.com/images/M/MV5BOTc4OT...
988,A Beautiful Day in the Neighborhood,2019,Biography Drama,7.2,MatthewRhys TomHanks ChrisCooper,MarielleHeller,Based on the true story of a real-life friends...,https://m.media-amazon.com/images/M/MV5BZWE3ZD...


### 2. Clean the names of actors/directors by replacing non-alphabetic characters with blanks

In [9]:
def clean_text(text):
    cleaned_text = re.sub(r'[^\w\sÀ-ÿ]', '', str(text)) # removes non-alphabetic characters except accented letters and whitespace
    return cleaned_text

In [10]:
df['Director'] = df['Director'].apply(clean_text)
df['Actors'] = df['Actors'].apply(clean_text)

In [11]:
df

Unnamed: 0,Name,Year of Release,Genres,Rating,Actors,Director,Description,Image
0,Road House,2024,Action Thriller,6.2,JakeGyllenhaal DanielaMelchior ConorMcGregor,DougLiman,Ex-UFC fighter Dalton takes a job as a bouncer...,https://m.media-amazon.com/images/M/MV5BNTFiNT...
1,Dune: Part Two,2024,Action Adventure Drama,8.8,TimothéeChalamet Zendaya RebeccaFerguson,DenisVilleneuve,Paul Atreides unites with Chani and the Fremen...,https://m.media-amazon.com/images/M/MV5BN2QyZG...
2,Godzilla x Kong: The New Empire,2024,Action Adventure Sci-Fi,6.6,RebeccaHall BrianTyreeHenry DanStevens,AdamWingard,"Two ancient titans, Godzilla and Kong, clash i...",https://m.media-amazon.com/images/M/MV5BY2QwOG...
3,Ghostbusters: Frozen Empire,2024,Adventure Comedy Fantasy,6.5,PaulRudd CarrieCoon FinnWolfhard,GilKenan,When the discovery of an ancient artifact unle...,https://m.media-amazon.com/images/M/MV5BNGE5MW...
4,Poor Things,2023,Comedy Drama Romance,8.0,EmmaStone MarkRuffalo WillemDafoe,YorgosLanthimos,An account of the fantastical evolution of Bel...,https://m.media-amazon.com/images/M/MV5BNGIyYW...
...,...,...,...,...,...,...,...,...
985,Persuasion,2022,Drama Romance,5.8,RichardEGrant HenryGolding BenBaileySmith,CarrieCracknell,Eight years after Anne Elliot was persuaded no...,https://m.media-amazon.com/images/M/MV5BZDg3Mz...
986,The French Dispatch,2021,Comedy Drama Romance,7.1,BenicioDelToro AdrienBrody TildaSwinton,WesAnderson,A love letter to journalists set in an outpost...,https://m.media-amazon.com/images/M/MV5BNmQxZT...
987,The Rules of Attraction,2002,Comedy Drama Romance,6.6,JamesVanDerBeek IanSomerhalder ShannynSossamon,RogerAvary,The incredibly spoiled and overprivileged stud...,https://m.media-amazon.com/images/M/MV5BOTc4OT...
988,A Beautiful Day in the Neighborhood,2019,Biography Drama,7.2,MatthewRhys TomHanks ChrisCooper,MarielleHeller,Based on the true story of a real-life friends...,https://m.media-amazon.com/images/M/MV5BZWE3ZD...


### 3. Merge the essential columns

In [12]:
df['features'] =  df['Description'] + ' ' + df['Genres'] + df['Actors'] + df['Director']

In [13]:
df['features']

0      Ex-UFC fighter Dalton takes a job as a bouncer...
1      Paul Atreides unites with Chani and the Fremen...
2      Two ancient titans, Godzilla and Kong, clash i...
3      When the discovery of an ancient artifact unle...
4      An account of the fantastical evolution of Bel...
                             ...                        
985    Eight years after Anne Elliot was persuaded no...
986    A love letter to journalists set in an outpost...
987    The incredibly spoiled and overprivileged stud...
988    Based on the true story of a real-life friends...
989    Born on the original Christmas in the stable n...
Name: features, Length: 990, dtype: object

### 4. Convert the features to lower case for best results

In [14]:
df['features'] = df['features'].apply(lambda x: x.lower())
df['features'][0]

'ex-ufc fighter dalton takes a job as a bouncer at a florida keys roadhouse, only to discover that this paradise is not all it seems. action thriller jakegyllenhaal danielamelchior conormcgregor dougliman'

### 5. Apply stemming modules to normalize each feature

In [15]:
ps = PorterStemmer()

In [16]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [17]:
df['features'] = df['features'].apply(stem)

In [18]:
df['features'][0]

'ex-ufc fighter dalton take a job as a bouncer at a florida key roadhouse, onli to discov that thi paradis is not all it seems. action thriller jakegyllenha danielamelchior conormcgregor dougliman'

## Build a recommender model using vectorization techniques

In [19]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])

In [20]:
# Calculate cosine similarity between TF-IDF vectors
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

## Recommend 10 movies similar to the given movie

In [21]:
def recommendations(movie_name, n=10):
    movie_idx = df.index[df['Name'] == movie_name].tolist()
    if not movie_idx:
        print("Movie not found!")
        return None
    movie_idx = movie_idx[0]
    sim_scores = list(enumerate(cosine_sim[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # Do not include the movie itself (sim_scores[0])
    recommended_movies = [df.iloc[i[0]]['Name'] for i in sim_scores]
    return recommended_movies

In [22]:
movie_name = 'John Wick'  # Specify the movie name for which you want recommendations
top_recommendations = recommendations(movie_name)
if top_recommendations:
    for x in top_recommendations:
        print(x)

John Wick: Chapter 3 - Parabellum
John Wick: Chapter 2
Skyfall
The Fall Guy
Deadpool
Zodiac
John Wick: Chapter 4
Snatch
Taken
Mission: Impossible - Dead Reckoning Part One


## Convert the model and the dataset into pickle file format for use in front-end

In [23]:
dataset = df.to_dict('records')

In [24]:
pickle.dump(dataset,open('movie_list.pkl','wb'))
pickle.dump(cosine_sim,open('similarity.pkl','wb'))