In [192]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle

pd.set_option('display.max_columns', None)

In [2]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,109.72,8.7,17355
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,92.048,8.7,23144
2,315162,Puss in Boots: The Last Wish,Puss in Boots discovers that his passion for a...,2022-12-07,6689.647,8.6,2504
3,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,57.998,8.6,10503
4,424,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,55.299,8.6,13687


In [3]:
movies.shape

(10000, 7)

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10000 non-null  int64  
 1   title         10000 non-null  object 
 2   overview      9998 non-null   object 
 3   release_date  10000 non-null  object 
 4   popularity    10000 non-null  float64
 5   vote_average  10000 non-null  float64
 6   vote_count    10000 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 547.0+ KB


In [10]:
movies = movies.dropna()

In [11]:
movies.head()

Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,109.72,8.7,17355
1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,92.048,8.7,23144
2,315162,Puss in Boots: The Last Wish,Puss in Boots discovers that his passion for a...,2022-12-07,6689.647,8.6,2504
3,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,57.998,8.6,10503
4,424,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,55.299,8.6,13687


In [12]:
movies = movies.drop('id', axis=1)

In [16]:
movies['release_date'] = pd.to_datetime(movies['release_date'])

In [17]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9998 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   title         9998 non-null   object        
 1   overview      9998 non-null   object        
 2   release_date  9998 non-null   datetime64[ns]
 3   popularity    9998 non-null   float64       
 4   vote_average  9998 non-null   float64       
 5   vote_count    9998 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 546.8+ KB


In [18]:
movies['year'] = movies['release_date'].dt.year

In [22]:
movies = movies.drop('release_date', axis=1)

In [23]:
movies.head()

Unnamed: 0,title,overview,popularity,vote_average,vote_count,year
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",109.72,8.7,17355,1972
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,92.048,8.7,23144,1994
2,Puss in Boots: The Last Wish,Puss in Boots discovers that his passion for a...,6689.647,8.6,2504,2022
3,The Godfather Part II,In the continuing saga of the Corleone crime f...,57.998,8.6,10503,1974
4,Schindler's List,The true story of how businessman Oskar Schind...,55.299,8.6,13687,1993


In [58]:
vocab = []
for i in range(len(movies)):
    blob = movies.iloc[i, 1].split()
    
    for word in blob:
        vocab.append(word)

In [60]:
len(vocab)

466766

In [79]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [76]:
import string

In [77]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [80]:
def removing_punctions_stemming(text):
    y = []
    
    for i in text.split():
        if i not in string.punctuation:
            y.append(ps.stem(i))       
    
    return " ".join(y)
        

In [86]:
movies['overview'] = movies['overview'].apply(removing_punctions_stemming)

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [88]:
vectors = cv.fit_transform(movies['overview']).toarray()

In [89]:
vectors.shape

(9998, 5000)

In [90]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [108]:
movies.iloc[:, 2:].values

array([[1.097200e+02, 8.700000e+00, 1.735500e+04, 1.972000e+03],
       [9.204800e+01, 8.700000e+00, 2.314400e+04, 1.994000e+03],
       [6.689647e+03, 8.600000e+00, 2.504000e+03, 2.022000e+03],
       ...,
       [1.783100e+01, 5.400000e+00, 9.810000e+02, 2.013000e+03],
       [9.472000e+00, 5.400000e+00, 8.350000e+02, 2.011000e+03],
       [2.210500e+01, 5.400000e+00, 1.415000e+03, 2.001000e+03]])

In [109]:
final = np.concatenate((vectors, movies.iloc[:, 2:].values), axis=1)

In [110]:
final.shape

(9998, 5004)

In [111]:
from sklearn.metrics.pairwise import cosine_similarity

In [176]:
similarities = cosine_similarity(final)

In [177]:
similarities.shape

(9998, 9998)

In [178]:
similarities[4]

array([0.99950266, 0.99828037, 0.37662985, ..., 0.56304759, 0.51255441,
       0.6889963 ])

In [188]:
def recommendation(movie):
    index = np.where(movies['title'] == movie)[0][0]
    similar_movies = sorted(list(enumerate(similarities[index])), key = lambda x:x[1], reverse=True)[1:11]
    for i in similar_movies:
        print(movies.iloc[i[0], 0])
    

In [191]:
recommendation('The Godfather Part II')

Prometheus
Pirates of the Caribbean: Dead Men Tell No Tales
1917
Call Me by Your Name
Raiders of the Lost Ark
The Devil Wears Prada
Me Before You
A Star Is Born
Kick-Ass
American Beauty


In [196]:
with open("final.pkl", "wb") as f:
    # Use the pickle module to dump the data to the file
    pickle.dump(movies, f)

In [194]:
with open("similarity_scores.pkl", "wb") as f:
    # Use the pickle module to dump the data to the file
    pickle.dump(similarities, f)

In [197]:
movies

Unnamed: 0,title,overview,popularity,vote_average,vote_count,year
0,The Godfather,"span the year 1945 to 1955, a chronicl of the ...",109.720,8.7,17355,1972
1,The Shawshank Redemption,frame in the 1940 for the doubl murder of hi w...,92.048,8.7,23144,1994
2,Puss in Boots: The Last Wish,puss in boot discov that hi passion for advent...,6689.647,8.6,2504,2022
3,The Godfather Part II,in the continu saga of the corleon crime famil...,57.998,8.6,10503,1974
4,Schindler's List,the true stori of how businessman oskar schind...,55.299,8.6,13687,1993
...,...,...,...,...,...,...
9995,Sleeping Beauty,"a haunt erot fairytal about lucy, a young univ...",12.709,5.4,525,2011
9996,Freejack,time-travel bounti hunter find a doom race-car...,10.705,5.4,263,1992
9997,The Colony,"forc underground by the next ice age, a strugg...",17.831,5.4,981,2013
9998,The Roommate,"when sara, a young design student from iowa, a...",9.472,5.4,835,2011


🍿 - Popcorn (often used to represent watching a movie or going to the cinema)
🎥 - Movie camera
🎬 - Clapper board (often used to represent the film industry or movie-making)
🎞️ - Film frames
🎭 - Theater masks (often used to represent drama or acting)
🎦 - Cinema
🍫 - Chocolate (often associated with movie theater snacks)
🎵 - Musical note (often used to represent musical movies)
📽️ - Film projector
🔍 - Magnifying glass (often used to represent mystery or detective movies)
👻 - Ghost (often used to represent horror movies)