In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the movie metadata from 'dataset.csv' (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
# Size of the loaded frame as (rows, columns).
df.shape

(10000, 9)

In [4]:
# Count the missing values in each column.
df.isna().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

### Handle Missing Data

In [None]:
# Replace missing values in the text columns with empty strings so they can
# be treated uniformly as text from here on.
for column in ('overview', 'genre'):
    df[column] = df[column].fillna('')

### Create a Feature Column (tag)

In [None]:
# Build the 'tag' feature: the overview text followed by the genre list,
# separated by a single space.
df['tag'] = df['overview'].str.cat(df['genre'], sep=' ')

### Convert Text into Vectors (TF-IDF)

In [None]:
# Initialize the TF-IDF vectorizer; stop_words='english' tells it to ignore
# scikit-learn's built-in English stop-word list when building the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the 'tag' text and convert each row into a
# TF-IDF weighted vector in a single pass.
tfidf_matrix = vectorizer.fit_transform(df['tag'])

# Display the matrix shape (rows = movies, columns = vocabulary terms)
print(tfidf_matrix.shape)

(10000, 27965)


### Compute Cosine Similarity

In [None]:
# Pairwise cosine similarity of every movie against every other movie.
# With a single argument, scikit-learn compares the rows of the matrix
# against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)

# Square matrix: one row and one column per movie.
print(cosine_sim.shape)

(10000, 10000)


### Save the Required Objects Using Pickle

In [None]:
# Persist the similarity matrix and the processed DataFrame with pickle,
# one file per object.
for path, artifact in (
    ('cosine_similarity.pkl', cosine_sim),
    ('movies_df.pkl', df),
):
    with open(path, 'wb') as file:
        pickle.dump(artifact, file)