In [124]:
import pandas as pd
# Import necessary modules from the NLTK library for text processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK resources for tokenization, lemmatization, and stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Import CountVectorizer from scikit-learn for text vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import train_test_split from scikit-learn for splitting data into training and test sets
from sklearn.model_selection import train_test_split

# Import cosine_similarity from scikit-learn for computing similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity

# Import the pickle module for serializing Python objects
import pickle


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prajj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prajj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Movie recommender system using TF-IDF Vectorization and Cosine similarity

In [3]:
movies_dataset = pd.read_csv("top10K-TMDB-movies.csv")

In [4]:
movies_dataset.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [5]:
movies_dataset.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [62]:
movies_dataset.shape

(10000, 4)

In [10]:
movies_dataset = movies_dataset[['id', 'title', 'genre', 'overview']]

In [11]:
movies_dataset.head()

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [12]:
#combining genre and overview to get more enrich feature vector
movies_dataset["description"] = movies_dataset["genre"] + movies_dataset["overview"]

In [16]:
#dropping overview and genre column
movies_dataset.drop(["genre", "overview"],axis = 1, inplace = True)

In [18]:
movies_dataset.head()

Unnamed: 0,id,title,description
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


In [35]:
#function to preprocess the text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s\d]', ' ', text)
        #tokenize the text into words
        words = word_tokenize(text)
        #define english stopwords
        stop_words = set(stopwords.words("english"))
        #removing stopwords from the word tokens
        words = [word for word in words if word not in stop_words]
        #initializing the WordNet Lemmatizer
        lemmatizer = WordNetLemmatizer()
        #lemmatizing each word
        words = [lemmatizer.lemmatize(word) for word in words]
        #join words back into string
        text = ' '.join(words)
        return text

In [36]:
movies_dataset["clean description"] = movies_dataset['description'].apply(preprocess_text)

In [37]:
movies_dataset.head()

Unnamed: 0,id,title,description,clean description
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ...",drama crimeframed 1940s double murder wife lov...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h...",comedy drama romanceraj rich carefree happy go...
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ...",drama crimespanning year 1945 1955 chronicle f...
3,424,Schindler's List,"Drama,History,WarThe true story of how busines...",drama history warthe true story businessman os...
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle...",drama crimein continuing saga corleone crime f...


In [115]:
# Initialize a TfidfVectorizer object with a maximum of 10,000 features and English stop words
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

In [116]:
# Fit the TF-IDF vectorizer to the 'clean description' column and transform the text data into a numerical vector representation
feature_vectors = tfidf_vectorizer.fit_transform(movies_dataset['clean description'].values.astype('U')).toarray()

In [117]:
# Retrieve the feature names used in the TF-IDF matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

In [118]:
feature_names

array(['000', '007', '10', ..., 'zombie', 'zone', 'zoo'], dtype=object)

In [119]:
# Check the shape of the resulting vector
feature_vectors.shape

(10000, 10000)

In [120]:
# Calculate the cosine similarity between vectors
similarity = cosine_similarity(feature_vectors)

In [121]:
# Calculate similarity scores for the second movie with all other movies, sort them, and store the result
distance = sorted(list(enumerate(similarity[1])), reverse=True, key=lambda vector: vector[1])

In [122]:
# Print the titles of the first five movies most similar to the second movie
for i in distance[1:6]:
    print(movies_dataset.iloc[i[0]].title)

Padmaavat
A Passage to India
The Cheetah Girls: One World
After the Wedding
The Kid with a Bike


In [123]:
# Define a function to recommend the top 5 similar movies for a given movie title
def recommend(movies):
    # Find the index of the given movie in the DataFrame
    index = movies_dataset[movies_dataset['title'] == movies].index[0]
    # Calculate similarity scores, sort them, and print titles of the top 5 similar movies
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector: vector[1])
    for i in distance[1:6]:
        print(movies_dataset.iloc[i[0]].title)

# Call the recommend function with "Iron Man" as the argument
recommend("The Shawshank Redemption")

Brubaker
In Hell
Escape Plan
No Escape
Le Trou


In [125]:
# Serialize the 'movies_dataset' DataFrame and save it to a file
pickle.dump(movies_dataset, open('movies_list.pkl', 'wb'))
pickle.dump(movies_dataset, open('similarity.pkl', 'wb'))

# Deserialize the 'movies_list.pkl' file back into a Python object
pickle.load(open('movies_list.pkl', 'rb'))

# Import the os module for interacting with the operating system
import os

# Print the current working directory
print(os.getcwd())

C:\Users\prajj\production grade movie recommendation systyem
