In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch NLTK's stopwords corpus; per the recorded cell output this is a no-op
# when the package is already up to date in the local nltk_data directory
nltk.download('stopwords')
# Corpus reader used later in the notebook to obtain the English stopword list
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sm865\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the raw movie metadata into a DataFrame
df = pd.read_csv('dataset/movies_data.csv')

# Preview the first rows to confirm the expected columns are present
print(df.head())

# Replace every missing cell with an empty string so the text columns
# below can be concatenated without producing NaN
df.fillna('', inplace=True)

# Build one space-separated text blob per movie from its descriptive columns
df['combined_features'] = df['genres'].str.cat(
    [df['keywords'], df['cast'], df['crew'], df['tags']],
    sep=' ',
)


   movie_id                                   title  \
0   1118031  Apocalypse Z: The Beginning of the End   
1   1184918                          The Wild Robot   
2    933260                           The Substance   
3    912649                   Venom: The Last Dance   
4   1034541                             Terrifier 3   

                                            overview  \
0  When a kind of rabies that transforms people i...   
1  After a shipwreck, an intelligent robot called...   
2  A fading celebrity decides to use a black mark...   
3  Eddie and Venom are on the run. Hunted by both...   
4  Five years after surviving Art the Clown's Hal...   

                               genres  \
0               Drama, Action, Horror   
1  Animation, Science Fiction, Family   
2      Drama, Horror, Science Fiction   
3  Science Fiction, Action, Adventure   
4           Horror, Thriller, Mystery   

                                            keywords  \
0  based on novel or book, ca

In [3]:
# Preprocessing, step 1: gather NLTK's English stopwords into a set so the
# per-token membership checks in preprocess_text are cheap
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Strip stopwords from a whitespace-delimited string.

    Tokens are produced by ``text.split()``; a token is dropped when its
    lowercase form is a member of the module-level ``stop_words`` set.
    The surviving tokens are re-joined with single spaces.
    """
    def is_meaningful(token):
        # Comparison is case-insensitive; attached punctuation is NOT stripped
        return token.lower() not in stop_words

    return ' '.join(filter(is_meaningful, text.split()))

# Run the stopword filter over each movie's combined feature text
df['combined_features'] = df['combined_features'].map(preprocess_text)

# Fit a TF-IDF vectorizer (with scikit-learn's built-in English stop list)
# on the cleaned text and encode every movie as a row of the resulting matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])


In [4]:
# Pairwise cosine similarity of every movie against every other movie
# (with no second argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

# Lookup table: movie_id -> DataFrame index label
movie_indices = pd.Series(data=df.index, index=df['movie_id']).to_dict()


In [5]:
def recommend_movies_by_name(movie_name, num_recommendations=5):
    """Recommend movies similar to the first title containing ``movie_name``.

    Reads the module-level ``df`` (movie table) and ``cosine_sim`` (square
    similarity matrix whose row/column order matches the row order of ``df``).

    Parameters
    ----------
    movie_name : str
        Title or title fragment. Matched case-insensitively as a literal
        substring (not as a regular expression).
    num_recommendations : int, default 5
        Maximum number of movies to return; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The ``movie_id``, ``title`` and ``overview`` columns of the most
        similar movies, best match first and never including the queried
        movie itself; or the string ``"Movie not found."`` when the query is
        empty/not a string or no title contains it.
    """
    # An empty substring matches every title, which would silently recommend
    # for whatever movie happens to be first in the table
    if not isinstance(movie_name, str) or not movie_name.strip():
        return "Movie not found."

    # regex=False: titles such as "Beta (2020)" or "C++" contain regex
    # metacharacters and must be compared literally
    title_mask = df['title'].str.contains(movie_name, case=False, na=False, regex=False)

    # Work with row positions rather than index labels, because both
    # cosine_sim and df.iloc are positional
    matched_positions = np.flatnonzero(title_mask.to_numpy(dtype=bool))
    if matched_positions.size == 0:
        return "Movie not found."

    # Use the first matched movie (in case of multiple matches)
    query_pos = int(matched_positions[0])

    # Rank all movies by similarity, highest first; the stable sort keeps
    # tied movies in their original table order
    scores = np.asarray(cosine_sim[query_pos], dtype=float)
    ranked_positions = np.argsort(-scores, kind='stable')

    # Remove the queried movie explicitly: with tied scores (duplicates or
    # all-zero vectors) it is not guaranteed to be ranked first
    ranked_positions = ranked_positions[ranked_positions != query_pos]

    # Clamp so a negative count cannot turn into a from-the-end slice
    top_n = max(int(num_recommendations), 0)

    recommended_movies = df.iloc[ranked_positions[:top_n]]
    return recommended_movies[['movie_id', 'title', 'overview']]

# Demo: request five recommendations for a title present in the dataset
movie_name = "Venom"  # any title (or title fragment) from the dataset works here
recommended_movies = recommend_movies_by_name(movie_name, num_recommendations=5)
print(recommended_movies)


      movie_id                        title  \
13      580489  Venom: Let There Be Carnage   
5453   1065311    Starring Jerry As Himself   
12      335983                        Venom   
81      634649      Spider-Man: No Way Home   
5745    617127                        Blade   

                                               overview  
13    After finding a host body in investigative rep...  
5453  Jerry, an ordinary immigrant dad, retired in O...  
12    Investigative journalist Eddie Brock attempts ...  
81    Peter Parker is unmasked and no longer able to...  
5745  An upcoming film in the Marvel Cinematic Unive...  


In [6]:
# Make sure the output directory exists first: open(..., 'wb') does not
# create missing parent directories and would raise FileNotFoundError
os.makedirs('model', exist_ok=True)

# Persist the movie table (df), including the combined_features column
with open('model/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(df, movie_list_file)

# Persist the pairwise similarity matrix (cosine_sim)
with open('model/movie_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)


In [7]:
# Read the pickled movie table back from disk
with open('model/movie_list.pkl', 'rb') as movie_list_file:
    df_loaded = pickle.load(movie_list_file)

# Read the pickled similarity matrix back from disk
with open('model/movie_similarity.pkl', 'rb') as similarity_file:
    cosine_sim_loaded = pickle.load(similarity_file)

# df_loaded and cosine_sim_loaded can now be used to make recommendations
