In [1]:
import pandas as pd
import os

In [28]:


# Quick look at the raw enriched CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; the later
# cells read "movies_enriched_full.csv" relative to the working directory.
file_path = r"C:\Users\pricc\Downloads\movies_enriched_full.csv"
# header=None tells pandas not to use the first line as column names, so the
# columns are numbered 0, 1, 2, ... and the real header (movieId, title,
# genres, ...) shows up as data row 0 in the preview below.
# NOTE(review): presumably only meant for inspection; confirm before reusing
# this frame, since the header row is mixed in with the data.
df = pd.read_csv(file_path, header=None)
print(df.head())


        0                         1                             2     3   \
0  movieId                     title                        genres  year   
1        1          Toy Story (1995)   Animation|Children's|Comedy  1995   
2        2            Jumanji (1995)  Adventure|Children's|Fantasy  1995   
3        3   Grumpier Old Men (1995)                Comedy|Romance  1995   
4        4  Waiting to Exhale (1995)                  Comedy|Drama  1995   

                  4        5   \
0        clean_title  tmdb_id   
1          Toy Story    862.0   
2            Jumanji   8844.0   
3   Grumpier Old Men  15602.0   
4  Waiting to Exhale  31357.0   

                                                  6   \
0                                           overview   
1  Led by Woody, Andy's toys live happily in his ...   
2  When siblings Judy and Peter discover an encha...   
3  A family wedding reignites the ancient feud be...   
4  Cheated on, mistreated and stepped on, the wom...   

       

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# ----------------------------------------
# Load Enriched Movie Data
# ----------------------------------------
# Read with pandas' default header handling, so the first CSV line supplies
# the column names. The path is relative, so the file must be in the kernel's
# working directory. This rebinds `df`, replacing any frame loaded earlier.
df = pd.read_csv("movies_enriched_full.csv")

# ----------------------------------------
# Combine Metadata Fields
# ----------------------------------------

def combine_metadata(row):
    """Merge a movie row's descriptive columns into one lowercase string.

    Reads "tmdb_genres", "keywords", "top_3_cast" and "directors" from
    ``row`` (anything indexable by those keys, e.g. a DataFrame row).
    Missing values (as judged by ``pd.notnull``) contribute an empty string;
    everything else is converted with ``str``. The four pieces are joined
    with single spaces, lowercased, and every ",", ":" and "-" is replaced
    by a space.

    Returns:
        str: the combined text. Runs of consecutive spaces are not collapsed.
    """
    pieces = []
    for column in ("tmdb_genres", "keywords", "top_3_cast", "directors"):
        value = row[column]
        pieces.append(str(value) if pd.notnull(value) else "")

    text = " ".join(pieces).lower()
    # Turn the punctuation used as separators into plain spaces.
    for separator in (",", ":", "-"):
        text = text.replace(separator, " ")
    return text

# One combined metadata string per movie, built row by row.
df["metadata"] = df.apply(combine_metadata, axis=1)

# ----------------------------------------
#  Vectorize the metadata text
# ----------------------------------------
# Two representations of the same column: raw term counts and TF-IDF weights.
# Both vectorizers use scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words="english")
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

count_matrix = count_vectorizer.fit_transform(df["metadata"])
tfidf_matrix = tfidf_vectorizer.fit_transform(df["metadata"])

# ----------------------------------------
#  Pairwise cosine similarity
# ----------------------------------------
# With a single argument, cosine_similarity compares every row of the matrix
# with every row of the same matrix, giving one square matrix per vectorizer.
cosine_sim_count = cosine_similarity(count_matrix)
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)

# ----------------------------------------
#  Persist the results
# ----------------------------------------
# NumPy binaries: these are the files a later cell reloads with np.load.
np.save("cosine_sim_count.npy", cosine_sim_count)
np.save("cosine_sim_tfidf.npy", cosine_sim_tfidf)

# Optional: CSV copies with movie titles as both row and column labels.
pd.DataFrame(
    cosine_sim_count,
    index=df["title"],
    columns=df["title"],
).to_csv("cosine_sim_count.csv")
pd.DataFrame(
    cosine_sim_tfidf,
    index=df["title"],
    columns=df["title"],
).to_csv("cosine_sim_tfidf.csv")

print(" Models built and similarity matrices saved.")

 Models built and similarity matrices saved.


In [25]:
def recommend_movies(title, similarity_matrix, df, top_n=10):
    """Return the movies most similar to ``title``.

    Args:
        title: Exact value to look up in ``df["title"]``. If several rows
            carry the same title, the first one is used.
        similarity_matrix: Square, row-indexable similarity matrix whose row
            and column ``i`` correspond to the ``i``-th row of ``df`` by
            position.
        df: DataFrame with a "title" column, in the same row order that was
            used to build ``similarity_matrix``.
        top_n: Maximum number of recommendations; values <= 0 give ``[]``.

    Returns:
        list[tuple]: ``(title, score)`` pairs, highest score first. Ties keep
        the original row order. The queried movie itself is never included.

    Raises:
        IndexError: if ``title`` does not occur in ``df["title"]``.
    """
    titles = df["title"]

    # Locate the movie by *position*, not by index label: the matrix is
    # positional, so a filtered or re-indexed frame must not shift the lookup.
    positions = (titles == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Title not found in df['title']: {title!r}")
    idx = int(positions[0])

    # Exclude the queried movie explicitly. Dropping "whatever ranks first"
    # is wrong when another movie ties at the top score or when the movie's
    # own row is all zeros (empty metadata).
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[idx])
        if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [(titles.iloc[pos], score) for pos, score in candidates[:max(top_n, 0)]]

# Example: Recommend movies similar to "Toy Story (1995)"
# Ranks against the TF-IDF similarity matrix. recommend_movies returns
# (title, score) pairs, printed one per line with the score to four decimals.
# The "Top 5" in the heading is hard-coded; keep it in sync with top_n below.
print("\nTop 5 similar movies using TF-IDF:")
for movie, score in recommend_movies("Toy Story (1995)", cosine_sim_tfidf, df, top_n=5):
    print(f"{movie} (Score: {score:.4f})")


Top 5 similar movies using TF-IDF:
Small Soldiers (1998) (Score: 0.3903)
Toy Story 2 (1999) (Score: 0.3539)
Indian in the Cupboard, The (1995) (Score: 0.3101)
Toys (1992) (Score: 0.2656)
Babes in Toyland (1961) (Score: 0.2394)


CountVectorizer

In [19]:
import pandas as pd
import numpy as np

# Load movie data and similarity matrices
# The .npy files are the ones written with np.save in the model-building cell.
# NOTE(review): the matrices carry no labels of their own; labelling them with
# df["title"] assumes the CSV still has the same rows in the same order as
# when the matrices were computed — confirm if the CSV has been regenerated.
df = pd.read_csv("movies_enriched_full.csv")
cosine_sim_count = np.load("cosine_sim_count.npy")
cosine_sim_tfidf = np.load("cosine_sim_tfidf.npy")
print(" Cosine Similarity Matrix (CountVectorizer):")
# Label rows and columns with titles, then show only the top-left 5x5 corner.
print(pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).iloc[:5, :5])

 Cosine Similarity Matrix (CountVectorizer):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.054433   
Jumanji (1995)                              0.054433        1.000000   
Grumpier Old Men (1995)                     0.054433        0.000000   
Waiting to Exhale (1995)                    0.066227        0.064889   
Father of the Bride Part II (1995)          0.046676        0.034300   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.054433   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.097333   
Father of the Bride Part II (1995)                 0.068599   

title   

Toy Story has a similarity of 1.000000 with itself (the diagonal entry for Toy Story).

0.054433 is how similar Toy Story is to Jumanji

0.000000 means no content overlap between the two movies. For example, Grumpier Old Men and Jumanji have no overlapping keywords, cast, or other metadata terms.

Waiting to Exhale has a cosine similarity of 0.066227 (about 6.6%) to Toy Story under CountVectorizer.

TF-IDF

In [18]:
# Same 5x5 preview as for CountVectorizer, this time for the TF-IDF matrix.
# This cell defines neither `df` nor `cosine_sim_tfidf`; both must already
# exist in the kernel (e.g. from the cell that loads the .npy files).
print("\n Cosine Similarity Matrix (TF-IDF):")
print(pd.DataFrame(cosine_sim_tfidf, index=df["title"], columns=df["title"]).iloc[:5, :5])



 Cosine Similarity Matrix (TF-IDF):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.014150   
Jumanji (1995)                              0.014150        1.000000   
Grumpier Old Men (1995)                     0.022040        0.000000   
Waiting to Exhale (1995)                    0.028133        0.017912   
Father of the Bride Part II (1995)          0.009242        0.009244   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.022040   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.017067   
Father of the Bride Part II (1995)                 0.024845   

title           

Print Top 5 Similar Movies for a Given Title

In [24]:
def print_recommendations(title, similarity_matrix, df, top_n=5):
    """Print the movies most similar to ``title`` as a numbered list.

    Args:
        title: Exact value to look up in ``df["title"]``. If several rows
            carry the same title, the first one is used.
        similarity_matrix: Square, row-indexable similarity matrix whose row
            and column ``i`` correspond to the ``i``-th row of ``df`` by
            position.
        df: DataFrame with a "title" column, in the same row order that was
            used to build ``similarity_matrix``.
        top_n: Maximum number of movies to list; values <= 0 list none.

    Returns:
        None. A heading and one line per movie are written to stdout, highest
        score first. The queried movie itself is never listed.

    Raises:
        IndexError: if ``title`` does not occur in ``df["title"]`` (raised
            before anything is printed).
    """
    titles = df["title"]

    # Locate the movie by *position*, not by index label: the matrix is
    # positional, so a filtered or re-indexed frame must not shift the lookup.
    positions = (titles == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Title not found in df['title']: {title!r}")
    idx = int(positions[0])

    # Exclude the queried movie explicitly. Dropping "whatever ranks first"
    # is wrong when another movie ties at the top score or when the movie's
    # own row is all zeros (empty metadata).
    ranked = sorted(
        (
            (pos, score)
            for pos, score in enumerate(similarity_matrix[idx])
            if pos != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(top_n, 0)]

    print(f"\n Top {top_n} similar movies to '{title}':")
    for rank, (movie_idx, score) in enumerate(ranked, 1):
        print(f"{rank}. {titles.iloc[movie_idx]} (Similarity: {score:.4f})")

In [23]:
# Side-by-side comparison: the same query title and top_n against each
# similarity matrix, so the two printed rankings can be compared directly.
print('TF-IDF Recommendations')
print_recommendations("Toy Story (1995)", cosine_sim_tfidf, df, top_n=5)

print(' CountVectorizer Recommendations')
print_recommendations("Toy Story (1995)", cosine_sim_count, df, top_n=5)

TF-IDF Recommendations

 Top 5 similar movies to 'Toy Story (1995)':
1. Small Soldiers (1998) (Similarity: 0.3903)
2. Toy Story 2 (1999) (Similarity: 0.3539)
3. Indian in the Cupboard, The (1995) (Similarity: 0.3101)
4. Toys (1992) (Similarity: 0.2656)
5. Babes in Toyland (1961) (Similarity: 0.2394)
 CountVectorizer Recommendations

 Top 5 similar movies to 'Toy Story (1995)':
1. Toy Story 2 (1999) (Similarity: 0.4518)
2. Small Soldiers (1998) (Similarity: 0.3790)
3. Indian in the Cupboard, The (1995) (Similarity: 0.2887)
4. Big (1988) (Similarity: 0.2502)
5. Babes in Toyland (1961) (Similarity: 0.2485)
