#### Importing the required libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the MovieLens movie metadata and the user-applied tags from disk
movies = pd.read_csv('./dataset/ml-32m/movies.csv')
tags = pd.read_csv('./dataset/ml-32m/tags.csv')

# Preview the first five rows of each table, one table after the other
print(movies.head(), tags.head(), sep='\n')


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId          tag   timestamp
0      22    26479  Kevin Kline  1583038886
1      22    79592     misogyny  1581476297
2      22   247150   acrophobia  1622483469
3      34     2174        music  1249808064
4      34     2174        weird  1249808102


In [3]:
# Step 3: collapse every movie's tags into one string and attach them to the movie table

# Missing tags are replaced by empty strings (not dropped) and everything is
# forced to str so that the join below never sees a non-string value
tags['tag'] = tags['tag'].fillna('').astype(str)

# One row per movieId; its tags are concatenated with single spaces
tags_grouped = tags.groupby('movieId')['tag'].agg(" ".join).reset_index()

# Left join keeps every movie; movies without any tag get NaN in 'tag'
movie_data = movies.merge(tags_grouped, on='movieId', how='left')


In [4]:
# Replace missing tags / genres with empty strings so the concatenation below
# never produces NaN
movie_data[['tag', 'genres']] = movie_data[['tag', 'genres']].fillna('')

# Single text field per movie: "<genres> <tags>"
movie_data['content'] = movie_data['genres'].str.cat(movie_data['tag'], sep=' ')


In [5]:
# Keep only the first 2000 rows so the prototype stays fast
movie_data = movie_data.iloc[:2000]

# Vectorise the combined text with TF-IDF (English stop words removed,
# at most 5000 features) and store the weights as float32
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movie_data['content']).astype('float32')

# Optional: report the matrix dimensions (rows = movies, columns = terms)
print("TF-IDF matrix shape:", tfidf_matrix.shape)




TF-IDF matrix shape: (2000, 5000)


In [6]:
# Pairwise cosine similarity between every pair of movies; with a single
# argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Optional: peek at the top-left 5x5 corner of the similarity matrix
print(cosine_sim[:5, :5])


[[0.99999964 0.06925565 0.03222631 0.00519176 0.03132839]
 [0.06925565 1.         0.00950474 0.07960154 0.0341245 ]
 [0.03222631 0.00950474 1.0000001  0.02056679 0.09993804]
 [0.00519176 0.07960154 0.02056679 0.99999994 0.02018245]
 [0.03132839 0.0341245  0.09993804 0.02018245 0.9999999 ]]


In [7]:
# Create a reverse map from lower-cased movie title to row position.
# Positions (0..n-1) are stored rather than index labels because the lookup
# result is used to index the similarity matrix and with .iloc, both of which
# are positional.
indices = pd.Series(range(len(movie_data)), index=movie_data['title'].str.lower())

# Series.drop_duplicates() compares the *values* (the row numbers, which are
# always unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first occurrence, so that a title lookup always
# yields a single integer.
indices = indices[~indices.index.duplicated(keep='first')]


In [8]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up. Matching is case-insensitive but otherwise
        exact against the ``title`` column.
    cosine_sim : array-like, optional
        Square similarity matrix, indexed by row position. The default is
        bound to the global ``cosine_sim`` that exists when this cell is run.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list of str or str
        The ``top_n`` most similar titles, best first, or the message
        ``"Movie not found. Please check the name."`` when the title is
        unknown.
    """
    title = title.lower()
    idx = indices.get(title)

    if idx is None:
        return "Movie not found. Please check the name."

    # A repeated title makes the lookup return a Series; use its first match
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity score. The query movie is
    # removed explicitly: relying on it sorting first is unsafe because
    # float32 rounding can put its self-similarity slightly below 1, letting
    # a movie with identical content outrank it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort movies based on similarity score, highest first (stable for ties)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]

    return movie_data['title'].iloc[movie_indices].tolist()


In [9]:
# Show ten random titles to see the exact title format used as lookup keys.
# A fixed random_state makes the sample identical on every Restart & Run All.
print(movie_data['title'].sample(10, random_state=42))


1831                     Small Soldiers (1998)
127                      Pie in the Sky (1996)
1165     Cheech and Chong's Up in Smoke (1978)
1906     Poltergeist II: The Other Side (1986)
1806                  Can't Hardly Wait (1998)
1623         I Love You, I Love You Not (1996)
1352                             Jaws 2 (1978)
1251                     Big Sleep, The (1946)
559                   Naked in New York (1994)
298     Queen Margot (Reine Margot, La) (1994)
Name: title, dtype: object


In [10]:
# Ask for recommendations for one title and print them as a numbered list
movie_input = "The Dark Knight"
recommendations = get_recommendations(movie_input)

print(f"\nTop 5 recommendations for '{movie_input}':")

if not isinstance(recommendations, list):
    # Anything other than a list is the "not found" message string
    print(recommendations)
else:
    # A list holds the recommended titles, best match first
    for i, movie in enumerate(recommendations, 1):
        print(f"{i}. {movie}")



Top 5 recommendations for 'The Dark Knight':
Movie not found. Please check the name.


In [11]:
# The lookup is case-insensitive but otherwise matches the full `title` value,
# which in this dataset includes the release year, e.g. "Stand by Me (1986)".
print(get_recommendations("Stand by Me (1986)"))

['Fausto (1993)', 'Cure, The (1995)', 'My Life as a Dog (Mitt liv som hund) (1985)', 'Asfour Stah (1990)', 'King of the Hill (1993)']
