In [5]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv('../data/raw/movies.csv')
ratings_df = pd.read_csv('../data/raw/ratings.csv')

 # Merging ratings with mobie details to get titles alonside ratings
df = pd.merge(ratings_df, movies_df, on='movieId', how='left')

print("Data Reloaded and Merged. Head of Merged DataFrame:\n", df.head())


Data Reloaded and Merged. Head of Merged DataFrame:
    userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [6]:
# Creating the User-Item Matrix
user_movie_matrix = df.pivot_table(index='userId', columns='title', values='rating')

print("\nUser-Movie Matrix Head (first 5 users, first few movies alphabetically):")
print(user_movie_matrix.iloc[:5, :10])
print("\nShape of User-Movie Matrix:", user_movie_matrix.shape)
print("\nNumber of NaN values in User-Movie Matrix:", user_movie_matrix.isnull().sum().sum())



User-Movie Matrix Head (first 5 users, first few movies alphabetically):
title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2                          NaN                  NaN   
3                          NaN                  NaN   
4                          NaN                  NaN   
5                          NaN                  NaN   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                      

In [7]:
# Calculating correlation between movies
item_similarity_df = user_movie_matrix.corr(method='pearson')

print("Item Similarity DataFrame Head (showing correlations between movies):")
print(item_similarity_df.iloc[:5, :5])
print("\nShape of Item Similarity DataFrame:", item_similarity_df.shape)

Item Similarity DataFrame Head (showing correlations between movies):
title                                    '71 (2014)  \
title                                                 
'71 (2014)                                      NaN   
'Hellboy': The Seeds of Creation (2004)         NaN   
'Round Midnight (1986)                          NaN   
'Salem's Lot (2004)                             NaN   
'Til There Was You (1997)                       NaN   

title                                    'Hellboy': The Seeds of Creation (2004)  \
title                                                                              
'71 (2014)                                                                   NaN   
'Hellboy': The Seeds of Creation (2004)                                      NaN   
'Round Midnight (1986)                                                       NaN   
'Salem's Lot (2004)                                                          NaN   
'Til There Was You (1997)               

In [8]:
# creating recommendation function
def get_movie_recommendations(movie_title, item_similarity_df, user_movie_matrix, num_recommendations=10):
    """
    Recommends similar movies based on item-item similarity.

    Args:
        movie_title (str): The title of the movie for which to find recommendations.
        item_similarity_df (pd.DataFrame): DataFrame of item-item similarities.
        user_movie_matrix (pd.DataFrame): User-item matrix.
        num_recommendations (int): Number of recommendations to return.

    Returns:
        pd.Series: A Series of recommend movie titles.
    """
    if movie_title not in item_similarity_df.columns:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return pd.Series()

    similar_scores = item_similarity_df[movie_title].sort_values(ascending=False)
    similar_scores = similar_scores.drop(movie_title)
    similar_scores = similar_scores.dropna()
    return similar_scores.head(num_recommendations)

print("Recommendation function defined.")
    

Recommendation function defined.


In [9]:
# Testing recommendation function
recommended_movies = get_movie_recommendations('Toy Story (1995)', item_similarity_df, user_movie_matrix)

print("\nRecomendations for 'Toy Story (1995)':")
print(recommended_movies)

# Testing -2
recommended_movies2 = get_movie_recommendations('Forrest Gump (1994)', item_similarity_df, user_movie_matrix)

print("\nRecommendations for 'Forrest Gump (1994)':")
print(recommended_movies2)


Recomendations for 'Toy Story (1995)':
title
Senna (2010)                                          1.0
Eddie Murphy Delirious (1983)                         1.0
Brigadoon (1954)                                      1.0
Stalker (1979)                                        1.0
Claim, The (2000)                                     1.0
Hearts of Darkness: A Filmmakers Apocalypse (1991)    1.0
Halloween III: Season of the Witch (1982)             1.0
Hall Pass (2011)                                      1.0
Guy Thing, A (2003)                                   1.0
Persuasion (2007)                                     1.0
Name: Toy Story (1995), dtype: float64

Recommendations for 'Forrest Gump (1994)':
title
Memories (Memorîzu) (1995)                   1.0
Locke (2013)                                 1.0
Batman: Gotham Knight (2008)                 1.0
Mirror, The (Zerkalo) (1975)                 1.0
Blue Collar Comedy Tour: The Movie (2003)    1.0
Time Lapse (2014)                       

In [10]:
# Improving the recommendation function, to only recommend movies that specified userId has not rated yet

def get_personalized_recommendations(user_id, movie_title, item_similarity_df, user_movie_matrix, num_recommendations=10, min_ratings_threshold=50):
    """
    Recommends similar movies for a specific user, excluding movies they've already rated,
    and considering a minimum number of common ratings for similarity calculation.

    Args:
        user_id (int): The ID of the user for whom to generate recommendations.
        movie_title (str): The title of the movie the user liked.
        item_similarity_df (pd.DataFrame): DataFrame of item-item similarities.
        user_movie_matrix (pd.DataFrame): User-item matrix.
        num_recommendations (int): Number of recommendations to return.
        min_ratings_threshold (int): Minimum number of common ratings required for a valid similarity.

    Returns:
        pd.Series: A Series of recommended movie titles.
    """
    
    if movie_title not in item_similarity_df.columns:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return pd.Series()

    similar_scores = item_similarity_df[movie_title].sort_values(ascending=False)
    user_rated_movies = user_movie_matrix.loc[user_id].dropna().index.tolist()

    recommended_titles = []
    for movie, score in similar_scores.items():
        if pd.isna(score): 
            continue
        if movie == movie_title:
            continue
        if movie in user_rated_movies:
            continue
        recommended_titles.append((movie, score))
        if len(recommended_titles) >= num_recommendations:
            break
    return pd.Series([title for title, score in recommended_titles],
                     index=[score for title, score in recommended_titles])

print("Personalized Recommendation function defined.")


Personalized Recommendation function defined.


In [11]:
# Example: Get personalized recommendations for User ID 1 who liked 'Toy Story (1995)'
user_id_to_test = 1
movie_liked_by_user = 'Toy Story (1995)'
personalized_recommendations = get_personalized_recommendations(
    user_id_to_test,
    movie_liked_by_user,
    item_similarity_df,
    user_movie_matrix,
    num_recommendations=5 
)

print(f"\nPersonalized Recommendations for User {user_id_to_test} (based on liking '{movie_liked_by_user}'):")
print(personalized_recommendations)



Personalized Recommendations for User 1 (based on liking 'Toy Story (1995)'):
1.0                     Senna (2010)
1.0    Eddie Murphy Delirious (1983)
1.0                 Brigadoon (1954)
1.0                   Stalker (1979)
1.0                Claim, The (2000)
dtype: object
