# First Thing First
Import necessary libraries (pandas,numpy,scipy, etc.), define file paths, import MovieLens dataset (CSV files), display the first few rows of all the files and print the total number of ratings in the dataset.

In [2]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import time
file_path = 'ml-latest-small'

#importing all the movielens's main csv files

link_csv= file_path + '/links.csv'
movies_csv = file_path + '/movies.csv'
ratings_file = file_path + '/ratings.csv'
tags_csv = file_path + '/tags.csv'

links=pd.read_csv(link_csv)
movies = pd.read_csv(movies_csv)
ratings = pd.read_csv(ratings_file)
tags = pd.read_csv(tags_csv)

print("Links Data:")
print(links.head())

print("\n Movies Data:")
print(movies.head())

print("\n Ratings Data:")
print(ratings.head())

print("\n Tags Data: ")
print(tags.head())

#count number of total ratings by counting the total rows
ratings_count= ratings.shape[0]
print(f"\n Total count of ratings (rows) in the dataset: {ratings_count}")

Links Data:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

 Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

 Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4   

# Removing movies without any rating
This prune_movies_df function is removing movies from the movies DataFrame which have no rating from any of the Users.

In [3]:
def prune_movies_df(movies, ratings):
    movie_ids = np.array(movies['movieId'])
    #print(len(movie_ids))  # Uncomment for debugging purposes
    rated_movie_ids = set(list(np.array(ratings['movieId'])))
    #print(len(rated_movie_ids))  # Uncomment for debugging purposes
    uncommon_movies = []
    # Getting movies that are in movies.csv but not in ratings.csv
    for i in movie_ids:
        if i not in rated_movie_ids:
            uncommon_movies.append(i)
    ind = []
    for movie_id in uncommon_movies:
        movies.drop(movies.loc[movies['movieId'] == movie_id].index,
                    inplace=True)  # removing those movies from the dataframe
    print(f'There were {len(uncommon_movies)} movies without any ratings. We removed them.')
    return movies





# Get similar users
We are creating a user item matrix (user_rating_mat), where the indexes are the userId, columns are movieId and values are the ratings of those movies ratings assigned by the user IDs. Then we define the get_similarUsers function to find similar users using Pearson correlation.

In [4]:
# Grouping ratings and creating user-item matrix
movies = prune_movies_df(movies, ratings)  # pruning the movies from movies_df that weren't rated by anyone
user_rating_mat = ratings.pivot(index='userId', columns='movieId', values='rating')

def get_similarUsers(u_index, ui_matrix, topk_users=30):  # get top k similar users
    sim = []
    total_users = ui_matrix.shape[0]
    x = np.array(ui_matrix.iloc[u_index, :])  # converting our main user's ratings into a numpy array
    for i in range(total_users):
        if i != u_index:
            y = np.array(ui_matrix.iloc[i, :])  # converting similar user's ratings into a numpy array
            xy_stacked = np.vstack([x, y])  # Stacking two users
            xy = xy_stacked[:, ~np.any(np.isnan(xy_stacked), axis=0)]  # only getting the vector where both arrays have values

            if len(xy[0]) > 3 and len(xy[1]) > 3:  # assumption
                r, _ = pearsonr(xy[0], xy[1])
                if np.isnan(r):  # Checking if pearson coefficient is Nan
                    r = 0
                sim.append((i, r))  # appending (index, value) tuple in the similarity list
    sim.sort(key=lambda o: o[1])  # sorting the tuples based on similarity
    sim_top=list(reversed(sim[-topk_users:]))
    return sim_top  # getting the top 30 most similar users to our user


There were 18 movies without any ratings. We removed them.


# Prediction Function
This pred function predicts ratings for movies for a user if the user has not rated it yet. We are using top 30 similar users' similarity score for calculating the predicted ratings.

In [5]:
def pred(ui_matrix, u_index, sim_matrix, topk_recommendations):
    item_pred = []
    x = np.array(ui_matrix.iloc[u_index, :])
    x_mean = np.nanmean(x)

    for item in range(len(x)):
        if np.isnan(ui_matrix.iloc[u_index, item]):
            simsum_centering = []
            sim_sum = 0  # Initialize sim_sum for this item

            for index, similarity in sim_matrix:
                if not np.isnan(ui_matrix.iloc[index, item]):
                    y = np.array(ui_matrix.iloc[index, :])
                    y_mean = np.nanmean(y)
                    rating = ui_matrix.iloc[index, item]
                    rating_centered = rating - y_mean
                    simsum_centering.append(similarity * rating_centered)

                    sim_sum += similarity  # Add similarity for users who rated the item

            if sim_sum != 0 and np.sum(simsum_centering) != 0: # check zero values
                pred = x_mean + (np.sum(simsum_centering) / abs(sim_sum))
            else:
                pred = x_mean  # If either is zero, set pred to x_mean
            item_pred.append((item, pred))

    item_pred.sort(key=lambda u: u[1])
    return list(reversed(item_pred[-topk_recommendations:]))

# Recommendation based on Pearson collaborative filtering approach
By running this cell a user will be able to enter a user ID. Then our code will find similar users by using get_similarUsers, predicts movie ratings using pred. It will then print the top 10 similar users and top 20 movie recommended movies (based on highest predicted ratings) along with their genres and predicted scores.

In [6]:

while True:
    user_id_input = input("Enter the user ID: ").strip()  # Get input and trim spaces
    try:
        user_id = int(user_id_input)  # Try converting to integer
        break  # Exit loop if successful
    except ValueError:
        print("Invalid input. Please enter an integer.")
sim = get_similarUsers(u_index=user_id - 1, ui_matrix=user_rating_mat,
                       topk_users=30)  # topk_users argument takes 30 top similar users default
predictions = pred(ui_matrix=user_rating_mat, u_index=user_id - 1, sim_matrix=sim, topk_recommendations=20)

# Top 10 Most Similar Users
print("Top 10 Most Similar Users for user: ", user_id)
for index, score in sim[:10]:
    print ("User ID:", index + 1, ", Score: ", score)  # + 1 because users start from 1

# Top 20 Movie Recommendations for User
print("Top 20 Recommended Movies for user: ", user_id)
print('\n')
p_list = []
for index, value in predictions:
    movie_id = movies.get('movieId')[index]  # get the Movie ID corresponding the index
    movie_title = movies.get('title')[index]  # get the Movie Title corresponding the index
    movie_genre = movies.get('genres')[index]  # get the Movie Genres corresponding the index

    p_list.append(movie_id)
    print("Movie Id:", movie_id, ", Movie: ", movie_title, ", Genre: ", movie_genre, ", Rating: ", value)


  r, _ = pearsonr(xy[0], xy[1])


Top 10 Most Similar Users for user:  1
User ID: 473 , Score:  0.9622504486493761
User ID: 511 , Score:  0.9258200997725516
User ID: 9 , Score:  0.9185586535436918
User ID: 13 , Score:  0.8783100656536796
User ID: 366 , Score:  0.8728715609439694
User ID: 401 , Score:  0.8669214468630106
User ID: 535 , Score:  0.8664002254439633
User ID: 90 , Score:  0.8215838362577492
User ID: 157 , Score:  0.8017837257372732
User ID: 139 , Score:  0.7905694150420948
Top 20 Recommended Movies for user:  1


Movie Id: 319 , Movie:  Shallow Grave (1994) , Genre:  Comedy|Drama|Thriller , Rating:  6.769157088122605
Movie Id: 2761 , Movie:  Iron Giant, The (1999) , Genre:  Adventure|Animation|Children|Drama|Sci-Fi , Rating:  6.237161922155337
Movie Id: 27482 , Movie:  Cube 2: Hypercube (2002) , Genre:  Horror|Mystery|Sci-Fi , Rating:  6.223522167487685
Movie Id: 105653 , Movie:  Escape Plan (2013) , Genre:  Action|Mystery|Thriller , Rating:  6.222049413437611
Movie Id: 1475 , Movie:  Kama Sutra: A Tale of L

### Hybrid Jaccard-Pearson Correlation

This cell introduces a hybrid similarity. It combines the Jaccard similarity and Pearson similarity in a unique way.  

* **Jaccard Index:** Measures the similarity between two users based on the number of common movies they rated.
The Jaccard similarity is calculated as:
$$
\text{Jaccard}(x, y) = \frac{|I_x \cap I_y|}{|I_x \cup I_y|}
$$
where \\( I_x \\) and \\( I_y \\) represent the sets of items rated by users \\( x \\) and \\( y \\), respectively. Here, \\( |I_x \cap I_y| \\) gives the number of movies both users have rated, while \\( |I_x \cup I_y| \\) gives the total number of unique items or movies rated by either user. This ratio emphasizes the intensity of overlap between the two users’ rated movies, which is very much important when users have rated only a few movies.

* **Pearson Correlation:** Measures the linear relationship between two sets of numerical values. It captures the tendency of two users to rate movies similarly.

**Hybrid Approach:**

The `hybrid_jaccard_pearson` function calculates a weighted average of the Jaccard similarity and Pearson correlation. The weight is controlled by the `alpha` parameter (0 <= alpha <= 1).

* `alpha = 0`: Pure Pearson correlation.
* `alpha = 1`: Pure Jaccard similarity.
* `0 < alpha < 1`: A combination of both. We set the alpha as 0.4.

This hybrid approach aims to use the strengths of both approaches, potentially leading to more accurate similarity estimates and improved recommendation quality.

In [7]:
#Function for hybrid Jaccard-Pearson correlation
def hybrid_jaccard_pearson(x, y, alpha=0.4):
    # Stack the ratings of both users
    xy_stacked = np.vstack([x, y])
    # Mask only for common items
    common_mask = ~np.isnan(x) & ~np.isnan(y)
    x_common, y_common = x[common_mask], y[common_mask]
    n_common = np.sum(common_mask)

    # Return 0 if fewer than min_common items are rated in common
    min_common = 4
    if n_common < min_common:
        return 0
    x_mean, y_mean = np.mean(x_common), np.mean(y_common)
    numerator = np.sum((x_common - x_mean) * (y_common - y_mean))
    denominator = np.sqrt(np.sum((x_common - x_mean) ** 2) * np.sum((y_common - y_mean) ** 2))

    raw_pearson = numerator / denominator if denominator != 0 else 0

    # Calculate Jaccard similarity based on the union of rated items
    total_mask = ~np.isnan(x) | ~np.isnan(y)
    total_items = np.sum(total_mask)
    jaccard_similarity = n_common / total_items if total_items > 0 else 0
    hybrid_similarity = alpha * jaccard_similarity + (1 - alpha) * raw_pearson
    return hybrid_similarity

# Function to find similar users
def get_similarUsers_hjp(u_index, ui_matrix, topk_users=30, alpha=0.4):
    sim = []
    total_users = ui_matrix.shape[0]
    x = np.array(ui_matrix.iloc[u_index, :])

    for i in range(total_users):
        if i != u_index:
            y = np.array(ui_matrix.iloc[i, :])

            # Calculate the hybrid Jaccard-Pearson correlation
            r = hybrid_jaccard_pearson(x, y, alpha=alpha)
            sim.append((i, r))
    sim.sort(key=lambda o: o[1])
    sim_top = list(reversed(sim[-topk_users:]))
    return sim_top  # returning the top k most similar users to the target user


# Prediction Function for Hybrid Jaccard-Pearson Correlation
This pred function will use Hybrid Jaccard-Pearson Correlation or similarity.  We are using top 30 similar users' similarity score for calculating the predicted ratings.

In [8]:
def pred_hjp(ui_matrix, u_index, sim_matrix, topk_recommendations=30):
    item_pred = []
    x = np.array(ui_matrix.iloc[u_index, :])
    x_mean = np.nanmean(x)
    for item in range(len(x)):
        if np.isnan(ui_matrix.iloc[u_index, item]):
            simsum_centering = []
            sim_sum = 0  # Initialize sim_sum for this item
            for index, similarity in sim_matrix:
                if not np.isnan(ui_matrix.iloc[index, item]):
                    y = np.array(ui_matrix.iloc[index, :])
                    y_mean = np.nanmean(y)
                    rating = ui_matrix.iloc[index, item]
                    rating_centered = rating - y_mean
                    simsum_centering.append(similarity * rating_centered)

                    sim_sum += similarity  # Add similarity for users who rated the item

            if sim_sum != 0 and np.sum(simsum_centering) != 0: # check zero values
                pred = x_mean + (np.sum(simsum_centering) / abs(sim_sum))
            else:
                pred = x_mean  # If either is zero, set pred to x_mean
            item_pred.append((item, pred))

    item_pred.sort(key=lambda u: u[1])
    return list(reversed(item_pred[-topk_recommendations:]))

# Recommendation based on Hybrid Jaccard-Pearson Correlation collaborative filtering approach

Here we will predict movie ratings by using Hybrid Jaccard-Pearson Correlation. By running this cell a user will be able to enter a user ID. Then our code will find similar users by using get_similarUsers, predicts movie ratings using pred. It will then print the top 10 similar users and top 20 movie recommended movies (based on highest predicted ratings) along with their genres and predicted scores.

In [9]:

while True:
    user_id_input = input("Enter the user ID: ").strip()  # Get input and trim spaces
    try:
        user_id = int(user_id_input)  # Try converting to integer
        break  # Exit loop if successful
    except ValueError:
        print("Invalid input. Please enter an integer.")
sim_hjp = get_similarUsers_hjp(u_index=user_id - 1, ui_matrix=user_rating_mat,
                       topk_users=30)  # topk_users argument takes 30 top similar users default
predictions_hjp = pred_hjp(ui_matrix=user_rating_mat, u_index=user_id - 1, sim_matrix=sim_hjp, topk_recommendations=20)

# Top 10 Most Similar Users
print("Top 10 Most Similar Users for user: ", user_id)
for index, score in sim_hjp[:10]:
    print ("User ID:", index + 1, ", Score: ", score)  # + 1 because users start from 1

# Top 30 Movie Recommendations for User
print('\n')
print("Top 20 Recommended Movies for user: ", user_id)
print('\n')
p_list = []
for index, value in predictions_hjp:
    movie_id = movies.get('movieId')[index]  # get the Movie ID corresponding the index
    movie_title = movies.get('title')[index]  # get the Movie Title corresponding the index
    movie_genre = movies.get('genres')[index]  # get the Movie Genres corresponding the index

    p_list.append(movie_id)
    print("Movie Id:", movie_id, ", Movie: ", movie_title, ", Genre: ", movie_genre, ", Rating: ", value)

Top 10 Most Similar Users for user:  1
User ID: 473 , Score:  0.5833880050386824
User ID: 511 , Score:  0.5641251533887106
User ID: 9 , Score:  0.5584611994522223
User ID: 13 , Score:  0.536324560792986
User ID: 366 , Score:  0.5314748745508778
User ID: 535 , Score:  0.5292151352663781
User ID: 401 , Score:  0.5268418313285088
User ID: 90 , Score:  0.501521730326078
User ID: 157 , Score:  0.48913475157139613
User ID: 476 , Score:  0.4873339411717199


Top 20 Recommended Movies for user:  1


Movie Id: 319 , Movie:  Shallow Grave (1994) , Genre:  Comedy|Drama|Thriller , Rating:  6.769157088122605
Movie Id: 27482 , Movie:  Cube 2: Hypercube (2002) , Genre:  Horror|Mystery|Sci-Fi , Rating:  6.223522167487685
Movie Id: 105653 , Movie:  Escape Plan (2013) , Genre:  Action|Mystery|Thriller , Rating:  6.222049413437611
Movie Id: 1475 , Movie:  Kama Sutra: A Tale of Love (1996) , Genre:  Romance , Rating:  6.012212643678161
Movie Id: 1428 , Movie:  Angel Baby (1995) , Genre:  Drama , Rating:  