<h2>Recommendation Systems using Collaborative Filtering</h2>

This project aims at generating movie recommendations for users using their past ratings, generes and movies watched.

<p>Load the dataset of movies and ratings </p>

In [15]:
import pandas as pd
movies = pd.read_csv("./ml-latest-small/movies.csv")
ratings = pd.read_csv("./ml-latest-small/ratings.csv")

<p>Get year from title</p>

In [16]:
import re

def get_numbers(title):
    comp = re.compile(r"\([0-9]*\)")
    m = comp.search(title)
    try:
        return m.group(0)[1:-1]
    except:
        return None

get_numbers("abc (1880) ")

'1880'

<h3>Generate hash from the generes</h3>

<p>Process</p>
<ul>
<li>Sort the genere values</li>
<li>hash the genere values to generate a hash</li>
<li>Select the first 10 charachters of the hash</li>
</ul>

In [17]:
import base64
import hashlib

def generate_hash(generes):
    generes =sorted(generes)
    return base64.b64encode(hashlib.sha256(str(generes).encode('utf-8')).digest()).decode("utf-8")

generate_hash(movies["genres"].iloc[0])[0:6]

'K9iU4C'

<ul>
    <li>Apply get_numbers and get year from the title</li>
    <li>Apply generate hash to get hash of generes </li>
</ul>

In [18]:
movies["genres"] = movies["genres"].apply(lambda x: x.split("|")).apply(lambda x: sorted(x)).apply(lambda x: list(set(x)))
movies["year"] = movies["title"].apply(get_numbers)
movies["hash"] = movies["genres"].apply(generate_hash).apply(lambda x: x[0:6])
movies

Unnamed: 0,movieId,title,genres,year,hash
0,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt
1,2,Jumanji (1995),"[Fantasy, Adventure, Children]",1995,uJWHzH
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995,qWkntQ
3,4,Waiting to Exhale (1995),"[Comedy, Romance, Drama]",1995,s0KnRK
4,5,Father of the Bride Part II (1995),[Comedy],1995,5Sz8a+
...,...,...,...,...,...
9120,162672,Mohenjo Daro (2016),"[Adventure, Romance, Drama]",2016,0aBSrv
9121,163056,Shin Godzilla (2016),"[Sci-Fi, Fantasy, Adventure, Action]",2016,fmLiNY
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,[Documentary],2016,0iz2x0
9123,164977,The Gay Desperado (1936),[Comedy],1936,5Sz8a+


<h3>Merge movies and ratings on movieId </h3>

In [19]:
merged = movies.merge(ratings, on=["movieId"])
merged 

Unnamed: 0,movieId,title,genres,year,hash,userId,rating,timestamp
0,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt,7,3.0,851866703
1,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt,9,4.0,938629179
2,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt,13,5.0,1331380058
3,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt,15,2.0,997938310
4,1,Toy Story (1995),"[Comedy, Fantasy, Animation, Adventure, Children]",1995,Tc25zt,19,3.0,855190091
...,...,...,...,...,...,...,...,...
99999,161944,The Last Brickmaker in America (2001),[Drama],2001,oUSIbJ,287,5.0,1470167824
100000,162376,Stranger Things,[Drama],,oUSIbJ,73,4.5,1474255532
100001,162542,Rustom (2016),"[Thriller, Romance]",2016,wNdQXu,611,5.0,1471520667
100002,162672,Mohenjo Daro (2016),"[Adventure, Romance, Drama]",2016,0aBSrv,611,3.0,1471523986


<h3>Select columns which are very necessary</h3>

In [20]:
df = merged[["userId","movieId","rating","timestamp","hash"]]

In [21]:
movieHash = df[["movieId","hash"]].drop_duplicates("movieId").reset_index(drop=True)
movieHash

Unnamed: 0,movieId,hash
0,1,Tc25zt
1,2,uJWHzH
2,3,qWkntQ
3,4,s0KnRK
4,5,5Sz8a+
...,...,...
9061,161944,oUSIbJ
9062,162376,oUSIbJ
9063,162542,wNdQXu
9064,162672,0aBSrv


<h3>Generate index for hash</h3>

In [22]:
hashIndexes = movieHash["hash"].drop_duplicates().reset_index(drop=True).reset_index()

In [23]:
df['hashIndex'] = df['hash'].apply(lambda x: hashIndexes[hashIndexes["hash"] == x]["index"].values[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashIndex'] = df['hash'].apply(lambda x: hashIndexes[hashIndexes["hash"] == x]["index"].values[0])


<h3>Create Pivot Table between userId and hash Indexes</h3>

In [10]:
def create_pivot_table(df):
    user_item_matrix = df.pivot_table(index='userId', columns='hashIndex', values='rating', fill_value=0)
    return user_item_matrix

user_item_matrix = create_pivot_table(df)

In [11]:
user_item_matrix

hashIndex,0,1,2,3,4,5,6,7,8,9,...,891,892,893,894,895,896,897,898,899,900
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.00,0.0,0.000000,0.000000,0.000000,4.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00,0.0,3.428571,3.000000,3.000000,3.666667,0.0,0.0,4.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,3.0,0.000000,4.000000,3.000000,3.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,5.0,4.571429,4.000000,4.416667,4.333333,0.0,0.0,4.333333,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3.75,4.0,4.214286,3.700000,3.909091,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.00,0.0,3.166667,3.750000,4.500000,3.000000,0.0,0.0,3.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.00,0.0,0.000000,0.000000,1.666667,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.00,0.0,4.000000,4.000000,3.500000,4.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.00,0.0,2.000000,0.000000,4.500000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<p>Create a cosine similarity index between user and items</p>

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_similarity_matrix(user_item_matrix):
    # Compute cosine similarity matrix between users
    similarity_matrix = cosine_similarity(user_item_matrix)

    return similarity_matrix

similarity_matrix = create_similarity_matrix(user_item_matrix)
similarity_matrix

array([[1.        , 0.21315519, 0.2037731 , ..., 0.31787832, 0.1989366 ,
        0.2420832 ],
       [0.21315519, 1.        , 0.40112379, ..., 0.34201091, 0.39901595,
        0.48401261],
       [0.2037731 , 0.40112379, 1.        , ..., 0.37339854, 0.28741602,
        0.41732093],
       ...,
       [0.31787832, 0.34201091, 0.37339854, ..., 1.        , 0.24220445,
        0.29604803],
       [0.1989366 , 0.39901595, 0.28741602, ..., 0.24220445, 1.        ,
        0.39611284],
       [0.2420832 , 0.48401261, 0.41732093, ..., 0.29604803, 0.39611284,
        1.        ]])

In [25]:
# Neighborhood selection
neighborhoods = []
k = 3
def get_similar_users(similarity_matrix):
    for i in range(len(similarity_matrix)):
        # Sort the similarity values and get the indices of top-k similar users
        similar_users_indices = np.argsort(similarity_matrix[i])[:-k-1:-1]
        # Append the indices of top-k similar users to the neighborhoods list
        neighborhoods.append(similar_users_indices)
    return neighborhoods

neighborhoods = get_similar_users(similarity_matrix)

In [27]:
# Number of similar users to consider for prediction
k = 4

# Predict ratings for items


def predict_ratings(user_item_matrix, similarity_matrix, k):
    predicted_ratings = np.zeros_like(user_item_matrix, dtype=np.float64)
    for i in range(user_item_matrix.shape[0]):  # For each user
        for j in range(user_item_matrix.shape[1]):  # For each item
            if user_item_matrix.iloc[i, j] == 0:  # If the user hasn't rated the item
                # Get indices of top-k similar users
                similar_users_indices = np.argsort(similarity_matrix[i])[:-k-1:-1]
                # Calculate weighted average of ratings from similar users
                weighted_sum = 0
                sum_of_weights = 0
                for index in similar_users_indices:
                    if user_item_matrix.iloc[index, j] != 0:  # If the similar user has rated the item
                        similarity_weight = similarity_matrix[i, index]
                        weighted_sum += similarity_weight * user_item_matrix.iloc[index, j]
                        sum_of_weights += similarity_weight
                if sum_of_weights != 0:
                    predicted_ratings[i, j] = weighted_sum / sum_of_weights
    return predicted_ratings

predicted_ratings = predict_ratings(user_item_matrix, similarity_matrix, k)

In [28]:
# Generate recommendations for each user
recommendations = []

for i in range(user_item_matrix.shape[0]):  # For each user
    user_ratings = user_item_matrix.iloc[i]  # Ratings given by the user
    predicted_user_ratings = predicted_ratings[i]  # Predicted ratings for the user
    # Find indices of items that the user hasn't interacted with and have high predicted ratings
    recommended_indices = np.where((user_ratings == 0) & (predicted_user_ratings > 0))[0]
    recommendations.append(recommended_indices)


In [29]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Flatten actual and predicted ratings matrices
actual_ratings_flat = user_item_matrix.values.flatten()
predicted_ratings_flat = predicted_ratings.flatten()

# Remove elements where actual rating is 0 (unrated items are not considered for evaluation)
nonzero_indices = np.nonzero(actual_ratings_flat)
actual_ratings_nonzero = actual_ratings_flat[nonzero_indices]
predicted_ratings_nonzero = predicted_ratings_flat[nonzero_indices]

# Calculate MAE and RMSE
mae = mean_absolute_error(actual_ratings_nonzero, predicted_ratings_nonzero)
rmse = np.sqrt(mean_squared_error(actual_ratings_nonzero, predicted_ratings_nonzero))

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 3.5803180049329524
Root Mean Squared Error (RMSE): 3.714044185988285


In [30]:

# Select a random user
random_user_index = np.random.randint(0, len(df))
random_user = df.iloc[random_user_index]

# Select another user for comparison (can be the same user)
comparison_user_index = np.random.randint(0, len(df))
comparison_user = df.iloc[comparison_user_index]

# Display the ratings of both users
print("Random User:")
print(random_user)

print("\nComparison User:")
print(comparison_user)

# Compare the ratings
ratings_comparison = random_user.compare(comparison_user)

print("\nRatings Comparison:")
print(ratings_comparison)

Random User:
userId              311
movieId             830
rating              1.5
timestamp    1062015170
hash             5Sz8a+
hashIndex             4
Name: 21745, dtype: object

Comparison User:
userId             452
movieId           2718
rating             4.0
timestamp    976420184
hash            5Sz8a+
hashIndex            4
Name: 54290, dtype: object

Ratings Comparison:
                 self      other
userId            311        452
movieId           830       2718
rating            1.5        4.0
timestamp  1062015170  976420184


In [31]:
df

Unnamed: 0,userId,movieId,rating,timestamp,hash,hashIndex
0,7,1,3.0,851866703,Tc25zt,0
1,9,1,4.0,938629179,Tc25zt,0
2,13,1,5.0,1331380058,Tc25zt,0
3,15,1,2.0,997938310,Tc25zt,0
4,19,1,3.0,855190091,Tc25zt,0
...,...,...,...,...,...,...
99999,287,161944,5.0,1470167824,oUSIbJ,11
100000,73,162376,4.5,1474255532,oUSIbJ,11
100001,611,162542,5.0,1471520667,wNdQXu,296
100002,611,162672,3.0,1471523986,0aBSrv,244


In [32]:
top_recommendations = {}

for user in user_item_matrix.index:
    prt = pd.DataFrame(predicted_ratings)
    prt.columns = user_item_matrix.columns
    prt.index = user_item_matrix.index
    prt_pd = pd.DataFrame(prt.loc[user]).sort_values(by=user, ascending=False).head(3).index.tolist()
    top_hashes = hashIndexes[hashIndexes['index'].isin(prt_pd)]
    movies_user = movieHash[movieHash['hash'].isin(top_hashes['hash'])].merge(movies, on='hash', how='inner')
    #uitm_pd = pd.DataFrame(user_item_matrix.loc[user]).sort_values(by=1, ascending=False).head(3).index.tolist()
    top_year_uti = merged[merged['userId']==user].sort_values(by='rating', ascending=False).head(3)['year'].tolist()

    top_recommendations[user]= {
        "recommendations":movies[(movies['hash'].isin(top_hashes['hash'])) & (movies['year'].isin(top_year_uti))],
        "past_watched":merged[merged['userId']==user]
    }


<p>Users Most Watched Past: Id 2</p>

In [33]:
top_recommendations[2]["past_watched"].sort_values(by='rating', ascending=False).head(10)

Unnamed: 0,movieId,title,genres,year,hash,userId,rating,timestamp
16483,551,"Nightmare Before Christmas, The (1993)","[Fantasy, Animation, Children, Musical]",1993,tIGw/C,2,5.0,835355767
2116,39,Clueless (1995),"[Comedy, Romance]",1995,qWkntQ,2,5.0,835355604
7946,266,Legends of the Fall (1994),"[Western, War, Romance, Drama]",1994,omsiPr,2,5.0,835355586
17820,592,Batman (1989),"[Crime, Thriller, Action]",1989,gkQ1nl,2,5.0,835355395
17618,590,Dances with Wolves (1990),"[Western, Adventure, Drama]",1990,7S/Hcc,2,5.0,835355395
17381,589,Terminator 2: Judgment Day (1991),"[Sci-Fi, Action]",1991,AGigca,2,5.0,835355697
16870,585,"Brady Bunch Movie, The (1995)",[Comedy],1995,5Sz8a+,2,5.0,835355817
6444,222,Circle of Friends (1995),"[Romance, Drama]",1995,UbctDl,2,5.0,835355840
4533,150,Apollo 13 (1995),"[Adventure, IMAX, Drama]",1995,0QJIjY,2,5.0,835355395
7884,265,Like Water for Chocolate (Como agua para choco...,"[Fantasy, Romance, Drama]",1992,MeNlI5,2,5.0,835355697


<p>Users Recommendations: Id 2</p>

In [34]:
top_recommendations[2]["recommendations"]

Unnamed: 0,movieId,title,genres,year,hash
15,16,Casino (1995),"[Crime, Drama]",1995,J1NhkN
26,27,Now and Then (1995),"[Children, Drama]",1995,yAya2U
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,"[Crime, Drama]",1995,J1NhkN
32,34,Babe (1995),"[Children, Drama]",1995,yAya2U
34,36,Dead Man Walking (1995),"[Crime, Drama]",1995,J1NhkN
74,80,"White Balloon, The (Badkonake sefid) (1995)","[Children, Drama]",1995,yAya2U
89,97,"Hate (Haine, La) (1995)","[Crime, Drama]",1995,J1NhkN
106,117,"Young Poisoner's Handbook, The (1995)","[Crime, Drama]",1995,J1NhkN
213,241,Fluke (1995),"[Children, Drama]",1995,yAya2U
217,245,The Glass Shield (1994),"[Crime, Drama]",1994,J1NhkN


<h2>Train Test on divided datasets</h2>

In [35]:
from sklearn.model_selection import train_test_split

def train_test_split_custom(df):
    return train_test_split(df, test_size=0.2, stratify=df["userId"], random_state=42)

train, test = train_test_split_custom(df)

In [36]:
create_pivot_table(train)

hashIndex,0,1,2,3,4,5,6,7,8,9,...,890,891,892,893,894,895,897,898,899,900
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.00,0.0,0.000000,0.0,0.000000,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00,0.0,3.200000,3.0,4.000000,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,3.0,0.000000,4.0,3.000000,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,5.0,4.500000,4.0,4.777778,4.0,0.0,0.0,4.5,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3.75,4.0,4.166667,3.7,3.928571,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.00,0.0,3.166667,4.0,4.000000,3.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.00,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.00,0.0,4.000000,4.0,3.750000,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.00,0.0,2.000000,0.0,4.500000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from sklearn.model_selection import StratifiedKFold

num_folds = 5

kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_scores = []
match_rate = []
for fold, (train_ind, test_ind) in enumerate(kf.split(df, df['userId'])):
    train_dat, test_dat = df.iloc[train_ind], df.iloc[test_ind]
    user_item_matrix = create_pivot_table(train_dat)
    similarity_matrix = create_similarity_matrix(user_item_matrix)
    neighborhoods = get_similar_users(similarity_matrix)
    predicted_ratings = predict_ratings(user_item_matrix, similarity_matrix, k)
    recommendations = []
    match = 0
    for i in range(user_item_matrix.shape[0]):  # For each user
        user_ratings = user_item_matrix.iloc[i]  # Ratings given by the user
        predicted_user_ratings = predicted_ratings[i]  # Predicted ratings for the user
        # Find indices of items that the user hasn't interacted with and have high predicted ratings
        recommended_indices = np.where((user_ratings == 0) & (predicted_user_ratings > 0))[0]
        recommendations.append(recommended_indices)
        
        # find hashes related to the recommended_indices
        hashes = hashIndexes[hashIndexes["index"].isin(recommended_indices)]['hash'].tolist()
        actual_mv_ids = test[test["userId"].isin([i])].movieId.tolist()
        hashed_mv_ids = test[(test["userId"].isin([i])) & (test['hash'].isin(hashes))].movieId.tolist()

        if len(set(actual_mv_ids).intersection(set(hashed_mv_ids))) > 0:
            match +=1

    match_rate.append(match/user_item_matrix.shape[0])


In [39]:
print("Average Match Rate ",np.mean(match_rate))

Average Match Rate  0.8834575260804769
