In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
links = pd.read_csv('../../data/links.csv')
movies = pd.read_csv('../../data/movies.csv')
ratings = pd.read_csv('../../data/ratings.csv')
tags = pd.read_csv('../../data/tags.csv')

In [4]:
print(f'There are {movies["title"].nunique()} movies in the dataset')
print(f'There is data from {ratings["userId"].nunique()} users in the dataset')

avg_ratings = len(ratings)/ratings['userId'].nunique()
print(f'The average amount of ratings per user is {avg_ratings}')

There are 9737 movies in the dataset
There is data from 610 users in the dataset
The average amount of ratings per user is 165.30491803278687


In [15]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [17]:
movies_ratings = movies.merge(ratings, how='left', on='movieId')
movies_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [33]:
with_tags = movies_ratings.merge(tags, how='left', on=['movieId', 'userId'])
with_tags.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0,,
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0,,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0,,
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0,,
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0,,


In [34]:
with_tags['tag'].count()

np.int64(3476)

In [7]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

def ids_encoder(ratings):
    users = sorted(ratings['userId'].unique())
    items = sorted(ratings['movieId'].unique())

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # fit users and items ids to the corresponding encoder and transform
    uencoder.fit(users)
    iencoder.fit(items)
    ratings['userId'] = uencoder.transform(ratings['userId'].tolist())
    ratings['movieId'] = iencoder.transform(ratings['movieId'].tolist())

    return ratings, uencoder, iencoder

ratings, uencoder, iencoder = ids_encoder(ratings)

def ratings_matrix(ratings):
    return csr_matrix(pd.crosstab(ratings['userId'], ratings['movieId'], values=ratings['rating'], aggfunc="sum").fillna(0).values)

R = ratings_matrix(ratings)
R

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 100836 stored elements and shape (610, 9724)>

# Non-Personalised Recommender Systems
We compute the mean rating of each user and the normalized ratings for each item. The DataFrame mean contains the mean rating for each user. With the mean rating of each user, we can add an extra column norm_rating to the ratings's DataFrame which can be accessed to make predictions. This simplifies needed calculations.

In [8]:
# mean ratings for each user
user_means = ratings.groupby(by='userId', as_index=False)['rating'].mean()
mean_ratings = pd.merge(ratings, user_means, suffixes=('','_mean'), on='userId')
# normalized ratings for each items
mean_ratings['norm_rating'] = mean_ratings['rating'] - mean_ratings['rating_mean']

user_means = user_means.to_numpy()

print(user_means)
print(mean_ratings)
print(ratings[ratings['movieId'] == 10])
print(mean_ratings[mean_ratings['movieId'] == 10])



[[  0.           4.36637931]
 [  1.           3.94827586]
 [  2.           2.43589744]
 ...
 [607.           3.13417569]
 [608.           3.27027027]
 [609.           3.68855607]]
        userId  movieId  rating   timestamp  rating_mean  norm_rating
0            0        0     4.0   964982703     4.366379    -0.366379
1            0        2     4.0   964981247     4.366379    -0.366379
2            0        5     4.0   964982224     4.366379    -0.366379
3            0       43     5.0   964983815     4.366379     0.633621
4            0       46     5.0   964982931     4.366379     0.633621
...        ...      ...     ...         ...          ...          ...
100831     609     9416     4.0  1493848402     3.688556     0.311444
100832     609     9443     5.0  1493850091     3.688556     1.311444
100833     609     9444     5.0  1494273047     3.688556     1.311444
100834     609     9445     5.0  1493846352     3.688556     1.311444
100835     609     9485     3.0  1493846415     3.

To consider deviation of user ratings, (How much does a user like an item compared to how much he normally likes an item?), we normalize the ratings of a particular user against a particular item by considering the average rating of that user across all items. It might be useful to note that:

- `ratings[u,:]` represents the uth row of a rating matrix `rating`, i.e., ratings given by a user u across all items.
- `ratings[:,i]` represents the ith column of a rating matrix `rating`, i.e., ratings given to an item by all users.


In [9]:
def normalized_average_predict(userid, itemid, ratings, mean_ratings, user_means):
    """
    predict what score userid would have given to itemid using normalized average.

    :param
        - userid : user id for which we want to make 'prediction'
        - itemid : item id on which we want to make 'prediction'
        - ratings : ratings df
        - mean_ratings : df with normalized ratings
        - user_means : array of user mean ratings

    :return
        - r_hat : predicted rating of user userid on item itemid
    """
    user_mean = user_means[userid][1]
    item_ratings = ratings[ratings['movieId'] == itemid] #all ratings for item id

    if len(item_ratings) == 0:
        return user_mean

    norm_item_ratings = mean_ratings[mean_ratings['movieId'] == itemid]['norm_rating'].values

    if len(norm_item_ratings) == 0 or np.sum(np.abs(norm_item_ratings)) == 0:
        return user_mean

    r_hat = user_mean + np.mean(norm_item_ratings)

    return r_hat


## Weighted Rating in Recommender Systems

In many recommender systems, it is crucial to balance the impact of an item's individual ratings with the overall average rating across all items. This approach prevents items with a limited number of reviews from appearing disproportionately high or low due to statistical noise. One effective method to achieve this is by using a weighted scoring system, sometimes referred to as the Bayesian average or shrinkage method.

The weighted rating for an item **j** is computed as follows:

$$
WR(j) = \frac{v}{v + m} \cdot U(j) + \frac{m}{v + m} \cdot C
$$

Where:

- **U(j)** is the mean (average) rating for item **j**.
- **v** is the number of ratings that item **j** has received.
- **C** is the global mean rating (i.e., the average rating across all items).
- **m** is a threshold representing the minimum number of ratings required for an item's rating to be considered reliable.

This method is widely used in scenarios where items (such as movies, books, or products) have varying numbers of reviews. By integrating this weighted scoring approach, the recommender system can reduce the impact of items with very few ratings that might otherwise skew the recommendation and provide a more stable and trustworthy ranking by combining individual item ratings with the overall trend.





In [10]:
C = ratings['rating'].mean()

def weighted_rating_predict(movieId, ratings, m):
    item_ratings = ratings[ratings['movieId'] == movieId]

    if item_ratings.empty:
        return C

    v = len(item_ratings)
    U = item_ratings['rating'].mean()

    weighted_rating = (v/(v+m)) * U + (m/(v+m)) * C
    return weighted_rating

m = 5
unique_movie_ids = ratings['movieId'].unique()
weighted_scores = {
    movieId: weighted_rating_predict(movieId, ratings, m)
    for movieId in unique_movie_ids
}

weighted_ratings_df = pd.DataFrame(
    list(weighted_scores.items()),
    columns=['movieId', 'weighted_rating']
)


top_n = pd.merge(weighted_ratings_df, movies, how="inner", on='movieId')
top_n = top_n.sort_values(by="weighted_rating", ascending=False)
top_n


Unnamed: 0,movieId,weighted_rating,title,genres
164,277,4.414620,Miracle on 34th Street (1994),Drama
1575,840,4.280311,House Arrest (1996),Children|Comedy
2555,882,4.235752,"Trigger Effect, The (1996)",Drama|Thriller
782,921,4.231401,My Favorite Year (1982),Comedy
529,602,4.230468,"Great Day in Harlem, A (1994)",Documentary
...,...,...,...,...
3189,2104,2.154445,Tex (1982),Drama
4113,5170,2.150778,All About the Benjamins (2002),Action|Comedy|Crime
2784,2103,2.147517,Tall Tale (1995),Adventure|Children|Fantasy|Western
3140,3799,2.077522,Pokémon the Movie 2000 (2000),Animation|Children


## Product Association Driven Non-Personalised Recommender System
- **Support** measueres how frequently an itemset appears in the transaction data. Indicates the popularity of the itemset. For an itemset A:
$$Support(A) = \frac{Number of transactions containing A}{Total number of transaction}$$
- **Confidence** quantifies the likelihood of finding the consequent in transactions under the conditions that these transactions also contain the antecedent. It is the strenght or how reliable the rule is. For a rule A -> B:
$$Confidence(A -> B) = \frac{Support(A union B)}{Support(A)}$$
- **Lift** compares the observed confidence of the rule to the confidence expected if the items were independent, in such a way that if lift > 1, then A and B appear together more often that they would be expected to by chance.
$$Lift(A -> B) = \frac{Confidence(A->B)}{Support(B)}$$
- **Coverage** is the proportion that contain the antecedent of a rule. It reflects the general applicability of the rule. High coverage means the rule can potentially impact a large portion of transactions.


In [14]:
from mlxtend.frequent_patterns import apriori, association_rules

# threshold to consider a movie "liked" by a user.
threshold_rating = 4.0
liked_ratings = ratings[ratings['rating'] >= threshold_rating]
basket = liked_ratings.groupby(['userId', 'movieId'])['rating'].count().unstack().fillna(0)
print(basket)

# Binarize the data: 1 if the movie is liked by the user, else 0.
basket = basket.applymap(lambda x: 1 if x >= 1 else 0)

min_support = 0.20  # Adjust based on dataset size and desired sensitivity
frequent_itemsets = apriori(basket, min_support=min_support, use_colnames=True)
print(frequent_itemsets)

# ---------------------------
# Association Rule Generation
# ---------------------------
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Compute coverage as the antecedent support (i.e., the fraction of users for which the rule applies)
rules['coverage'] = rules['antecedent support']

# Display the rules with key metrics
print("Association Rules (selected columns):")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'coverage']])


movieId  0     1     2     4     5     6     7     8     9     10    ...  \
userId                                                               ...   
0         1.0   0.0   1.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0  ...   
1         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4         1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
605       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
606       1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
607       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0  ...   
608       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0  ...   
609       1.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0  ...   

movieId  96

  basket = basket.applymap(lambda x: 1 if x >= 1 else 0)


     support      itemsets
0   0.241379           (0)
1   0.229885          (43)
2   0.267652          (46)
3   0.272578          (97)
4   0.208539         (123)
5   0.330049         (224)
6   0.400657         (257)
7   0.449918         (277)
8   0.408867         (314)
9   0.224959         (398)
10  0.238095         (418)
11  0.287356         (461)
12  0.246305         (507)
13  0.369458         (510)
14  0.224959         (520)
15  0.259442         (659)
16  0.275862         (897)
17  0.267652         (899)
18  0.246305         (910)
19  0.231527        (1502)
20  0.364532        (1938)
21  0.205255        (2077)
22  0.249589        (2144)
23  0.293924        (2224)
24  0.200328        (3136)
25  0.239737        (3633)
26  0.215107        (4131)
27  0.229885        (4791)
28  0.203612     (257, 46)
29  0.236453    (224, 897)
30  0.206897    (224, 910)
31  0.259442    (257, 277)
32  0.223317    (257, 314)
33  0.249589    (257, 510)
34  0.208539   (257, 1938)
35  0.200328   (2224, 257)
3



In [17]:
def product_association_recommender(product, rules, top_n=5):
    """
    Recommend products based on association rules for a given product.

    Parameters:
        product (int): The product (movieId) for which we want recommendations.
        rules (DataFrame): Association rules with columns 'antecedents' and 'consequents'.
        top_n (int): Number of recommendations to return.

    Returns:
        DataFrame: Top association rules containing the product in the antecedents.
    """
    # Filter rules where the product is in the antecedents (antecedents are stored as frozensets)
    rules_filtered = rules[rules['antecedents'].apply(lambda x: product in x)]

    if rules_filtered.empty:
        print(f"No association rules found for product {product}.")
        return pd.DataFrame(columns=['consequents', 'support', 'confidence', 'lift', 'coverage'])

    # Sort the filtered rules by lift and confidence in descending order
    rules_filtered = rules_filtered.sort_values(by=['lift', 'confidence'], ascending=False)

    return rules_filtered.head(top_n)

# ---------------------------
# Step 5: Generate Recommendations
# ---------------------------
# Example usage: Recommend products for movieId 10.
recommended_rules = product_association_recommender(46, rules, top_n=5)

# Extract recommended product IDs from the consequents of the rules
recommended_products = []
for _, row in recommended_rules.iterrows():
    for prod in row['consequents']:
        recommended_products.append(prod)
# Remove duplicates if a product appears in multiple rules
recommended_products = list(set(recommended_products))

# Merge with the movies DataFrame to get movie details
recommended_movies = movies[movies['movieId'].isin(recommended_products)]
print("\nRecommended Movies based on Product Association:")
print(recommended_movies)


Recommended Movies based on Product Association:
     movieId              title            genres
221      257  Just Cause (1995)  Mystery|Thriller
