<a href="https://colab.research.google.com/github/sarand0/Mass-Personalization-in-Recommender-Systems-Independent-Research/blob/main/Mass_Personalization_in_Recommender_Systems_An_Analysis_of_Amazon_User_Ratings_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mass Personalization in Recommender Systems: An Analysis of Amazon User Ratings Data by Saran Duncan in Collaboration with Research Advisor Professor Forrest Davis**

This notebook explores the Amazon Reviews data described in [this](https://amazon-reviews-2023.github.io/index.html#data-fields) repo.

### Dataset
The dataset authors provide several processed datasets covering May 2000 to Sep. 2023. This notebook focuses on the Amazon data in the Beauty and Personal Care category.

train.rating:
- Train file.
- Each line is a training instance: userID\t itemID\t rating\t timestamp (if available)

test.rating:
- Test file (positive instances).
- Each line is a testing instance: userID\t itemID\t rating\t timestamp (if available)

test.negative:
- Test file (negative instances).
- Each line corresponds to the line of test.rating, containing 99 negative samples.  
- Each line is in the format: (userID,itemID)\t negativeItemID1\t negativeItemID2 ...

In [None]:
!pip install datasets
from datasets import load_dataset

In [None]:
# Load the 'train' split of the "0core_timestamp_Beauty_and_Personal_Care"
# configuration and convert it to a pandas DataFrame.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally — confirm the source is trusted.
data = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                      "0core_timestamp_Beauty_and_Personal_Care",
                      trust_remote_code=True, split='train').to_pandas()

In [None]:
# Preview the first rows of the loaded DataFrame.
print(data.head())

In [None]:
# Report the number of rows and one extreme value per key column.
# NOTE(review): 'user_id' and 'parent_asin' look like string identifiers, in
# which case their max/min are lexicographic — confirm that is intended.
summary_lines = [
    f"Data Length: {len(data)}",
    f"Max User ID: {data['user_id'].max()}",
    f"Min Parent ASIN: {data['parent_asin'].min()}",
    f"Min Rating: {data['rating'].min()}",
    f"Min Timestamp: {data['timestamp'].min()}",
    f"Max Timestamp: {data['timestamp'].max()}",
]
print("\n".join(summary_lines))

In [None]:
# List the column labels of the loaded DataFrame.
print(data.columns)

In [None]:
# Keep the four rating columns and give them human-readable headers.
column_names2 = ['User ID', 'Product ID', 'Rating', 'Timestamp']
source_columns = ['user_id', 'parent_asin', 'rating', 'timestamp']

# Pair every source column with its display name, then select and rename in one chain.
display_names = dict(zip(source_columns, column_names2))
data_subset = data[source_columns].rename(columns=display_names)

# Show the first rows of the renamed subset.
print(data_subset.head())

In [None]:
# Derive the calendar year of every rating and show the data with it.
import pandas as pd

timestamps = data['timestamp']
# The timestamps are assumed to be milliseconds since the epoch.
df = pd.DataFrame(timestamps).assign(
    date=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms')
)
data['year'] = df['date'].dt.year
print(data.head())

In [None]:
# Number of distinct years in which each user rated products, largest first.
import pandas as pd
years_per_user = data.groupby('user_id')['year'].nunique()
years_per_user.sort_values(ascending=False)

In [None]:
# (user, year) combinations with more than 4 ratings, largest count first.
ratings_per_user_year = data.groupby(['user_id', 'year'])['rating'].count().reset_index(name='rating_count')
has_many_ratings = ratings_per_user_year['rating_count'] > 4
filtered_data = ratings_per_user_year[has_many_ratings].sort_values(by='rating_count', ascending=False)
print(filtered_data)

In [None]:
# Group the users who rated the same product in the same year.
product_groups = data.groupby(['parent_asin', 'year'])['user_id'].apply(list).reset_index(name='users')
# Count before filtering: adding a column to an already-filtered frame
# triggers pandas' SettingWithCopyWarning.
product_groups['user count'] = product_groups['users'].apply(len)
# Keep only the groups that contain more than one user.
product_groups = product_groups[product_groups['user count'] > 1]
print(product_groups)

In [None]:
# Display the product groups ordered from the most to the fewest users.
product_groups.sort_values(by='user count', ascending=False)

In [None]:
from datasets import load_dataset
import pandas as pd
from collections import defaultdict
import json

def get_avg(product_groups, outfile):
    """Write per-year user-overlap statistics for the given product groups.

    For every distinct year in ``product_groups`` this:

    * pickles that year's rows to ``./Group{year}.pkl``;
    * counts, for every pair of users, how many product groups of that year
      contain both of them, and dumps the symmetric nested mapping to
      ``user_overlap_{year}.json``;
    * writes one ``year<TAB>pair count<TAB>average overlap`` line to
      ``outfile``, considering only pairs that share more than one group.

    Args:
        product_groups: DataFrame with a ``year`` column and a ``users``
            column holding the list of user ids of each product group.
        outfile: Writable text file object that receives the summary lines.
    """
    # Imported locally so the function does not depend on a file-level import.
    from itertools import combinations

    for year in product_groups['year'].unique():
        subset = product_groups[product_groups['year'] == year]
        subset.to_pickle(f"./Group{year}.pkl")

        # user -> other user -> number of product groups both appear in
        user_overlap = defaultdict(lambda: defaultdict(int))

        for users in subset['users']:
            # A user listed twice in one group must not be paired with
            # themselves, nor make a single group count twice for a pair.
            unique_users = list(dict.fromkeys(users))
            for user1, user2 in combinations(unique_users, 2):
                # Record both directions so lookups work from either user.
                user_overlap[user1][user2] += 1
                user_overlap[user2][user1] += 1

        with open(f'user_overlap_{year}.json', 'w') as f:
            json.dump(user_overlap, f)

        pair_count = 0
        tot_count = 0
        for overlaps in user_overlap.values():
            for count in overlaps.values():
                if count > 1:
                    pair_count += 1
                    tot_count += count
        # The mapping is symmetric, so the loop above saw every pair twice.
        pair_count //= 2
        tot_count //= 2

        # Report 0 rather than dividing by zero when no pair qualifies.
        avg_overlap = tot_count / pair_count if pair_count else 0
        outfile.write(f"{year}\t{pair_count}\t{avg_overlap}\n")

def main():
    """Build the per-year product groups and write their overlap summary.

    Loads the dataset, derives a ``year`` column from the timestamps, groups
    the users by (product, year), keeps the groups with more than one user and
    passes them to ``get_avg``, which fills ``info.tsv`` below the header row.
    """
    # Load the whole 'train' split as a regular (non-streaming) dataset
    data = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                          "0core_timestamp_Beauty_and_Personal_Care",
                          trust_remote_code=True, split='train').to_pandas()
    # Convert timestamps to datetime, assuming they are in milliseconds
    data['date'] = pd.to_datetime(data['timestamp'], unit='ms')
    data['year'] = data['date'].dt.year

    # Get product groups
    product_groups = data.groupby(['parent_asin', 'year'])['user_id'].apply(list).reset_index(name='users')
    # Count before filtering: adding a column to an already-filtered frame
    # triggers pandas' SettingWithCopyWarning.
    product_groups['user count'] = product_groups['users'].apply(len)
    product_groups = product_groups[product_groups['user count'] > 1]

    # The context manager closes the file even if get_avg raises.
    with open('info.tsv', 'w') as outfile:
        outfile.write("year\tcount\tavg\n")
        get_avg(product_groups, outfile)
main()

In [None]:
# Select the product groups whose year is 2019 and display them.
group_2019 = product_groups[product_groups['year']==2019]
print(group_2019)

In [None]:
# Save the 2019 groups to disk, then read the file back to display what was stored.
group_2019.to_pickle("./Group2019.pkl")
pd.read_pickle("./Group2019.pkl")

In [None]:
#Finding similarities for users in 2019
import pandas as pd
from collections import defaultdict
from itertools import combinations

group_2019 = pd.read_pickle("./Group2019.pkl")

# user -> other user -> number of 2019 product groups both appear in
user_overlap = defaultdict(lambda: defaultdict(int))

# Iterate through each product group in 2019
for users in group_2019['users']:
    # A user listed twice in one group must not be paired with themselves,
    # nor make a single group count twice for a pair.
    unique_users = list(dict.fromkeys(users))

    # Iterate through all pairs of users within the group
    for user1, user2 in combinations(unique_users, 2):
        # Increment the overlap count in both directions
        user_overlap[user1][user2] += 1
        user_overlap[user2][user1] += 1

# Print every pair sharing more than one product group. The mapping is
# symmetric, so remember what was printed to report each pair only once.
reported = set()
for user1, overlaps in user_overlap.items():
    for user2, count in overlaps.items():
        if count > 1 and (user2, user1) not in reported:
            reported.add((user1, user2))
            print(f"User {user1} and User {user2} have {count} product groups in common.")


In [None]:
# Average number of shared product groups over the pairs sharing more than one.
pair_count = 0
tot_count = 0
for overlaps in user_overlap.values():
    for count in overlaps.values(): #count of how many similar product groups exist
        if count > 1:
            pair_count += 1
            tot_count += count
# Report 0 instead of raising ZeroDivisionError when no pair qualifies,
# the same way get_avg does.
avg_overlap = tot_count/pair_count if pair_count else 0
print(avg_overlap)


In [None]:
from scipy.sparse import csr_matrix

# transform matrix to scipy sparse matrix
# NOTE(review): user_to_product_df is not created in any cell visible here —
# presumably a user x product rating table; confirm where it is built.
user_to_product_sparse_df = csr_matrix(user_to_product_df.values)
user_to_product_sparse_df

**Fitting K-Nearest Neighbours model to the scipy sparse matrix:**

In [None]:
from sklearn.neighbors import NearestNeighbors

# Configure a nearest-neighbour search with the cosine metric and the brute
# algorithm, then fit it on the sparse user-product matrix.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_product_sparse_df)

**Specify User ID and number of similar users we want to consider here**

In [None]:
import numpy as np
from pprint import pprint

# User whose neighbours and recommendations are computed below.
user_id = 'AE2CVCNCDLMNEBC6XZLMTHJTYEXA'
print(" Few of the products rated by the User:")
# Show at most 10 product ids from the rows belonging to this user.
# NOTE(review): items_data is not created in any cell visible here — confirm
# where it is built and that it has 'User ID' / 'Product ID' columns.
pprint(list(items_data[items_data['User ID'] == user_id]['Product ID'])[:10])

# function to find top n similar users of the given input user
def get_similar_users(user, n = 5):
  """Find the ``n`` users whose rating rows are closest to ``user``'s.

  Looks ``user`` up in ``user_to_product_df``, queries ``knn_model`` with that
  row and prints the neighbours that were found.

  Args:
    user: Label of the user in ``user_to_product_df.index``.
    n: Number of similar users wanted.

  Returns:
    A tuple ``(similar_users, distances)``: the list of the neighbours' index
    labels and the array of their distances, in the order the model returned.
  """
  user_index = user_to_product_df.index.get_loc(user) # Get the index corresponding to the user ID
  knn_input = np.asarray([user_to_product_df.values[user_index]])
  print(knn_input.sum(axis=-1))
  # Ask for one extra neighbour: the query row is itself part of the fitted data.
  distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
  distances = distances.flatten()
  indices = indices.flatten()

  # Drop the query user by row position instead of assuming it comes first:
  # rows tied at the same distance may be returned ahead of it.
  is_other_user = indices != user_index
  neighbor_indices = indices[is_other_user][:n]
  neighbor_distances = distances[is_other_user][:n]

  print("Top",n,"users who are very similar to the user-",user, "are: ")
  print(" ")

  # Get the user IDs of similar users
  similar_users = [user_to_product_df.index[i] for i in neighbor_indices]

  for rank, (similar_user_id, distance) in enumerate(zip(similar_users, neighbor_distances), start=1):
    print(rank,". User:", similar_user_id, "separated by distance of", distance)

  return similar_users, neighbor_distances  # Similar user IDs and their distances

# Look up the 5 users most similar to the chosen user, with their distances.
similar_user_list, distance_list = get_similar_users(user_id,5)

**Now we have to pick the top products to recommend, which we can do by assigning weights to the ratings made by similar users.**

In [None]:
# Display the similar users together with their distances.
similar_user_list, distance_list

In [None]:
# Turn the cosine distances into weights. A smaller distance means a more
# similar user, so weight by similarity (1 - distance) instead of by the
# distance itself, which would favour the least similar neighbours.
# NOTE(review): assumes distances lie in [0, 1] (non-negative ratings) — confirm.
similarity_list = 1 - distance_list
similarity_total = np.sum(similarity_list)
if similarity_total > 0:
    weight_list = similarity_list/similarity_total
else:
    # No neighbour has any similarity to the user: fall back to equal weights.
    weight_list = np.ones_like(distance_list, dtype=float)/max(len(distance_list), 1)
weight_list

**Getting ratings of all products by derived similar users**

In [None]:
import numpy as np

# Row positions of the similar users in the user-product table.
similar_user_indices = [user_to_product_df.index.get_loc(similar_user) for similar_user in similar_user_list]
# Select those users' raw ratings. The weights are applied once, in the later
# weight_list*product_ratings_sim_users step; multiplying here as well would
# weight every rating twice.
product_ratings_sim_users = user_to_product_df.values[similar_user_indices]
product_ratings_sim_users

In [None]:
# The column labels of the user-product table, kept for mapping scores back to products.
products_list = user_to_product_df.columns
products_list

In [None]:
# Sanity-check the sizes before combining weights and ratings: the length of
# weight_list, the shape of the ratings matrix and the number of products.
print("Weight list shape:", len(weight_list))
print("product_ratings_sim_users shape:", product_ratings_sim_users.shape)
print("Number of products:", len(products_list))

**Broadcasting the weight vector to the shape of the similar-user rating matrix, so that the two can be multiplied element-wise**

In [None]:
# Adding a zero vector of length len(products_list) to the column of weights
# repeats each weight once per product.
# NOTE(review): weight_list is rebound here, so re-running this cell would add
# another axis — confirm the cell is only run once per weight computation.
weight_list = weight_list[:,np.newaxis] + np.zeros(len(products_list))
weight_list.shape

In [None]:
# Weight every rating element-wise, then sum over axis 0 to get one score per column.
new_rating_matrix = weight_list*product_ratings_sim_users
mean_rating_list = new_rating_matrix.sum(axis =0)
mean_rating_list

In [None]:
from pprint import pprint
def recommend_products(n):
  """Pretty-print the ``n`` products with the highest weighted score.

  Reads the notebook-level ``mean_rating_list`` and ``products_list``; ``n`` is
  capped at the number of scored products.
  """
  top_n = min(len(mean_rating_list), n)
  # Positions of the scores from highest to lowest.
  ranked_positions = np.argsort(mean_rating_list)[::-1]
  best_positions = ranked_positions[:top_n]
  pprint(list(products_list[best_positions]))

In [None]:
# Print the 10 highest-scoring products.
print("Products recommended based on similar users are: ")
recommend_products(10)

In [None]:
#Plotting a Scatterplot of the avg number of product groups in common over the range of years
import pandas as pd
import plotly.express as px

# Only reading the file can raise the errors reported below, so the plotting
# code lives in the else branch instead of the try body.
try:
    df = pd.read_csv('info.tsv', sep='\t')
except pd.errors.EmptyDataError:
    print("The file 'info.tsv' is empty or has an incorrect format. Please check the data and file format.")
except FileNotFoundError:
    print("The file 'info.tsv' was not found. Please ensure it has been created and is in the correct directory.")
else:
    fig = px.scatter(df,
                     x='year',
                     y='avg',
                     hover_name='year',
                     title='Average Number of Product Groups in Common Over the Years',
                     labels={'year': 'Year', 'avg': 'Average Overlap'})

    fig.show()

In [None]:
#Plotting a Scatterplot of the avg number of product groups in common over the range of years, with each point size reflecting the number of user pairings
import pandas as pd
import plotly.express as px

# Only reading the file can raise the errors reported below, so the plotting
# code lives in the else branch instead of the try body.
try:
    df = pd.read_csv('info.tsv', sep='\t')
except pd.errors.EmptyDataError:
    print("The file 'info.tsv' is empty or has an incorrect format. Please check the data and file format.")
except FileNotFoundError:
    print("The file 'info.tsv' was not found. Please ensure it has been created and is in the correct directory.")
else:
    fig = px.scatter(df,
                     x='year',
                     y='avg',
                     # Scale every point by that year's number of user pairs.
                     size='count',
                     hover_name='year',
                     title='Average Number of Product Groups in Common Over the Years',
                     labels={'year': 'Year', 'avg': 'Average Overlap', 'count': 'Number of Pairs'})

    fig.show()
