### Imports

In [1]:
import json
import pandas as pd
import numpy as np
import torch

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer, util

from models.content_based_recommendation_base import SimpleContentBasedRec
from models.content_based_recommendation_bert import ContentBasedRecommendation
from models.matrix_factorization_based_recommendation import MatrixFactorization
from models.deep_learning_based_recommendation import DeepLearningRec
from models.hybrid_recommendation import HybridRecommendation

### Visualize Data

In [7]:
# Items Data
with open('products.json', 'r') as json_file:
    products = json.load(json_file)
products

{'1': {'name': 'Apple iPhone 12',
  'description': 'Smartphone with A14 Bionic chip',
  'category': 'Electronics'},
 '2': {'name': 'Nike Air Max',
  'description': 'Athletic shoes with cushion support',
  'category': 'Fashion'},
 '3': {'name': 'LEGO Star Wars',
  'description': 'Building toy set from Star Wars series',
  'category': 'Toys'},
 '4': {'name': 'Logitech Mouse',
  'description': 'Wireless optical mouse',
  'category': 'Electronics'},
 '5': {'name': "Levi's Jeans",
  'description': 'Denim straight fit jeans',
  'category': 'Fashion'},
 '6': {'name': 'Harry Potter',
  'description': 'Fantasy book series by J.K. Rowling',
  'category': 'Books'},
 '7': {'name': 'Fender Strat',
  'description': 'Electric guitar with classic tones',
  'category': 'Music'},
 '8': {'name': 'Canon DSLR',
  'description': 'High-resolution digital camera',
  'category': 'Electronics'},
 '9': {'name': 'Green Pan Set',
  'description': 'Non-stick, eco-friendly pan set',
  'category': 'Home'},
 '10': {'n

In [8]:
# Users Data
users = pd.read_csv('users.csv')
users.head()

Unnamed: 0,id,gender,location,age,occupation,preferred_category
0,1,M,New York,29,Software Eng,Electronics
1,2,F,Los Angeles,35,Musician,Music
2,3,F,Miami,22,Student,Books
3,4,M,Chicago,45,Chef,Kitchen
4,5,F,Seattle,31,Designer,Home Decor


In [9]:
# Interactions Data
interactions = pd.read_csv('interactions.csv')
interactions.head()

Unnamed: 0,user_id,product_id,rating
0,1,1,5
1,1,2,2
2,1,4,4
3,1,5,5
4,1,7,2


### Process items data

In [10]:
# Data
with open('products.json', 'r') as json_file:
    products = json.load(json_file)

# Initialize the transformer model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Combining product name, description, and category
combined_texts = [f"{prod['name']} {prod['description']} {prod['category']}" for _, prod in products.items()]

# Generate embeddings
product_embeddings = model.encode(combined_texts, convert_to_tensor=True)

# Dimensionality reduction using PCA
pca = PCA(n_components=3)
product_embeddings = pca.fit_transform(product_embeddings)

# items_data = product_embeddings

# # Convert to torch tensor
items_data = torch.tensor(product_embeddings, dtype=torch.float32)

print("items data embeddings:\n", items_data)

items data embeddings:
 tensor([[ 2.1626, -0.8523, -0.9422],
        [-0.2020,  3.4703, -2.3967],
        [-3.5307, -3.0231, -2.2731],
        [ 2.8735, -0.6252, -0.6421],
        [-1.8927,  4.8022,  1.3659],
        [-2.9371, -1.3315, -1.9322],
        [ 0.8515,  1.1805,  0.0867],
        [ 1.0182, -1.3480,  2.9369],
        [-2.4257, -1.3046,  4.0382],
        [ 4.0824, -0.9681, -0.2415]])


### Process users data

In [11]:
# Load CSV data into a Pandas DataFrame
df = pd.read_csv('users.csv')

# Separate numerical and text columns
numerical_cols = ['age']
text_cols = ['gender', 'location', 'occupation', 'preferred_category']

# Normalize numerical columns
scaler = StandardScaler()
numerical_data = scaler.fit_transform(df[numerical_cols])

# Generate embeddings for text columns
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
text_dataframes = []

for col in text_cols:
    embeddings = model.encode(df[col].tolist())
    # Convert to DataFrame
    embeddings_df = pd.DataFrame(embeddings)
    text_dataframes.append(embeddings_df)

text_embeddings = pd.concat(text_dataframes, axis=1)

# Concatenate the embeddings
users_embeddings = pd.concat([pd.DataFrame(numerical_data), text_embeddings], axis=1).values

# Dimensionality reduction using PCA
pca = PCA(n_components=3)
users_embeddings = pca.fit_transform(users_embeddings)
users_data = users_embeddings
# print(users_data)
# Convert to torch tensor
users_data = torch.tensor(users_embeddings, dtype=torch.float32)

print("users data embeddings:\n", users_data)

users data embeddings:
 tensor([[-8.2327,  7.4664, -2.8698],
        [ 3.4268,  2.4652,  9.8292],
        [ 8.7280,  3.4951, -5.0207],
        [-6.3303, -6.2768,  1.0092],
        [ 2.4082, -7.1500, -2.9480]])


### Process interactions data

In [12]:
# Load the .csv file and generate interaction_data matrix:
loaded_interactions = pd.read_csv('interactions.csv')

interaction_data = -1 * np.ones((5, 10))  # Initializing with -1

for index, row in loaded_interactions.iterrows():
    user_idx = int(row['user_id']) - 1
    product_idx = int(row['product_id']) - 1
    interaction_data[user_idx][product_idx] = row['rating']
# print(interaction_data)

# Convert to torch tensor
interaction_data = torch.tensor(interaction_data, dtype=torch.float32)

print("interaction_data:\n", interaction_data)

interaction_data:
 tensor([[ 5.,  2., -1.,  4.,  5., -1.,  2.,  5., -1., -1.],
        [-1.,  3.,  5.,  2.,  1.,  1.,  4., -1., -1.,  2.],
        [ 1.,  2.,  5.,  2., -1.,  1.,  2., -1., -1., -1.],
        [-1.,  2.,  1., -1., -1.,  1., -1., -1., -1., -1.],
        [ 1., -1., -1.,  1.,  1.,  2.,  2.,  1., -1., -1.]])


### Recommendation using hybrid model

In [13]:
recommender = HybridRecommendation(users_data, items_data, interaction_data)
print("Training:")
recommender.train()

scores = recommender.predict()
print("Recommendation scores:\n", scores)

top_2_items = recommender.get_top_k_items(k=2)
print("\nID of recommended items for each user:\n", top_2_items.cpu().numpy())

Training:
Epoch 0, Loss: 14.145489692687988
Epoch 100, Loss: 0.12896539270877838
Epoch 200, Loss: 0.0044252704828977585
Epoch 300, Loss: 0.00042982492595911026
Epoch 400, Loss: 4.7105178964557126e-05
Epoch 500, Loss: 3.713901378432638e-06
Epoch 600, Loss: 2.9000275958424027e-07
Epoch 700, Loss: 3.298319128930416e-08
Epoch 800, Loss: 3.9367047399707644e-09
Epoch 900, Loss: 3.886859167856471e-10
Recommendation scores:
 tensor([[ 5.0000,  2.0000,  2.4408,  4.0000,  5.0000,  4.0755,  2.0000,  5.0000,
          5.0219,  3.6255],
        [ 1.1529,  3.0000,  5.0000,  2.0000,  1.0000,  1.0000,  4.0000,  2.3320,
          2.8729,  2.0000],
        [ 1.0000,  2.0000,  5.0000,  2.0000, -0.3521,  1.0000,  2.0000,  1.9358,
          2.3032,  1.3768],
        [-0.1334,  2.0000,  1.0000,  0.2974,  2.6285,  1.0000,  0.9861,  0.0542,
          1.2022, -0.3527],
        [ 1.0000,  2.1818,  4.5420,  1.0000,  1.0000,  2.0000,  2.0000,  1.0000,
          2.1623,  0.3677]], grad_fn=<AddBackward0>)

ID of re

### Recommendation using deep learning model

In [14]:
recommender = DeepLearningRec(users_data, items_data, interaction_data)
print("Training:")
recommender.train()

scores = recommender.predict()
print("Recommendation scores:\n", scores)

top_2_items = recommender.get_top_k_items(k=2)
print("\nID of recommended items for each user:\n", top_2_items.cpu().numpy())

Training:
Epoch 0, Loss: 21.365793228149414
Epoch 100, Loss: 0.8142966032028198
Epoch 200, Loss: 0.4298228919506073
Epoch 300, Loss: 0.11122187227010727
Epoch 400, Loss: 0.023746013641357422
Epoch 500, Loss: 0.013200347311794758
Epoch 600, Loss: 0.009376028552651405
Epoch 700, Loss: 0.006700543221086264
Epoch 800, Loss: 0.004597093444317579
Epoch 900, Loss: 0.003028180683031678
Recommendation scores:
 tensor([[ 4.9813,  1.9963, 11.2873,  4.0175,  5.0013,  7.7492,  2.0087,  4.9989,
          4.9307,  3.9259],
        [ 1.3779,  3.0050,  4.9914,  1.9766,  1.0058,  1.0109,  3.9945,  2.9357,
          2.2378,  2.0259],
        [ 1.1593,  1.9823,  5.0091,  1.8344,  1.3330,  0.9860,  2.0151,  2.5208,
          1.1959,  1.7373],
        [-1.7715,  2.0055,  1.0026, -1.9283,  1.0683,  1.0005,  0.7005, -0.5119,
          2.9306, -2.4732],
        [ 1.0118,  2.1650,  1.6358,  0.9935,  1.0031,  1.9782,  2.0183,  0.9979,
          1.9357,  0.4662]], grad_fn=<ViewBackward0>)

ID of recommended items

### Recommendation using matrix factorization

In [15]:
recommender = MatrixFactorization(interaction_data)
print("Training:")
recommender.train()

scores = recommender.predict()
print("Recommendation scores:\n", scores)

top_2_items = recommender.get_top_k_items(k=2)
print("\nID of recommended items for each user:\n", top_2_items.cpu().numpy())

Training:
Epoch 0, Loss: 11.612005233764648
Epoch 100, Loss: 5.90090274810791
Epoch 200, Loss: 4.268473148345947
Epoch 300, Loss: 3.240995168685913
Epoch 400, Loss: 2.1425223350524902
Epoch 500, Loss: 1.0853207111358643
Epoch 600, Loss: 0.49128007888793945
Epoch 700, Loss: 0.2604113519191742
Epoch 800, Loss: 0.1662200540304184
Epoch 900, Loss: 0.12038252502679825
Recommendation scores:
 tensor([[ 4.8909,  2.0107,  6.2174,  3.8992,  4.7118,  2.5147,  2.2380,  4.9651,
         -1.3599,  0.2591],
        [ 0.5559,  3.0965,  5.1074,  2.0605,  1.2198,  1.2182,  3.5212,  2.6061,
         -0.4018,  2.0440],
        [ 1.5683,  2.0371,  4.6845,  2.1413,  1.2432,  0.9474,  2.1952,  2.7523,
         -0.6178,  1.2330],
        [-0.1665,  1.7204,  1.0988,  0.6021,  1.8182,  1.3019,  2.2418,  0.6673,
          0.0518,  0.8030],
        [ 0.6673,  1.4354,  1.9185,  1.0509,  1.6894,  1.1203,  1.7670,  1.2872,
         -0.2064,  0.6304]], grad_fn=<MmBackward0>)

ID of recommended items for each user:
 

### Content-based recommendation - base model

In [16]:
recommender = SimpleContentBasedRec(users_data, items_data)
scores = recommender.recommend()

print("Recommendation scores:\n", scores)

top_2_items = scores.argsort(axis=1)[:, -2:]

print("\nID of recommended items for each user:\n", top_2_items)

Recommendation scores:
 [[-0.67773456  0.73958567  0.17048441 -0.71621767  0.75326697  0.41576947
   0.16993964 -0.67771697 -0.12356533 -0.80297084]
 [-0.25819972 -0.41060698 -0.76486256 -0.02905022  0.37299936 -0.82004852
   0.39412501  0.827658    0.6396758   0.14482857]
 [ 0.75146094  0.56348836 -0.43162358  0.78719069 -0.09735894 -0.37935002
   0.67607684 -0.40427074 -0.88273747  0.69880679]
 [-0.35020191 -0.63014362  0.79596883 -0.49901565 -0.40449072  0.67787184
  -0.96960987  0.21246008  0.58390732 -0.47266964]
 [ 0.70329417 -0.4686847   0.54318618  0.54654924 -0.99205672  0.3476067
  -0.60848451  0.04154644 -0.23528554  0.5036761 ]]

ID of recommended items for each user:
 [[1 4]
 [8 7]
 [0 3]
 [5 2]
 [3 0]]


### Content-based Recommendation - BERT model

In [17]:
with open('products.json', 'r') as json_file:
    items_data = json.load(json_file)
users_data = pd.read_csv("users.csv")

# Convert products and users_data to list of concatenated strings
products_data = [f"{product['name']} {product['description']} {product['category']}" for product in items_data.values()]
users_strings = users_data.apply(lambda row: f"{row['gender']} {row['location']} {row['age']} {row['occupation']} {row['preferred_category']}", axis=1).tolist()

recommender = ContentBasedRecommendation(users_strings, products_data)
scores = recommender.recommend()

print("Recommendation scores:\n", scores)

top_2_items = scores.argsort(axis=1)[:, -2:]

print("\nID of recommended items for each user:\n", top_2_items.cpu().numpy())

Recommendation scores:
 tensor([[0.8089, 0.8369, 0.7205, 0.8276, 0.7692, 0.7426, 0.8126, 0.6983, 0.7796,
         0.6849],
        [0.7890, 0.8499, 0.7418, 0.8069, 0.8115, 0.8079, 0.8682, 0.6770, 0.7854,
         0.6931],
        [0.8022, 0.8667, 0.7390, 0.8159, 0.7963, 0.8248, 0.8558, 0.6596, 0.7821,
         0.6759],
        [0.7744, 0.8251, 0.7120, 0.7797, 0.7815, 0.7595, 0.8187, 0.6405, 0.7972,
         0.6290],
        [0.7762, 0.8500, 0.7312, 0.7980, 0.8180, 0.7827, 0.8281, 0.6660, 0.8255,
         0.6763]])

ID of recommended items for each user:
 [[3 1]
 [1 6]
 [6 1]
 [6 1]
 [6 1]]
