In [1]:
from sklearn.model_selection import train_test_split
import json
from surprise import SVD, Reader, Dataset, accuracy, SVDpp, NMF, CoClustering, SlopeOne, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
import pandas as pd

In [2]:
def load_json_lines(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is what open() falls back to when none is given.
    with open(path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file]


# Each dataset file is parsed line by line into a list of decoded records.
business = load_json_lines("../dataset/business_all.json")
checkin = load_json_lines("../dataset/checkin_all.json")
train_review = load_json_lines("../dataset/review_train.json")
test_review = load_json_lines("../dataset/review_test.json")
user = load_json_lines("../dataset/user_all.json")
        

In [10]:
# Number of records parsed from review_train.json.
len(train_review)

183925

In [11]:
# Number of records parsed from review_test.json.
len(test_review)

45982

In [3]:
# Share of review_train.json used for fitting; the remainder is held out
# for validation.
TRAIN_FRACTION = 0.8


def to_rating_triples(reviews):
    """Convert review records into (user_id, business_id, stars) tuples.

    Args:
        reviews: Iterable of dicts with "user_id", "business_id" and "stars".

    Returns:
        A list of (user_id, business_id, stars) tuples, with stars cast to int.

    Raises:
        KeyError: If a record lacks one of the three keys.
    """
    return [
        (review["user_id"], review["business_id"], int(review["stars"]))
        for review in reviews
    ]


# Derive the cut from the data size instead of hard-coding it; for the
# 183,925 loaded training reviews this gives the previous index, 147,140.
split_index = int(len(train_review) * TRAIN_FRACTION)

train_stars = to_rating_triples(train_review[:split_index])
valid_stars = to_rating_triples(train_review[split_index:])
test_stars = to_rating_triples(test_review)

In [4]:
# Column layout shared by the three rating frames.
rating_columns = ['user', 'item', 'rating']
reader = Reader(rating_scale=(1, 5))

# Training split.
df_train = pd.DataFrame(train_stars, columns=rating_columns)
train_data = Dataset.load_from_df(df_train[rating_columns], reader)
trainset = train_data.build_full_trainset()

# Validation split. It is wrapped as a full trainset as well; the evaluation
# cells call .build_testset() on it to obtain the ratings to score.
df_valid = pd.DataFrame(valid_stars, columns=rating_columns)
valid_data = Dataset.load_from_df(df_valid[rating_columns], reader)
validset = valid_data.build_full_trainset()

# Test split, handled the same way as the validation split.
df_test = pd.DataFrame(test_stars, columns=rating_columns)
test_data = Dataset.load_from_df(df_test[rating_columns], reader)
testset = test_data.build_full_trainset()

In [6]:
# SVD: fit on the training split, then score on the validation split.

svd_params = dict(
    n_factors=2,
    n_epochs=28,
    init_mean=0.03,
    init_std_dev=0.03,
    lr_all=0.005,
    reg_all=0.12,
    random_state=26,
)
model_SVD = SVD(**svd_params)
model_SVD.fit(trainset)

svd_valid_ratings = validset.build_testset()
predictions = model_SVD.test(svd_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

MSE: 1.2547
Valid MSE: 1.2547242021499463


In [7]:
# Score the fitted SVD model on the test split.
svd_test_ratings = testset.build_testset()
predictions = model_SVD.test(svd_test_ratings)
test_mse = accuracy.mse(predictions)
print(f"Test MSE: {test_mse}")

MSE: 1.2592
Test MSE: 1.259187322875312


In [13]:
# SVDpp: fit on the training split, then score on the validation split.

svdpp_params = dict(
    n_factors=2,
    n_epochs=21,
    lr_all=0.007,
    reg_all=0.13,
    random_state=26,
)
model_SVDpp = SVDpp(**svdpp_params)
model_SVDpp.fit(trainset)

svdpp_valid_ratings = validset.build_testset()
predictions = model_SVDpp.test(svdpp_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

MSE: 1.2549
Valid MSE: 1.2549134720016684


In [14]:
# Score the fitted SVDpp model on the test split.
svdpp_test_ratings = testset.build_testset()
predictions = model_SVDpp.test(svdpp_test_ratings)
test_mse = accuracy.mse(predictions)
print(f"Test MSE: {test_mse}")

MSE: 1.2590
Test MSE: 1.2590122759734572


In [11]:
# Pull the learned bias vectors off the fitted SVDpp model.
# NOTE(review): positions in these arrays are presumably Surprise inner ids
# (assigned when the trainset was built), not raw dataset ids - confirm.
user_biases, item_biases = model_SVDpp.bu, model_SVDpp.bi

# Preview the first five entries of each vector.
print("User biases:")
for row in range(5):
    user_bias = user_biases[row]
    print(f"User {row}: {user_bias}")

print("\nItem biases:")
for row in range(5):
    item_bias = item_biases[row]
    print(f"Item {row}: {item_bias}")

User biases:
User 0: -0.6247914639124159
User 1: 0.13909115485849916
User 2: 0.14793959195487555
User 3: -0.7417433378790887
User 4: -0.007035942887825003

Item biases:
Item 0: -0.45972516989030804
Item 1: -0.0629449714345564
Item 2: 0.7463388813468614
Item 3: -0.7625228058100991
Item 4: 0.22178405237977497


In [15]:
# Pull the learned factor matrices off the fitted SVDpp model.
user_latent_factors, item_latent_factors = model_SVDpp.pu, model_SVDpp.qi

# Preview the first five rows of each matrix.
print("\nUser latent factors:")
for row in range(5):
    user_vector = user_latent_factors[row]
    print(f"User {row}: {user_vector}")

print("\nItem latent factors:")
for row in range(5):
    item_vector = item_latent_factors[row]
    print(f"Item {row}: {item_vector}")


User latent factors:
User 0: [ 0.01225726 -0.24325175]
User 1: [-0.02153202  0.08309185]
User 2: [-0.10290619  0.00197935]
User 3: [-0.06932579 -0.1581388 ]
User 4: [-0.05020979  0.04131654]

Item latent factors:
Item 0: [0.09126886 0.16681987]
Item 1: [-0.06693861 -0.08751041]
Item 2: [ 0.03861003 -0.05302889]
Item 3: [-0.01715328  0.16288534]
Item 4: [-0.0025778   0.08058914]


In [156]:
# NMF: fit on the training split, then score on the validation split.

nmf_params = dict(
    n_factors=30,
    n_epochs=50,
    init_low=0.05,
    init_high=0.79,
    random_state=26,
)
model_NMF = NMF(**nmf_params)
model_NMF.fit(trainset)

nmf_valid_ratings = validset.build_testset()
predictions = model_NMF.test(nmf_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

MSE: 1.5599
Valid MSE: 1.5598727354018092


In [157]:
# Score the fitted NMF model on the test split.
nmf_test_ratings = testset.build_testset()
predictions = model_NMF.test(nmf_test_ratings)
test_mse = accuracy.mse(predictions)
print(f"Test MSE: {test_mse}")

MSE: 1.5596
Test MSE: 1.5595668417370865


In [160]:
# SlopeOne: tried but not kept (marked "bad"; the reason was not recorded).

In [170]:
# CoClustering: tried but not kept (marked "bad"; the reason was not recorded).

In [None]:
# KNNBasic with library-default settings: fit on the training split, then
# score on the validation split.

model_KNNBasic = KNNBasic()
model_KNNBasic.fit(trainset)

knn_basic_valid_ratings = validset.build_testset()
predictions = model_KNNBasic.test(knn_basic_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

Computing the msd similarity matrix...
Done computing similarity matrix.
MSE: 1.5079
Valid MSE: 1.5079319252469106


In [175]:
# Score the fitted KNNBasic model on the test split.
knn_basic_test_ratings = testset.build_testset()
predictions = model_KNNBasic.test(knn_basic_test_ratings)
test_mse = accuracy.mse(predictions)
print(f"Test MSE: {test_mse}")

MSE: 1.5260
Test MSE: 1.526035115539857


In [178]:
# KNNWithMeans with library-default settings: fit on the training split, then
# score on the validation split.

model_KNNWithMeans = KNNWithMeans()
model_KNNWithMeans.fit(trainset)

knn_means_valid_ratings = validset.build_testset()
predictions = model_KNNWithMeans.test(knn_means_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

Computing the msd similarity matrix...
Done computing similarity matrix.
MSE: 1.6005
Valid MSE: 1.6004530213477965


In [179]:
# KNNWithZScore with library-default settings: fit on the training split, then
# score on the validation split.

model_KNNWithZScore = KNNWithZScore()
model_KNNWithZScore.fit(trainset)

knn_zscore_valid_ratings = validset.build_testset()
predictions = model_KNNWithZScore.test(knn_zscore_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

Computing the msd similarity matrix...
Done computing similarity matrix.
MSE: 1.6114
Valid MSE: 1.6114392412185545


In [9]:
# KNNBaseline with k=50: fit on the training split, then score on the
# validation split.

knn_baseline_params = dict(k=50)
model_KNNBaseline = KNNBaseline(**knn_baseline_params)
model_KNNBaseline.fit(trainset)

knn_baseline_valid_ratings = validset.build_testset()
predictions = model_KNNBaseline.test(knn_baseline_valid_ratings)
# accuracy.mse prints its own "MSE: ..." line in addition to returning the value.
valid_mse = accuracy.mse(predictions)
print(f"Valid MSE: {valid_mse}")

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
MSE: 1.3824
Valid MSE: 1.382401204098977
