In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# Load the raw ratings CSV into a pandas DataFrame.
# Column labels are supplied explicitly through `names`.
column_names = ["user_id", "product_id", "rating", "timestamp"]

# NOTE: despite its name, `csv_dir` holds the full path to the CSV file
# (not a directory). Adjacent string literals are joined by the parser.
csv_dir = (
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/Distributed_ML_with_PySpark/"
    "Python_Own_Files/Chapter 13/data/amazon_product_ratings.csv"
)

pandas_df = pd.read_csv(csv_dir, names=column_names)

In [None]:
# explore the data: shape, columns, a sample of rows, the distinct rating
# values and the column dtypes
print(pandas_df.shape)
print(pandas_df.columns)
print(pandas_df.head())
sorted_unique_ratings = np.sort(pandas_df["rating"].unique())
print(sorted_unique_ratings)
print(pandas_df.dtypes)

# Count the number of ratings per user ONCE. Both rankings below are just
# different sort orders of this frame, so the group-by over the full
# ratings table does not need to be repeated.
ratings_per_user = (pandas_df
                    .groupby("user_id")
                    .size()
                    .reset_index(name="count"))

# users with the most ratings
pandas_results_desc = ratings_per_user.sort_values(by="count",
                                                   ascending=False)
print(pandas_results_desc.head(5))

# users with the fewest ratings
pandas_results_asc = ratings_per_user.sort_values(by="count",
                                                  ascending=True)
print(pandas_results_asc.head(5))

(82677131, 4)
Index(['user_id', 'product_id', 'rating', 'timestamp'], dtype='object')
          user_id  product_id  rating   timestamp
0  A3AF8FFZAZYNE5  0000000078     5.0  1092182400
1   AH2L9G3DQHHAJ  0000000116     4.0  1019865600
2  A2IIIDRK3PRRZY  0000000116     1.0  1395619200
3  A1TADCM7YWPQ8M  0000000868     4.0  1031702400
4   AWGH7V0BDOJKB  0000013714     4.0  1383177600
[1. 2. 3. 4. 5.]
user_id        object
product_id     object
rating        float64
timestamp       int64
dtype: object


In [None]:
# recommender system: fit an SVD model on the (user, product, rating)
# triples and report its RMSE on a held-out 20% test split.

# Fixed seed so that the train/test split and the SVD factor
# initialisation -- and therefore the reported RMSE -- are reproducible
# from one run to the next.
RANDOM_SEED = 42

# define rating scale (ratings in the data range from 1 to 5)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    pandas_df[["user_id", "product_id", "rating"]],
    reader)

# split data (80% train / 20% test)
trainset, testset = train_test_split(data, test_size=0.2,
                                     random_state=RANDOM_SEED)

# model
model = SVD(random_state=RANDOM_SEED)

# train
model.fit(trainset)

# evaluate
Predictions = model.test(testset)
# rmse() prints the score itself by default; silence that so the value
# is reported only once, by the formatted print below.
rmse_score = rmse(Predictions, verbose=False)
print(f"RMSE score: {rmse_score}")

# show some cases: user id, item id, true rating and the model estimate
for prediction in Predictions[:5]:
    rounded_prediction = round(prediction.est, 1)
    print(f"{prediction.uid}, {prediction.iid}, "
          f"Actual rating: {prediction.r_ui}, "
          f"Predicted rating: {rounded_prediction}")