In [1]:
!pip install pyspark
!pip install findspark
!pip install "numpy<2.0"
!pip install scikit-surprise



In [ ]:
import pandas as pd
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from surprise import KNNBasic
from surprise.model_selection import GridSearchCV
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from surprise import Dataset, Reader
from surprise import NMF

In [2]:
findspark.init()

In [3]:
spark = SparkSession.builder \
    .appName("rnmp_lab2") \
    .getOrCreate()

In [4]:
movies_df = spark.read\
    .option("delimiter", "|")\
    .option("header", False)\
    .csv("u.item", inferSchema=True)

In [5]:
ratings_df = spark.read \
    .option("delimiter", "\t") \
    .option("header", False) \
    .csv("u.data", inferSchema=True)

In [6]:
movies_df.show(1)

+---+----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|             _c1|        _c2| _c3|                 _c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|
+---+----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  1|Toy Story (1995)|01-Jan-1995|NULL|http://us.imdb.co...|  0|  0|  0|  1|  1|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|
+---+----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+


In [7]:
ratings_df.show(1)

+---+---+---+---------+
|_c0|_c1|_c2|      _c3|
+---+---+---+---------+
|196|242|  3|881250949|
+---+---+---+---------+


In [8]:
movies_df = movies_df.select(
    F.col("_c0").alias("movie_id"),
    F.col("_c1").alias("movie_title")
)
ratings_df = ratings_df.select(
    F.col("_c0").alias("user_id"),
    F.col("_c1").alias("movie_id"),
    F.col("_c2").alias("movie_rating")
)

In [9]:
movies_df.show(1)

+--------+----------------+
|movie_id|     movie_title|
+--------+----------------+
|       1|Toy Story (1995)|
+--------+----------------+


In [10]:
ratings_df.show(1)

+-------+--------+------------+
|user_id|movie_id|movie_rating|
+-------+--------+------------+
|    196|     242|           3|
+-------+--------+------------+


In [11]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=42)

In [12]:
als = ALS(
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="movie_rating",
    coldStartStrategy="drop",
    nonnegative=True
)

In [27]:
mse_evaluator = RegressionEvaluator(
    metricName="mse",
    labelCol="movie_rating",
    predictionCol="prediction"
)
mae_evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="movie_rating",
    predictionCol="prediction"
)

In [14]:
parameter_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20]) \
    .addGrid(als.maxIter, [10, 15]) \
    .build()

In [15]:
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=parameter_grid,
    evaluator=mse_evaluator,
    numFolds=3,
)

In [16]:
cv_model = cv.fit(train_df)

In [17]:
best_als = cv_model.bestModel

In [18]:
print(f"Best als model has rank [{best_als.rank}] , max number of iterations [{best_als._java_obj.parent().getMaxIter()}]",)

Best als model has rank [20] , max number of iterations [15]


In [35]:
final_als = ALS(
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="movie_rating",
    rank = best_als._java_obj.parent().getRank(),
    regParam = 0.1,
    maxIter = best_als._java_obj.parent().getMaxIter(),
    nonnegative=True,
    coldStartStrategy="drop"
)

In [36]:
final_model = final_als.fit(train_df)

In [37]:
predictions = final_model.transform(test_df)
predictions.show(5)

+-------+--------+------------+----------+
|user_id|movie_id|movie_rating|prediction|
+-------+--------+------------+----------+
|    148|       8|           4| 3.8307126|
|    148|      56|           5|  3.792567|
|    148|      71|           5| 3.1500516|
|    148|     133|           5|  3.404283|
|    148|     169|           5|   5.08085|
+-------+--------+------------+----------+


In [39]:
mae = mae_evaluator.evaluate(predictions)
mse = mse_evaluator.evaluate(predictions)
mse, mae

(0.8409848667013615, 0.7267128120397609)

In [40]:
user_recommendations = final_model.recommendForAllUsers(5)
# dava top 5 preporaki za sekoj user
# vrakja user_id | recommendations [{movie_id, rating}]

In [41]:
user_recommendations.show(5, truncate=False)

+-------+--------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                             |
+-------+--------------------------------------------------------------------------------------------+
|1      |[{1449, 5.1228433}, {169, 4.980907}, {408, 4.9762735}, {114, 4.9458604}, {512, 4.911113}]   |
|2      |[{1643, 5.1536236}, {1449, 5.1019306}, {483, 4.7577267}, {648, 4.7416778}, {657, 4.6762094}]|
|3      |[{320, 4.6705847}, {1643, 4.2000694}, {1131, 4.0800204}, {1062, 4.069155}, {1449, 4.066411}]|
|4      |[{1512, 6.241803}, {1449, 5.813253}, {1194, 5.446516}, {745, 5.436035}, {189, 5.349012}]    |
|5      |[{793, 4.659515}, {408, 4.462857}, {50, 4.4311595}, {169, 4.3268924}, {172, 4.3163304}]     |
+-------+--------------------------------------------------------------------------------------------+


In [42]:
user_recommendations = user_recommendations \
    .withColumn("rec", F.explode("recommendations")) \
    .select(
        "user_id",
        "rec.movie_id",
        "rec.rating"
    )

In [43]:
user_recommendations = user_recommendations.join(
    movies_df,
    on="movie_id",
    how="inner"
)

In [44]:
user_recommendations.show(10, truncate=False)

+--------+-------+---------+------------------------------------------------------+
|movie_id|user_id|rating   |movie_title                                           |
+--------+-------+---------+------------------------------------------------------+
|1449    |1      |5.1228433|Pather Panchali (1955)                                |
|169     |1      |4.980907 |Wrong Trousers, The (1993)                            |
|408     |1      |4.9762735|Close Shave, A (1995)                                 |
|114     |1      |4.9458604|Wallace & Gromit: The Best of Aardman Animation (1996)|
|512     |1      |4.911113 |Wings of Desire (1987)                                |
|1643    |2      |5.1536236|Angel Baby (1995)                                     |
|1449    |2      |5.1019306|Pather Panchali (1955)                                |
|483     |2      |4.7577267|Casablanca (1942)                                     |
|648     |2      |4.7416778|Quiet Man, The (1952)                           

KNN

In [13]:
train_pd = train_df.toPandas()
test_pd  = test_df.toPandas()

In [16]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(
    train_pd,
    reader
)

In [15]:
knn_parameter_grid = {
    "k": [20, 40, 60],
}

In [17]:
gs = GridSearchCV(
    KNNBasic,
    knn_parameter_grid,
    measures=["mse"],
    cv=3,
    n_jobs=-1
)
gs.fit(data)

In [18]:
best_knn = gs.best_estimator["mse"]
print("Best K parameter:", gs.best_params["mse"]["k"])

Best K parameter: 40


In [19]:
trainset = data.build_full_trainset()
best_knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x78ca2160b890>

In [20]:
y_true = []
y_pred = []

for _, row in test_pd.iterrows():
    pred = best_knn.predict(row.user_id, row.movie_id).est
    y_pred.append(pred)
    y_true.append(row.movie_rating)

In [23]:
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print("KNN Evaluation Results")
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")

KNN Evaluation Results
MSE: 0.9557
MAE: 0.7705


NMF

In [24]:
nmf_param_grid = {
    "n_factors": [20, 40],
    "reg_pu": [0.06, 0.1],
    "reg_qi": [0.06, 0.1],
}
gs_nmf = GridSearchCV(
    NMF,
    nmf_param_grid,
    measures=["mse"],
    cv=3,
    n_jobs=-1
)
gs_nmf.fit(data)

In [25]:
best_nmf = gs_nmf.best_estimator["mse"]

gs_nmf.best_params["mse"]

{'n_factors': 40, 'reg_pu': 0.1, 'reg_qi': 0.1}

In [26]:
best_nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x78ca0236f2c0>

In [28]:
y_true_nmf = []
y_pred_nmf = []

for _, row in test_pd.iterrows():
    pred = best_nmf.predict(row.user_id, row.movie_id).est

    y_pred_nmf.append(pred)
    y_true_nmf.append(row.movie_rating)

In [30]:
nmf_mse = mean_squared_error(y_true_nmf, y_pred_nmf)
nmf_mae = mean_absolute_error(y_true_nmf, y_pred_nmf)

print("NMF Evaluation Results")
print(f"MSE: {nmf_mse:.4f}")
print(f"MAE: {nmf_mae:.4f}")

NMF Evaluation Results
MSE: 0.8773
MAE: 0.7378
