### Description of dataset
* Dataset's name: [Collaborative filtering dataset - dating agency](http://www.occamslab.com/petricek/data/)
* Description: These files contain 17,359,346 anonymous ratings of 168,791 profiles made by 135,359 LibimSeTi users as dumped on April 4, 2006.

### 1. Prepare Dataset

In [1]:
# findspark has to be initialised before the `pyspark` imports in the next cell.
# NOTE(review): presumably required because pyspark is not on sys.path in this
# environment and a local Spark installation must be discovered — confirm.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
# Configure the application name, then create the session (or reuse the one
# that is already active in this kernel).
session_builder = SparkSession.builder.appName("Recommendation Systems")
spark = session_builder.getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000251BDC72D68>


In [3]:
# Location of the LibimSeTi ratings CSV. The original absolute path only exists
# on the author's machine, so it can be overridden through the
# LIBIMSETI_RATINGS_CSV environment variable; the historical location remains
# the default, so existing setups keep working unchanged.
RATINGS_CSV = os.environ.get(
    "LIBIMSETI_RATINGS_CSV",
    "D://Documents//Semester_8//Apache-Spark//libimseti-complete//libimseti//ratings.csv",
)

# No schema is supplied, so every column is loaded as a string (see the schema
# printed by this cell); the columns are cast to integers in the following cell.
ratings = spark.read.csv(RATINGS_CSV, header=True)
ratings.printSchema()

root
 |-- UserID: string (nullable = true)
 |-- ProfileID: string (nullable = true)
 |-- Rating: string (nullable = true)



In [4]:
# Cast each raw (string-typed) column to an integer column of the same name.
int_columns = [ratings[name].cast("int") for name in ("UserID", "ProfileID", "Rating")]
df = ratings.select(*int_columns)

# Preview the typed data and confirm the new schema.
df.show()
df.printSchema()

# Register the typed ratings for Spark SQL queries under the view name "rating".
df.createOrReplaceTempView("rating")

+------+---------+------+
|UserID|ProfileID|Rating|
+------+---------+------+
|     1|      133|     8|
|     1|      720|     6|
|     1|      971|    10|
|     1|     1095|     7|
|     1|     1616|    10|
|     1|     1978|     7|
|     1|     2145|     8|
|     1|     2211|     8|
|     1|     3751|     7|
|     1|     4062|     3|
|     1|     4633|    10|
|     1|     4842|     5|
|     1|     6518|     6|
|     1|     7576|     8|
|     1|     7724|     7|
|     1|     8305|    10|
|     1|     8923|     9|
|     1|     9345|    10|
|     1|     9729|    10|
|     1|    10148|    10|
+------+---------+------+
only showing top 20 rows

root
 |-- UserID: integer (nullable = true)
 |-- ProfileID: integer (nullable = true)
 |-- Rating: integer (nullable = true)



In [5]:
# Per-profile summary of the "rating" view: number of ratings received, lowest
# and highest rating, and the mean rating rounded to two decimals, listed from
# the most-rated profile downwards.
# The statement text (including its interior spacing) is kept exactly as before.
profile_summary_query = (
    "select ProfileID, count(Rating) as total_review, min(Rating) as min_rating, "
    "max(Rating) as max_rating, round(avg(rating),2) as rating "
    "        from rating "
    "        group by ProfileID "
    "        order by total_review desc"
)
profile_summary = spark.sql(profile_summary_query)
profile_summary.show()

+---------+------------+----------+----------+------+
|ProfileID|total_review|min_rating|max_rating|rating|
+---------+------------+----------+----------+------+
|   156148|       33389|         6|        10|  10.0|
|    31116|       28398|         1|        10|  7.79|
|   193687|       23649|         1|        10|   8.3|
|   121859|       23639|         1|        10|   9.5|
|    83773|       23113|         1|        10|  6.27|
|    22319|       21387|         1|        10|  9.92|
|    71636|       21284|         5|        10|  9.98|
|    89855|       20634|         1|        10|  5.95|
|    20737|       18550|         2|        10|  8.23|
|   162707|       18224|         1|        10|  4.11|
|    68989|       16591|         1|        10|  5.94|
|    60983|       16253|         1|        10|  5.91|
|    33216|       16049|         1|        10|  8.55|
|   179192|       15620|         1|        10|  7.41|
|   194553|       14432|         3|        10|  6.15|
|   192666|       14335|    

### 2. ALS (Alternating Least Squares) Algorithm

In [6]:
# Hold out roughly 20% of the ratings for evaluation. A fixed seed makes the
# sampling repeatable, so the RMSE computed later can be compared between runs
# instead of changing on every kernel restart.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Configure the ALS estimator on the (UserID, ProfileID, Rating) columns.
# - seed: ALS starts from randomly initialised factors; pinning the seed makes
#   the fitted model and the recommendations derived from it reproducible.
# - coldStartStrategy="drop": rows that would get a NaN prediction (user or
#   profile without learned factors) are dropped, so the RMSE stays defined.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="UserID",
    itemCol="ProfileID",
    ratingCol="Rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [8]:
# Score the held-out ratings with the fitted model.
predictions = model.transform(test)

# Compare the "prediction" column against the true "Rating" using RMSE.
evaluator = RegressionEvaluator(
    labelCol="Rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.834707247020203


In [9]:
# For every user known to the model, compute the 10 highest-scoring profiles,
# then preview the first rows of the result.
userRecs = model.recommendForAllUsers(numItems=10)
userRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   148|[[181289, 15.7552...|
|   463|[[183834, 15.0398...|
|   471|[[174257, 14.7062...|
|   496|[[217489, 18.1424...|
|   833|[[51448, 16.36873...|
|  1088|[[43288, 16.83376...|
|  1238|[[107284, 15.2142...|
|  1342|[[157929, 14.4738...|
|  1580|[[209453, 16.8693...|
|  1591|[[58307, 13.34065...|
|  1645|[[137862, 14.8699...|
|  1829|[[211064, 16.5125...|
|  1959|[[51448, 14.06783...|
|  2122|[[94841, 14.61345...|
|  2142|[[107990, 14.5996...|
|  2366|[[188024, 14.5479...|
|  2659|[[183834, 15.2326...|
|  2866|[[183834, 14.2354...|
|  3175|[[46427, 20.15452...|
|  3749|[[183834, 16.2350...|
+------+--------------------+
only showing top 20 rows



In [10]:
# For every profile known to the model, compute the 10 highest-scoring users,
# then preview the first rows of the result.
profileRecs = model.recommendForAllItems(numUsers=10)
profileRecs.show()

+---------+--------------------+
|ProfileID|     recommendations|
+---------+--------------------+
|      496|[[78113, 15.30917...|
|      833|[[29513, 13.38827...|
|     1238|[[89473, 12.41046...|
|     1580|[[28469, 14.64806...|
|     1591|[[122019, 11.1961...|
|     1645|[[101100, 19.1631...|
|     1829|[[68655, 12.36113...|
|     1959|[[22638, 8.990044...|
|     2122|[[29112, 10.41686...|
|     2366|[[69236, 9.455493...|
|     2659|[[119676, 21.1902...|
|     2866|[[68655, 12.06060...|
|     3175|[[12857, 15.51095...|
|     3749|[[6447, 18.74696]...|
|     3794|[[123696, 14.5935...|
|     4101|[[83781, 16.57009...|
|     4900|[[44785, 14.63550...|
|     4935|[[20625, 11.58235...|
|     5300|[[69236, 15.72739...|
|     5518|[[72215, 9.627571...|
+---------+--------------------+
only showing top 20 rows



In [11]:
# Generate top 10 profile recommendations for a specified set of users.
# The IDs are taken from `df` (integer-typed, the data the model was fitted on)
# rather than from the raw string-typed `ratings` frame, so matching them
# against the model does not depend on an implicit string-to-integer comparison.
# Note: distinct().limit(3) picks an arbitrary three users, not a fixed set.
users = df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [12]:
# Generate top 10 user recommendations for a specified set of profiles.
# The IDs are taken from `df` (integer-typed, the data the model was fitted on)
# rather than from the raw string-typed `ratings` frame, so matching them
# against the model does not depend on an implicit string-to-integer comparison.
# Note: distinct().limit(3) picks an arbitrary three profiles, not a fixed set.
profile = df.select(als.getItemCol()).distinct().limit(3)
profileSubsetRecs = model.recommendForItemSubset(profile, 10)

In [13]:
# Display the recommendations computed for the sampled users.
userSubsetRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   296|[[107284, 18.1602...|
|   675|[[132059, 15.3977...|
|   467|[[48924, 17.63759...|
+------+--------------------+



In [14]:
# Display the recommendations computed for the sampled profiles.
profileSubsetRecs.show()

+---------+--------------------+
|ProfileID|     recommendations|
+---------+--------------------+
|    93893|[[106774, 12.7821...|
|    59318|[[95019, 11.38647...|
|   204088|[[8800, 10.898197...|
+---------+--------------------+



### 3. References

https://spark.apache.org/docs/latest/ml-collaborative-filtering.html