### Description of dataset
* Dataset's name: [Collaborative filtering dataset - dating agency](http://www.occamslab.com/petricek/data/)
* Description: These files contain 17,359,346 anonymous ratings of 168,791 profiles made by 135,359 LibimSeTi users as dumped on April 4, 2006.

### 1. Prepare Dataset

In [1]:
import findspark
findspark.init()

In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
spark = SparkSession \
    .builder \
    .appName("Recommendation Systems") \
    .getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000166E9722D30>


In [3]:
ratings = spark.read.csv("D://Documents//Semester_8//Apache-Spark//libimseti-complete//libimseti//ratings.csv", header=True)
ratings.printSchema()

root
 |-- UserID: string (nullable = true)
 |-- ProfileID: string (nullable = true)
 |-- Rating: string (nullable = true)



In [4]:
df = ratings.select(ratings.UserID.cast("int"),
                   ratings.ProfileID.cast("int"),
                   ratings.Rating.cast("int"))
df.show()
df.printSchema()
df.createOrReplaceTempView("rating")

+------+---------+------+
|UserID|ProfileID|Rating|
+------+---------+------+
|     1|      133|     8|
|     1|      720|     6|
|     1|      971|    10|
|     1|     1095|     7|
|     1|     1616|    10|
|     1|     1978|     7|
|     1|     2145|     8|
|     1|     2211|     8|
|     1|     3751|     7|
|     1|     4062|     3|
|     1|     4633|    10|
|     1|     4842|     5|
|     1|     6518|     6|
|     1|     7576|     8|
|     1|     7724|     7|
|     1|     8305|    10|
|     1|     8923|     9|
|     1|     9345|    10|
|     1|     9729|    10|
|     1|    10148|    10|
+------+---------+------+
only showing top 20 rows

root
 |-- UserID: integer (nullable = true)
 |-- ProfileID: integer (nullable = true)
 |-- Rating: integer (nullable = true)



In [7]:
spark.sql("select ProfileID, count(Rating) as total_review, min(Rating) as min_rating, max(Rating) as max_rating, round(avg(rating),2) as rating \
        from rating \
        group by ProfileID \
        order by total_review desc").show()

+---------+------------+----------+----------+------+
|ProfileID|total_review|min_rating|max_rating|rating|
+---------+------------+----------+----------+------+
|   156148|       33389|         6|        10|  10.0|
|    31116|       28398|         1|        10|  7.79|
|   193687|       23649|         1|        10|   8.3|
|   121859|       23639|         1|        10|   9.5|
|    83773|       23113|         1|        10|  6.27|
|    22319|       21387|         1|        10|  9.92|
|    71636|       21284|         5|        10|  9.98|
|    89855|       20634|         1|        10|  5.95|
|    20737|       18550|         2|        10|  8.23|
|   162707|       18224|         1|        10|  4.11|
|    68989|       16591|         1|        10|  5.94|
|    60983|       16253|         1|        10|  5.91|
|    33216|       16049|         1|        10|  8.55|
|   179192|       15620|         1|        10|  7.41|
|   194553|       14432|         3|        10|  6.15|
|   192666|       14335|    

### 2. ALS (Alternating Least Squares) Algorithm

In [9]:
(training, test) = df.randomSplit([0.8, 0.2])

In [17]:
als = ALS(maxIter=10, regParam=0.1, userCol="UserID", itemCol="ProfileID", ratingCol="Rating", coldStartStrategy="drop")
model = als.fit(training)

In [18]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.8325456808086027


In [19]:
# Generate top 10 profile recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   148|[[197855, 16.6731...|
|   463|[[132059, 16.2649...|
|   471|[[68295, 15.04343...|
|   496|[[15487, 17.57094...|
|   833|[[184022, 15.1616...|
|  1088|[[36880, 21.67093...|
|  1238|[[119519, 15.0903...|
|  1342|[[86379, 14.51208...|
|  1580|[[15487, 16.46562...|
|  1591|[[15487, 14.51271...|
|  1645|[[216709, 14.1595...|
|  1829|[[117973, 19.7139...|
|  1959|[[12391, 15.97196...|
|  2122|[[182214, 15.5431...|
|  2142|[[188457, 15.5643...|
|  2366|[[15487, 15.51381...|
|  2659|[[178022, 15.2394...|
|  2866|[[15487, 15.49009...|
|  3175|[[56724, 15.31058...|
|  3749|[[68295, 14.12433...|
+------+--------------------+
only showing top 20 rows



In [20]:
# Generate top 10 user recommendations for each profile
profileRecs = model.recommendForAllItems(10)
profileRecs.show()

+---------+--------------------+
|ProfileID|     recommendations|
+---------+--------------------+
|      496|[[38464, 15.65150...|
|      833|[[76155, 11.48229...|
|     1238|[[112652, 11.7947...|
|     1580|[[31242, 15.84751...|
|     1591|[[70289, 9.904825...|
|     1645|[[82865, 16.65724...|
|     1829|[[31019, 15.42304...|
|     1959|[[88809, 11.30611...|
|     2122|[[105939, 10.5938...|
|     2366|[[26871, 8.984773...|
|     2659|[[85332, 13.95647...|
|     2866|[[91397, 11.24922...|
|     3175|[[51565, 18.85792...|
|     3749|[[85332, 18.5251]...|
|     3794|[[112652, 12.4217...|
|     4101|[[75614, 17.13718...|
|     4900|[[63056, 14.40250...|
|     4935|[[85332, 11.49072...|
|     5300|[[122307, 15.0731...|
|     5518|[[92606, 11.26908...|
+---------+--------------------+
only showing top 20 rows



In [25]:
# Generate top 10 profile recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [30]:
# Generate top 10 user recommendations for a specified set of profile
profile = ratings.select(als.getItemCol()).distinct().limit(3)
profileSubsetRecs = model.recommendForItemSubset(profile, 10)

In [27]:
userSubsetRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   296|[[188024, 16.7914...|
|   675|[[15487, 14.59643...|
|   467|[[189065, 14.2386...|
+------+--------------------+



In [31]:
profileSubsetRecs.show()

+---------+--------------------+
|ProfileID|     recommendations|
+---------+--------------------+
|    93893|[[38464, 13.18778...|
|    59318|[[106843, 12.0248...|
|   204088|[[58132, 11.21015...|
+---------+--------------------+

