**Building recommender system using collaborative filtering approach**

The following code:

* Transforms data from transactions_train.csv into a (user, article, number_of_purchases) collection, which is the required form to feed to the ALS model in PySpark.

* Fits the model on the data to produce 10-dimensional feature vectors

* Produces and plots top k recommendations

Input data is limited to 100,000 transactions.

In [None]:
!pip install pyspark

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName('Recommendations')
    .getOrCreate()
)

# Read the transactions CSV, treating its first line as the header.
lines = spark.read.csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    header=True,
)
# Drop the columns the recommender does not use and cap the sample at 100000 rows.
df = lines.drop('sales_channel_id', 'price', 't_dat').limit(100000)

In [None]:
# The ALS model needs integer user/item ids, so start by gathering the
# distinct raw customer ids and article ids that will be mapped to integers.
unique_customers = df.select(df['customer_id']).distinct()
unique_articles = df.select(df['article_id']).distinct()

In [None]:
# Build a list of Row objects pairing each customer id with a unique integer:
# the position of that customer in the collected list of distinct customers.
customer_id_mapping = [
    Row(customer_row['customer_id'], index)
    for index, customer_row in enumerate(unique_customers.collect())
]

In [None]:
# Build a list of Row objects pairing each article id with a unique integer:
# the position of that article in the collected list of distinct articles.

article_id_mapping = [
    Row(article_row['article_id'], position)
    for position, article_row in enumerate(unique_articles.collect())
]

In [None]:
# Lookup table: raw customer id -> integer customer id.
customer_map = spark.createDataFrame(
    customer_id_mapping,
    schema=['customer_id', 'int_customer_id'],
)

In [None]:
# Lookup table: raw article id -> integer article id.
article_map = spark.createDataFrame(
    article_id_mapping,
    schema=['article_id', 'int_article_id'],
)

In [None]:
# Attach the integer ids to every transaction by joining on the raw ids.
map_df = (
    df
    .join(customer_map, on='customer_id')
    .join(article_map, on='article_id')
)

In [None]:
# Preview the transactions together with their integer customer/article ids.
map_df.show()

In [None]:
# Count how many times each customer bought each article. This purchase
# count is treated as the customer's rating of (liking for) that article.

user_item = map_df.groupBy('int_customer_id', 'int_article_id').count()

In [None]:
# Preview five (customer, article, count) rows.
user_item.show(5)

In [None]:
# user_item.write.parquet("./user_item_matrix.parquet")

In [None]:
# Split the (customer, article, count) rows into training and test sets with
# 0.8 / 0.2 weights. The fixed seed makes the split repeatable for the same
# input, so the RMSE reported below can be compared between runs.
training, test = user_item.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit an ALS recommender on the training split, using the purchase count as
# the rating. `rank` is not set, so the ALS default is used.
# coldStartStrategy="drop" ensures we don't get NaN evaluation metrics.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="int_customer_id",
    itemCol="int_article_id",
    ratingCol="count",
    coldStartStrategy="drop",
)
model = als.fit(training)

In [None]:
# Score the held-out test rows and report the RMSE between the predicted
# rating and the actual purchase count.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="count",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

In [None]:
def get_rcmnds(k=6):
    """Return top-k recommended articles and the purchase history of one customer.

    One arbitrary customer is taken from ``user_item`` and the fitted ``model``
    is asked for that customer's ``k`` best articles. Integer ids are mapped
    back to the original article ids through ``article_map``.

    Args:
        k: Number of recommendations to request for the customer.

    Returns:
        A ``(rcmnds, prev_items)`` tuple of arrays of original article ids.
        ``rcmnds`` holds the recommended articles, in whatever order the
        ``article_map`` filter yields them (not ranked by predicted score).
        ``prev_items`` holds the articles the same customer appears with in
        ``user_item``.

    Raises:
        IndexError: If the model returns no recommendations for the selected
            customer.
    """
    # Pick a single customer; distinct().limit(1) has no ordering, so which
    # customer is chosen is arbitrary.
    users = user_item.select(als.getUserCol()).distinct().limit(1)

    # Collect exactly once. Each collect() evaluates the DataFrame again, and
    # with an unordered limit(1) two evaluations are not guaranteed to select
    # the same customer, so the customer id and the recommendations must come
    # from the same collected row.
    rec_rows = model.recommendForUserSubset(users, k).collect()
    if not rec_rows:
        # Presumably happens when the chosen customer has no rows in the
        # training split; keep IndexError so existing handlers still apply.
        raise IndexError("the model returned no recommendations for the selected customer")

    cid, recommendations = rec_rows[0][0], rec_rows[0][1]
    aids = [rec['int_article_id'] for rec in recommendations]

    # Translate the recommended integer ids back to the original article ids.
    rcmnds = article_map.filter(article_map.int_article_id.isin(aids)).toPandas().article_id.values

    # Articles this customer already bought, as integer ids and then as original ids.
    prev_items_int = user_item.filter(user_item.int_customer_id == cid).toPandas().int_article_id.values.tolist()
    prev_items = article_map.filter(article_map.int_article_id.isin(prev_items_int)).toPandas().article_id.values

    return rcmnds, prev_items

In [None]:
# Recommended articles and purchase history for one customer (default k).
rcmnds, prev_items = get_rcmnds()

In [None]:
import matplotlib.pyplot as plt
# Root folder of the article images; the plotting helpers below build each
# image path as <path>/<first 3 characters of the id>/<id>.jpg.
path = "../input/h-and-m-personalized-fashion-recommendations/images"
def plot_prev(prev_items):
    """Show the images of previously purchased articles side by side in one row.

    Args:
        prev_items: Sized iterable of article ids. Each id is converted to
            ``str`` and its image is read from
            ``<path>/<first 3 characters>/<id>.jpg``.
    """
    n_items = len(prev_items)
    figure = plt.figure(figsize=(15, 10))
    for position, article in enumerate(prev_items, start=1):
        article_id = str(article)
        image_file = f"{path}/{article_id[:3]}/{article_id}.jpg"
        pixels = plt.imread(image_file)
        # One row with n_items columns; this article takes the next slot.
        figure.add_subplot(1, n_items, position)
        plt.imshow(pixels)
        
def plot_rcmnd(rcmnds):
    """Show the images of the recommended articles side by side in one row.

    Args:
        rcmnds: Sized iterable of article ids. Each id is converted to
            ``str`` and its image is read from
            ``<path>/<first 3 characters>/<id>.jpg``.
    """
    total = len(rcmnds)
    canvas = plt.figure(figsize=(15, 10))
    for slot, article in enumerate(map(str, rcmnds), start=1):
        location = "/".join([path, article[:3], article + ".jpg"])
        picture = plt.imread(location)
        # One row with `total` columns; this article takes the next slot.
        canvas.add_subplot(1, total, slot)
        plt.imshow(picture)

In [None]:
# Show the articles the selected customer has already purchased.
plot_prev(prev_items)

In [None]:
# Show the articles recommended to the same customer.
plot_rcmnd(rcmnds)