In [None]:
pip install pyspark

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from sklearn import preprocessing

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# /kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv
# /kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv
# /kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv
# /kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv

In [None]:
articles = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")

In [None]:
articles.head(1)

In [None]:
%%time
transactions_train = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
transactions_train.head(1)

In [None]:
dff = articles.merge(transactions_train,on="article_id")

In [None]:
dff.tail(1)

In [None]:
# Create the PySpark SparkSession (or reuse an existing one), running locally
# with master "local[1]".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("fashionproject")
    .getOrCreate()
)


In [None]:
# Taking only a few records due to Kaggle memory constraints.
# Only the three columns used by the recommender are kept. The explicit .copy()
# detaches the subset from dff, so the later in-place assignment to
# data['customer_id'] modifies an independent frame and does not raise
# pandas' SettingWithCopyWarning.
data = dff.head(1000)[['customer_id', 'article_id', 'price']].copy()

In [None]:
le = preprocessing.LabelEncoder()

In [None]:
data['customer_id']= le.fit_transform(data['customer_id'])

In [None]:
data.sample()

In [None]:
#Create PySpark DataFrame from Pandas
sparkDF=spark.createDataFrame(data) 

In [None]:
sparkDF.printSchema()

In [None]:
# Seeded 70/30 random split of the Spark DataFrame into training and test sets.
random_state = 0
training_size, test_size = 0.7, 0.3
training, testing = sparkDF.randomSplit(
    weights=[training_size, test_size],
    seed=random_state,
)

In [None]:
# Settings for the ALS recommendation model fitted on the training data.
# Iteration count and regularization strength.
maxIter, regParam = 5, 0.01
# Columns used as user, item and rating respectively.
userCol, itemCol, ratingCol = "customer_id", "article_id", "price"
# "drop" removes rows whose prediction is NaN from the transformed output.
coldStartStrategy = "drop"

In [None]:
# References:
#   https://spark.apache.org/docs/3.0.0/ml-collaborative-filtering.html
#   https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.recommendation.ALS
# ALS attempts to estimate the ratings matrix R as the product of two lower-rank
# "factor" matrices, X and Y, i.e. X * Yt = R. The approach is iterative: during
# each iteration one factor matrix is held constant while the other is solved for
# using least squares; the newly solved matrix is then held constant while
# solving for the other one.
#
# The seed is fixed to random_state (the value already used for the train/test
# split) so that the fitted model and the RMSE computed from it are repeatable;
# without an explicit seed the result can differ between kernel sessions.
als = ALS(
    maxIter=maxIter,
    regParam=regParam,
    userCol=userCol,
    itemCol=itemCol,
    ratingCol=ratingCol,
    coldStartStrategy=coldStartStrategy,
    seed=random_state,
)
model = als.fit(training)

In [None]:
# Model evaluation: score the test split and compute the RMSE between the
# "prediction" column and the rating column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol=ratingCol,
    metricName="rmse",
)
predictions = model.transform(testing)
rmse = evaluator.evaluate(predictions)

In [None]:
predictions.show(3)

In [None]:
rmse