In [None]:
!pip install pyspark

# Import Data

In [None]:
import pandas as pd
# Load the three competition tables into pandas.
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv'
)
transactions = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
)
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
)

In [None]:
# Load the sample submission file and display it.
sample = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
sample

In [None]:
# Print a summary of the transactions frame.
transactions.info()

In [None]:
# Report the first and last transaction date present in the data.
first_day = transactions.t_dat.min()
last_day = transactions.t_dat.max()
print('min date: {}, max date: {}'.format(first_day, last_day))

In [None]:
# Keep a single day of transactions, restricted to sales channel 2 (online).
is_target_day = transactions['t_dat'] == '2020-09-21'
is_online = transactions['sales_channel_id'] == 2
ts_1d = transactions.loc[is_target_day & is_online]
ts_1d

In [None]:
# Print a summary of the customers frame.
customers.info()

# Data Preparation

### Create a Spark DataFrame from the pandas DataFrame

In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session on a local master with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Recommendation")
    .getOrCreate()
)

# Hand the one-day pandas slice to Spark, then inspect its schema and first rows.
df = spark.createDataFrame(ts_1d)
df.printSchema()
df.show()

In [None]:
# Number of transaction rows per customer in the selected one-day slice.
df1 = df.groupBy('customer_id').count()
df1.show()

In [None]:
# Purchase count per (customer, article) pair; the resulting "count" column
# is what the ALS model below uses as its rating column.
df2 = df.groupBy('customer_id', 'article_id').count()
df2.show()

### Convert string ids to integer indices starting from 0

In [None]:
from pyspark.ml.feature import StringIndexer

def str2int(df, col_name):
    """Append a ``<col>_index`` column for each requested column.

    Every column is encoded by its own ``StringIndexer`` fitted on ``df``;
    the indexers are applied one after another, each adding one column.

    Args:
        df: DataFrame holding the columns to index (a Spark DataFrame in
            this notebook).
        col_name: A single column name, or an iterable of column names.

    Returns:
        The DataFrame with one extra ``<col>_index`` column per input
        column. It is returned unchanged when ``col_name`` is empty.
    """
    # A bare string would otherwise be iterated character by character,
    # producing one bogus indexer per letter.
    columns = [col_name] if isinstance(col_name, str) else col_name
    for name in columns:
        indexer = StringIndexer(inputCol=name, outputCol=name + "_index")
        df = indexer.fit(df).transform(df)
    return df

# Index both id columns of the per-pair count table and preview the result.
str2int_col = ['customer_id', 'article_id']
df_idx = str2int(df=df2, col_name=str2int_col)
df_idx.show()

### split the modelling dataset into training and testing sets 

In [None]:
# 80/20 split with a fixed seed so the split, and every metric computed on it,
# is repeatable across kernel restarts.
(training, test) = df_idx.randomSplit([0.8, 0.2], seed=42)

# Data Modelling using ALS

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# ALS on the indexed ids, using the per-pair purchase count as the rating.
# coldStartStrategy="drop" removes rows whose prediction would be NaN, so RMSE
# stays defined. The seed fixes the factor initialisation for repeatable runs.
als = ALS(
    userCol="customer_id_index",
    itemCol="article_id_index",
    ratingCol="count",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)

# Hyper-parameter grid: 3 ranks x 3 iteration counts x 3 regularisation values.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [15, 20, 25])
    .addGrid(als.maxIter, [5, 10, 15])
    .addGrid(als.regParam, [0.09, 0.14, 0.19])
    .build()
)

# RMSE between the observed count and the model prediction.
evaluator = RegressionEvaluator(metricName="rmse", labelCol='count', predictionCol='prediction')

# 3-fold cross validation over the grid; the seed fixes the fold assignment.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

# Fit every grid point on the training split and keep the best model.
model = cv.fit(training)


In [None]:
# The fitted cross-validator exposes the winning ALS model.
best_model = model.bestModel

# Score the held-out split with the same RMSE evaluator used during tuning.
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

# NOTE: maxIter and regParam are read through the private `_java_obj` handle
# of the fitted model.
best_estimator = best_model._java_obj.parent()

# Report the metric and the selected hyper-parameters.
print("RMSE =" + str(rmse))
print("**Best Model**")
print("Rank : {}".format(best_model.rank))
print("MaxIter: {}".format(best_estimator.getMaxIter()))
print("RegParam: {}".format(best_estimator.getRegParam()))

In [None]:
# Preview the best model's predictions on the test split.
predictions.show()

In [None]:
# Ten recommendations for every user known to the model; preview ten rows.
df_recom = best_model.recommendForAllUsers(numItems=10)
df_recom.show(n=10)

In [None]:
# Keep the user index and the list of recommended article indices,
# preview ten rows, then pull the result into pandas.
recom_spark = df_recom.select("customer_id_index", "recommendations.article_id_index")
recom_spark.show(10)
df_recom = recom_spark.toPandas()

In [None]:
# Display the recommendations ordered by customer index (the result is not assigned).
df_recom.sort_values('customer_id_index')

In [None]:
# Lookup table between the original ids and their indices, as a pandas frame.
md = df_idx.select(
    'article_id', 'article_id_index', 'customer_id', 'customer_id_index'
).toPandas()
md

In [None]:
# index -> original id lookups built from the mapping table
dict1 = dict(zip(md['article_id_index'], md['article_id']))
dict2 = dict(zip(md['customer_id_index'], md['customer_id']))


def _to_article_ids(index_list):
    """Translate a list of article indices to article ids, skipping unknown indices."""
    return [dict1[idx] for idx in index_list if idx in dict1]


# Attach the original article ids and customer id to every recommendation row.
df_recom['article_id'] = df_recom['article_id_index'].map(_to_article_ids)
df_recom['customer_id'] = df_recom['customer_id_index'].map(dict2)
df_recom

In [None]:
# Drop the helper index columns; only the original ids remain.
recom_final = df_recom.drop(columns=['customer_id_index', 'article_id_index'])
recom_final

In [None]:
from matplotlib import pyplot as plt
def plot_img(prev_items):
    """Show up to ten article images in a 2 x 5 grid.

    Image files are read from ``<path>/<first 3 chars of id>/<id>.jpg``, where
    ``path`` is the module-level image root that must be defined before this
    function is called.

    Args:
        prev_items: Iterable of article ids, either integers (leading zero
            lost when the CSV was parsed) or already zero-padded strings.
            Only the first ten items are plotted.
    """
    fig = plt.figure(figsize=(20, 10))
    for i, item in enumerate(prev_items, start=1):
        # Restore the leading zero. zfill leaves ids that are already padded
        # untouched, whereas a blind '0' prefix would corrupt them.
        # Assumes ids are 10 characters wide, i.e. the '0' + 9-digit form the
        # previous code produced - TODO confirm against articles.csv.
        item = str(item).zfill(10)
        sub = item[:3]
        image = plt.imread(path + "/" + sub + "/" + item + ".jpg")
        fig.add_subplot(2, 5, i)
        plt.imshow(image)
        # The grid only has ten slots.
        if i == 10:
            break

In [None]:
# Root folder of the article images, read by plot_img.
path = "../input/h-and-m-personalized-fashion-recommendations/images"

# Pick one customer row from the final recommendation table.
customer1 = recom_final.loc[10,:]
# Access by label: recom_final's columns are ordered (article_id, customer_id),
# so positional [0]/[1] would return the two fields swapped.
customer_id = customer1['customer_id']
predict_articles = customer1['article_id']
# Articles this customer actually bought in the one-day slice.
actual_articles = ts_1d.loc[ts_1d['customer_id']==customer_id].article_id.tolist()

print('customer id: {}'.format(customer_id))
print('actual articles')
plot_img(actual_articles)


In [None]:
# Plot the articles recommended for the customer selected in the previous cell.
print('predicted article')
plot_img(predict_articles)