In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About this pipeline
The idea is simply to collect the best-selling items from the last two weeks of observation, grouped by customer age range.

To do this, I will create an "age_range" feature on the customers table. Then I will use the transactions table, limited to the last two weeks, to rank the best-selling items within each age range.

Lastly the recommendations are collected in a list for each age_range and joined to the application table.

For a more complex solution, please feel free to check my [LightGBM.Ranker model proposal](https://www.kaggle.com/code/lorenzopagliaro01/h-m-ranker-pyspark-lgbmranker).

In [None]:
!pip install pyspark -q
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType, DoubleType, BooleanType

# Build (or reuse) the Spark session for this notebook.
# getOrCreate() already returns a SparkSession, so it is used directly rather
# than being wrapped in SparkSession(...), whose first argument is meant to be
# a SparkContext and not another session.
spark = (
    SparkSession.builder
    .appName("Recommendations")
    # Cap the bytes packed into a single file-read partition (~5 MB).
    .config("spark.sql.files.maxPartitionBytes", 5000000)
    .getOrCreate()
)

# Backward-compatible alias: this cell previously bound the session to `sc`.
sc = spark

In [None]:
# Load the three competition tables; the first row of each file is the header.
articles = spark.read.csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
    header=True,
)
customers = spark.read.csv(
    "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
    header=True,
)
transactions = spark.read.csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    header=True,
)

Add age_range feature to customers

In [None]:
# Bucket every customer into an age band and keep only customer_id + age_range.
age_col = F.col('age')

# (lower bound, upper bound, label) for each closed age band between 20 and 70.
age_bands = [
    (20, 25, '20_25'),
    (26, 30, '26_30'),
    (31, 35, '31_35'),
    (36, 40, '36_40'),
    (41, 45, '41_45'),
    (46, 50, '46_50'),
    (51, 55, '51_55'),
    (56, 60, '56_60'),
    (61, 65, '61_65'),
    (66, 70, '66_70'),
]

# Build the CASE WHEN expression band by band; the first matching band wins.
age_range_col = F.when(age_col < 20, 'under_20')
for lower, upper, label in age_bands:
    age_range_col = age_range_col.when(age_col.between(lower, upper), label)
# Whatever matched none of the bands above is labelled 'over_70'.
age_range_col = age_range_col.otherwise('over_70')

customers = (
    customers
    # Missing ages are replaced with '27' before bucketing.
    .fillna({'age': '27'})
    .withColumn('age_range', age_range_col)
    .drop('age', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code')
)

customers.show(5)

Limit data to only last 2 weeks

In [None]:
# Week (truncated to its first day) in which each purchase happened.
purchase_week = F.to_date(F.date_trunc('week', F.col('t_dat')), 'yyyy-MM-dd')

transactions = (
    transactions
    # Keep only purchases that fall in the two selected weeks.
    .filter(purchase_week.isin(['2020-09-21', '2020-09-14']))
    .withColumn('article_id_int', F.col('article_id').cast(IntegerType()))
    .drop('price', 'sales_channel_id', 't_dat')
    # Attach each buyer's age_range; purchases without a matching customer are kept.
    .join(customers, 'customer_id', 'left')
)

transactions.show(10)

Article ranking section - top 12 items of the last 2 weeks, for each age range

In [None]:
# Number of purchases per (article, age range), most purchased first.
articles_orders = (
    transactions
    .groupBy('article_id', 'age_range')
    .agg(F.count('*').alias('articles_order_count'))
    .orderBy(F.col('articles_order_count').desc())
)

articles_orders.show(50)

Keep only the 12 best-selling items of the last 2 weeks for each age range

In [None]:
# How many articles to recommend per age range.
TOP_N_ARTICLES = 12

# Rank articles inside each age range by purchase count, highest first.
# article_id is a secondary sort key so that articles with equal counts are
# always ranked in the same order; without it row_number() breaks ties
# arbitrarily and the selected top items can change between runs.
w_articles = (
    Window
    .partitionBy('age_range')
    .orderBy(F.col('articles_order_count').desc(), F.col('article_id').asc())
)

articles_orders = (
    articles_orders
    .withColumn('rn', F.row_number().over(w_articles))
    .filter(F.col('rn') <= TOP_N_ARTICLES)
)

articles_orders.show(24)

Create, for each age range, a single column containing its top 12 best-selling items as a space-separated list

In [None]:
# Build one space-separated prediction string per age range, best seller first.
# collect_list() gives no ordering guarantee after the groupBy shuffle, so the
# rank `rn` is collected together with each article, the pairs are sorted by
# rank, and only then are the article ids extracted and joined.
listina = (
    articles_orders
    .groupBy('age_range')
    .agg(
        F.sort_array(
            F.collect_list(F.struct('rn', 'article_id'))
        ).alias('ranked_articles')
    )
    # Selecting a struct field from an array of structs yields an array of that field.
    .withColumn('prediction', F.concat_ws(' ', F.col('ranked_articles.article_id')))
    .drop('ranked_articles')
)

listina.show(20)

Load application file

In [None]:
# Load the sample submission; it supplies the customer_id rows to predict for.
application = spark.read.csv(
    "../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv",
    header=True,
)

Join customers to add the age_range column to the application table, then join the predictions

In [None]:
# Replace the placeholder predictions with the per-age-range recommendations.
# The chain is wrapped in parentheses instead of ending on a dangling
# backslash, which only parsed because a blank line happened to follow it.
application = (
    application
    .drop('prediction')
    # Look up each customer's age_range ...
    .join(customers, 'customer_id', 'left')
    # ... then the prediction string built for that age_range.
    .join(listina, 'age_range', 'left')
    .drop('age_range')
)

application.show(10)

Export the prediction

In [None]:
# Collect the Spark result to the driver as a pandas DataFrame,
# then write it out as a CSV without the pandas index column.
my_pred = application.toPandas()
my_pred.to_csv(
    'my_pred.csv',
    index=False,
)