# H&M RECOMMENDATION SYSTEM

In [None]:
# Install Pyspark
!pip install pyspark

In [None]:
# import packages

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains
from pyspark.sql import SQLContext 
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf,col,when
from pyspark.sql.functions import to_timestamp,date_format
from pyspark.sql.functions import weekofyear
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [None]:
# Build (or reuse) the Spark session used by every cell below.
# `getOrCreate()` already returns a SparkSession, so it is bound to `spark`
# directly; wrapping it again in `SparkSession(...)` is wrong because that
# constructor expects a SparkContext, not a session.
spark = SparkSession.builder \
    .appName("Recommendations") \
    .config("spark.sql.files.maxPartitionBytes", 5000000) \
    .getOrCreate()

# `sc` used to hold this same session object; keep the name bound so any cell
# that still refers to it keeps working.
sc = spark

In [None]:
from pyspark.sql import Row
from pyspark.sql.types import *

# Column layout of transactions_train.csv as (column name, Spark type) pairs.
transaction_columns = [
    ("t_dat", DateType()),
    ("customer_id", StringType()),
    ("article_id", IntegerType()),
    ("price", DoubleType()),
    ("sales_channel_id", IntegerType()),
]

# Explicit schema, so the columns are typed as declared above.
schema = StructType(
    [StructField(column_name, column_type)
     for column_name, column_type in transaction_columns]
)

# Load the transactions; the first line of the CSV is treated as the header.
transactions_path = "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
dataset = spark.read.option("header", True).csv(transactions_path, schema=schema)

In [None]:
# Preview the first 5 rows of the transactions DataFrame
dataset.show(5)

In [None]:
# Print the column names and types of the transactions DataFrame
dataset.printSchema()

In [None]:
# NOTE: these `min` / `max` are the Spark column aggregates, not the builtins.
from pyspark.sql.functions import min, max
from pyspark.sql.functions import unix_timestamp, lit

# Earliest and latest transaction date, fetched with one aggregation query.
date_bounds_row = dataset.select(min("t_dat"), max("t_dat")).first()
min_date, max_date = date_bounds_row
min_date, max_date

Something to note here: there appear to be at least two years' worth of data. Maybe we could compute TopPop on the same calendar week in 2018, 2019 and 2020?

Someone mentioned that we should weight the results towards 2020?

In [None]:
# Create calendar weeks: tag every transaction with the week number of t_dat
week_number = weekofyear(col("t_dat"))
dataset = dataset.withColumn("week_of_year", week_number)
dataset.show()

In [None]:
# Select CW 38 (arbitrary at this point) and rank articles by purchases in it
from pyspark.sql.functions import col
from pyspark.sql import functions as F

TARGET_WEEK = 38     # calendar week the popularity ranking is built for
YEARS_AVERAGED = 3   # purchase counts are averaged over this many years

# week_of_year is produced by weekofyear(), so compare it with an integer
# literal instead of the string '38', which only worked via an implicit cast.
recommend = dataset \
    .filter(F.col('week_of_year') == F.lit(TARGET_WEEK)) \
    .groupby('article_id').count()

# Average the count per year and order most-purchased first. article_id is a
# secondary sort key so that articles with equal counts always come out in the
# same order; otherwise a later `limit(12)` could pick different articles
# from run to run.
recommend = recommend \
    .withColumn('count', col('count') / YEARS_AVERAGED) \
    .sort(['count', 'article_id'], ascending=[False, True])

In [None]:
# Preview the first 12 rows of the ranking (highest count first)
recommend.show(12)

In [None]:
# Keep only the article ids of the first 12 ranked rows, then bring them
# into pandas.
top_articles = recommend.drop(col('count')).limit(12)
recommend_df = top_articles.toPandas()

In [None]:
# Display the pandas frame holding the selected article ids
recommend_df

In [None]:
# Convert the article_id column to a NumPy integer array, going through str
# first and then int.
article_id_text = recommend_df['article_id'].astype(str)
recommend_array = article_id_text.astype(int).to_numpy()

In [None]:
# Load the customer table into a pandas DataFrame
customers_csv = '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
customers = pd.read_csv(customers_csv)

In [None]:
# Display the customer table
customers

In [None]:
# Start the submission from the customer ids; every customer gets the same
# popularity-based array of article ids in `y_score`.
submission = customers[['customer_id']].copy()
# Each row holds the very same array object, so repeat the reference rather
# than calling a Python lambda once per customer via DataFrame.apply(axis=1).
submission['y_score'] = [recommend_array] * len(submission)

In [None]:
# Display the submission frame (customer_id plus the y_score arrays)
submission

In [None]:
def _format_prediction(article_ids):
    """Join article ids into one space-separated string, each zero-padded to 10 digits."""
    return ' '.join(f'{e:010d}' for e in article_ids)


# Rename y_score to `prediction` and replace each array with its formatted
# string.
submission = submission.rename(columns={'y_score': 'prediction'})
submission['prediction'] = submission['prediction'].apply(_format_prediction)
submission.head()

In [None]:
# Write the submission to submission.csv, leaving out the DataFrame index
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the customer table with Spark. This rebinds `customers`, which earlier
# held the pandas copy of the same file.
customers = spark.read.option("header", True) \
    .csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')

dataset.createOrReplaceTempView("transaction") # Create temp view
customers.createOrReplaceTempView("customer")

# Top 12 most-purchased articles per customer age bracket.
#
# * The CSV is read without a schema, so `age` arrives as a string; it is cast
#   to a number once, up front, instead of relying on implicit casts in every
#   comparison.
# * Customers without an age get their own 'Unknown' bracket. Previously a NULL
#   age matched no WHEN branch and was silently counted as '60+'.
# * Brackets use cumulative upper bounds, so no age can fall between two
#   ranges (e.g. 30.5) and end up in the ELSE branch.
# * article_id breaks ties in the ranking so the selected 12 are reproducible.
df_bracketed_customers = spark.sql("""
                                   with typed_customers as (
                                    select customer_id
                                    , cast(age as double) as age
                                    from customer
                                    ),
                                    age_bracketed_customers as(
                                    select customer_id,
                                    CASE
                                    WHEN age IS NULL then 'Unknown'
                                    WHEN age < 20 then 'Under 20'
                                    WHEN age <= 30 then '20-30'
                                    WHEN age <= 40 then '30-40'
                                    WHEN age <= 50 then '40-50'
                                    WHEN age <= 60 then '50-60'
                                    ELSE '60+'
                                    END AS `age bracket`
                                    from typed_customers
                                    ),
                                    recs as (
                                    select
                                    article_id
                                    , `age bracket` as `purchaser_age_bracket`
                                    , row_number() over (partition by `age bracket` order by count(*) desc, article_id asc) as rank_within_age_bracket
                                    , count(*) as `purchase count`
                                    from transaction t
                                    join age_bracketed_customers a on a.customer_id = t.customer_id
                                    group by article_id, `age bracket`
                                    )
                                    select * from recs
                                    where rank_within_age_bracket <= 12
                                    order by purchaser_age_bracket, rank_within_age_bracket asc
                                   """)

In [None]:
# Run the age-bracket query and collect its result as a pandas DataFrame
pd_df_bracket_cust = df_bracketed_customers.toPandas()

In [None]:
# Display the per-age-bracket ranking table
pd_df_bracket_cust

In [None]:
# Distinct age bracket labels present in the ranking table, as a plain list
bracket_labels = pd_df_bracket_cust['purchaser_age_bracket']
groups = bracket_labels.unique().tolist()
groups

In [None]:
# Identifier-style names, one per age bracket.
# NOTE(review): presumably meant as variable names for the per-bracket arrays;
# nothing in this notebook looks them up by name - confirm whether still needed.
group_variables = ['twenty_to_thirty', 
                   'thirty_to_forty', 
                   'forty_to_fifty', 
                   'fifty_to_sixty', 
                   'over_sixty', 
                   'under_twenty']

In [None]:
def array_maker(source_df, targeted_filter):
    """Return the article ids of one age bracket as a NumPy integer array.

    Rows of ``source_df`` whose ``purchaser_age_bracket`` equals
    ``targeted_filter`` are kept in their existing order; their ``article_id``
    values are converted to ``str`` and then to ``int``.
    """
    in_bracket = source_df['purchaser_age_bracket'] == targeted_filter
    article_ids = source_df[in_bracket]['article_id']
    return article_ids.astype(str).astype(int).to_numpy()

In [None]:
# Map each age bracket label to the array of its ranked article ids.
# One pass over `groups` is enough: the former outer loop over
# `group_variables` overwrote its own loop variable and merely rebuilt the
# same entries once per name.
d = {g: array_maker(pd_df_bracket_cust, g) for g in groups}

In [None]:
# Display the bracket label -> article id array mapping
d