In [None]:
import numpy as np
import pandas as pd

In [None]:
# local environment
# DATA_INPUT_DIR = 'data'
# DATA_OUTPUT_DIR = 'data'

In [None]:
# Kaggle environment: read from the attached competition data set and
# write the result files to the current working directory.
DATA_OUTPUT_DIR = '.'
DATA_INPUT_DIR = '/kaggle/input/h-and-m-personalized-fashion-recommendations'

# About

This notebook creates Parquet files for the H&M Personalized Fashion Recommendations data set. The main purpose is to reduce memory usage and make the data load faster. A minimal set of data cleaning and transformation steps is included:

- missing values (NA) in customers.age have been mapped to -1
- customers.customer_id has been converted to integer codes, mappings to original ids in customer_ids.parquet
- customers.postal_code has been converted to integer codes, no mapping file created to revert this
- most of the article categories have been converted to categorical variables; the original category ids have been dropped
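- transactions_train.price has been multiplied by 590 and rounded to two decimals (most likely the original price in euros)
- transactions_train.sales_channel_id has been replaced by the boolean online_channel (False for channel 1 = offline, True for channel 2 = online)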
- two new integer date features replace the original transaction date (t_dat):
  - *yearday* - integer representation of the date, ranges from 0 to 733, which makes it easy to calculate differences in days
  - *week* - integer representation of the week, ranges from 0 to 104; week 105 is the week to be predicted

Result files:

- customers.parquet (corresponds to customers.csv with new customer_id)
- customer_ids.parquet (mapping from new customer_id to original customer_id)
- articles.parquet (corresponds to articles.csv)
- sales.parquet (corresponds to transactions_train.csv, contains new customer_id)
- sample_submission.parquet (corresponds to sample_submission.csv, contains original customer_id)

# Customers

In [None]:
# Load the raw customer table from the input directory.
customers = pd.read_csv(f'{DATA_INPUT_DIR}/customers.csv')

In [None]:
# Merge the 'None' spelling into 'NONE', then store the column as a categorical.
customers['fashion_news_frequency'] = (
    customers['fashion_news_frequency']
    .replace('None', 'NONE')
    .astype('category')
)

In [None]:
# FN and Active: missing values are mapped to 0 and both columns are stored as booleans.
for flag_column in ('FN', 'Active'):
    customers[flag_column] = customers[flag_column].fillna(0).astype('bool')

In [None]:
# Store the membership status as a categorical instead of plain objects.
customers['club_member_status'] = customers['club_member_status'].astype('category')

In [None]:
# Missing ages become the sentinel -1 so the column can be stored as int8.
customers['age'] = customers['age'].fillna(-1).astype('int8')

In [None]:
# Replace each postal code by its integer category code (missing values get -1).
# The code -> postal code mapping is not kept.
customers['postal_code'] = pd.Categorical(customers['postal_code']).codes

In [None]:
# Keep the original ids under a separate name and replace customer_id by the
# DataFrame index values, stored as int32.
# NOTE(review): this relies on the default 0..n-1 index produced by read_csv -
# confirm no rows are dropped or reordered before this cell.
customers['customer_id_original'] = customers['customer_id']
customers['customer_id'] = customers.index.to_numpy().astype('int32')

In [None]:
# Persist the mapping new customer_id -> original customer_id; it is read back
# later to translate the ids in the transaction data.
customers[['customer_id', 'customer_id_original']].to_parquet(f'{DATA_OUTPUT_DIR}/customer_ids.parquet', compression='gzip')

In [None]:
# The original ids now live in customer_ids.parquet, so drop them from the
# customer table before writing it out.
customers = customers.drop(columns='customer_id_original')
customers.to_parquet(
    f'{DATA_OUTPUT_DIR}/customers.parquet',
    compression='gzip',
)

In [None]:
# Preview the first rows of the converted customer table.
customers.head()

# Articles

In [None]:
# Load the raw article table from the input directory.
articles = pd.read_csv(f'{DATA_INPUT_DIR}/articles.csv')

In [None]:
# Each category is present twice in the raw data: as a numeric id/code column and
# as a descriptive name column. Keep only the name, stored as a categorical under
# a shorter column name, and drop the numeric id/code column.
# Mapping: new column -> (name column to convert, id/code column to drop).
category_columns = {
    'product_type': ('product_type_name', 'product_type_no'),
    'graphical_appearance': ('graphical_appearance_name', 'graphical_appearance_no'),
    'colour_group': ('colour_group_name', 'colour_group_code'),
    'perceived_colour_value': ('perceived_colour_value_name', 'perceived_colour_value_id'),
    'perceived_colour_master': ('perceived_colour_master_name', 'perceived_colour_master_id'),
    'index': ('index_name', 'index_code'),
    'index_group': ('index_group_name', 'index_group_no'),
    # Stored as 'section' (not 'section_name'): writing the categorical back to
    # 'section_name' and then dropping 'section_name' would discard the section
    # information altogether.
    'section': ('section_name', 'section_no'),
    'garment_group': ('garment_group_name', 'garment_group_no'),
}

for new_column, (name_column, id_column) in category_columns.items():
    articles[new_column] = articles[name_column].astype('category')
    articles = articles.drop(columns=[id_column, name_column])

# product_group_name has no separate id column; it is converted in place.
articles['product_group_name'] = articles['product_group_name'].astype('category')

# Downcast the remaining integer columns.
# NOTE(review): assumes all values fit into int32 - confirm against the data.
articles = articles.astype({
    'article_id': 'int32',
    'product_code': 'int32',
    'department_no': 'int32',
})

In [None]:
# Write the converted article table to the output directory.
articles.to_parquet(f'{DATA_OUTPUT_DIR}/articles.parquet', compression='gzip')

In [None]:
# Preview the first rows of the converted article table.
articles.head()

# Sales

In [None]:
# Load the raw transactions from the input directory.
sales = pd.read_csv(f'{DATA_INPUT_DIR}/transactions_train.csv')

In [None]:
# Column dtypes and memory footprint of the raw transactions, before any conversion
# (memory_usage='deep' also measures the contents of object columns).
sales.info(memory_usage='deep')

In [None]:
# descramble original price and round to cents:
# scale by 590, round at two decimals (via *100 / 100) and store as float32
sales['price'] = (
    sales['price']
    .mul(590)
    .mul(100)
    .round()
    .div(100)
    .astype('float32')
)

In [None]:
# Parse the transaction date. pd.to_datetime is used instead of
# astype('datetime64'), because casting to a unit-less datetime64 dtype
# raises a TypeError on pandas >= 2.0.
sales['t_dat'] = pd.to_datetime(sales['t_dat'])

# NOTE(review): assumes all article ids fit into int32 - confirm against the data.
sales['article_id'] = sales['article_id'].astype('int32')

In [None]:
# sales_channel_id 1: offline, 2: online
# Subtracting 1 before the bool cast maps id 1 to False and every other id to True.
sales['online_channel'] = sales['sales_channel_id'].sub(1).astype('bool')
sales = sales.drop(columns='sales_channel_id')

In [None]:
# yearday represents each day with a simple int > faster calculation of diffs, etc.
# It is the number of days since the first transaction date. Using a real date
# difference (instead of day_of_year + 365 * year offset) stays correct when the
# data continues past the end of a leap year.
first_day = sales['t_dat'].min()
sales['yearday'] = (sales['t_dat'] - first_day).dt.days.astype('int16')

# as we need to predict for a week we add a week column
# week_offset shifts the days so that the last 7 days of data form one complete
# week (the highest week number); the first week may therefore be shorter or longer.
last_yearday = sales['yearday'].max()
week_offset = np.ceil(last_yearday/7)*7 - last_yearday - 1

# int8 can hold week numbers up to 127.
sales['week'] = np.trunc((sales['yearday']+week_offset)/7).astype('int8')

# drop original date column - could be reconstructed by
# first_day + pd.to_timedelta(sales['yearday'], unit='D')
sales = sales.drop(columns='t_dat')

In [None]:
# map to new customer ids
customer_ids = pd.read_parquet(f'{DATA_OUTPUT_DIR}/customer_ids.parquet')

rows_before = len(sales)
sales = (
    sales
    .rename(columns={'customer_id': 'customer_id_original'})
    # Explicit join key; validate raises if an original id occurs more than once
    # in the mapping (which would duplicate transactions).
    .merge(customer_ids, on='customer_id_original', how='inner', validate='many_to_one')
    .drop(columns='customer_id_original')
)

# An inner join drops transactions whose customer is missing from the mapping
# without any warning - fail loudly instead of writing an incomplete file.
assert len(sales) == rows_before, (
    f'{rows_before - len(sales)} transactions have a customer_id that is missing in customer_ids.parquet'
)

In [None]:
# Arrange the columns in their final order: date features first, then ids, then values.
sales = sales.reindex(
    ['yearday', 'week', 'customer_id', 'article_id', 'price', 'online_channel'],
    axis='columns',
)

In [None]:
# Column dtypes and memory footprint after the conversions above, for comparison
# with the figures of the raw transactions.
sales.info(memory_usage='deep')

In [None]:
# Write the converted transactions to the output directory.
sales.to_parquet(f'{DATA_OUTPUT_DIR}/sales.parquet', compression='gzip')

In [None]:
# Preview the first rows of the converted transactions.
sales.head()

# Sample Submission

In [None]:
# Load the sample submission as is; no columns are converted in this notebook.
sample_submission = pd.read_csv(f'{DATA_INPUT_DIR}/sample_submission.csv')

In [None]:
# Write the unchanged sample submission to the output directory as parquet.
sample_submission.to_parquet(f'{DATA_OUTPUT_DIR}/sample_submission.parquet', compression='gzip')