<p style = "font-size:40px; 
font-family: Helvetica; 
font-weight : bold; 
background-color: #036EB7; 
color : #FFFFFF; 
text-align: left; 
padding: 0px 15px; 
border-radius:3px">
	H&M Competitions Sample Dataset
</p>

### SMALL(5%): https://www.kaggle.com/datasets/adldotori/hm-small5-dataset
### MINI(1%): https://www.kaggle.com/datasets/adldotori/hm-mini1-dataset
### TINY(0.2%): https://www.kaggle.com/datasets/adldotori/hm-tiny02-dataset

In [None]:
# Directory that the competition CSV files (customers, articles,
# sample_submission, transactions_train) are read from in the cells below.
DATA_PATH = '../input/h-and-m-personalized-fashion-recommendations'

In [None]:
# cuDF provides the DataFrame API used for CSV loading and groupby work below.
import cudf
# Show which cuDF version is installed.
print('RAPIDS version', cudf.__version__)

In [None]:
import os.path as osp

In [None]:
# Load the three auxiliary competition tables from DATA_PATH in one pass.
customers, articles, sample_submission = (
    cudf.read_csv(osp.join(DATA_PATH, csv_name))
    for csv_name in ('customers.csv', 'articles.csv', 'sample_submission.csv')
)

In [None]:
# Read the transaction log and parse its purchase-date column.
transactions_csv = osp.join(DATA_PATH, 'transactions_train.csv')
train = cudf.read_csv(transactions_csv)
train['t_dat'] = cudf.to_datetime(train['t_dat'])

# Quick sanity check: table size and the first few rows.
print(train.shape)
train.head()

In [None]:
# Make sure t_dat is a datetime column, then derive string month / year
# columns from it (month first, so the column order stays the same).
train['t_dat'] = cudf.to_datetime(train['t_dat'], format="%Y-%m-%d")
for column_name, date_format in (('month', '%m'), ('year', '%Y')):
    train[column_name] = train['t_dat'].dt.strftime(date_format)
train.head()

<p style = "font-size:25px; 
font-family: Helvetica; 
font-weight : normal; 
background-color: #036EB7; 
color : #FFFFFF; 
text-align: left; 
padding: 0px 15px; 
border-radius:3px">
	Customers who purchased fewer than 5 articles
</p>

In [None]:
# Count the transactions of every customer. `customer_id` is kept as the
# group index (the groupby default) so that `.index` below yields real
# customer ids rather than positional row numbers; sort_index() then orders
# the ids deterministically, which the seeded shuffle later relies on.
customer_count = train.groupby('customer_id')[['price']].count().to_pandas().sort_index()
# "Not cold" customers have 5 or more purchases; cold customers have fewer
# than 5, as stated in the notebook text.
is_not_cold = customer_count.price >= 5
not_cold_users = customer_count[is_not_cold].index
customer_count[is_not_cold]

<p style = "font-size:25px; 
font-family: Helvetica; 
font-weight : normal; 
background-color: #036EB7; 
color : #FFFFFF; 
text-align: left; 
padding: 0px 15px; 
border-radius:3px">
	Customers whose last purchase was more than three months ago (before 2020-06-01)
</p>

In [None]:
import datetime

# Latest purchase date of every cold customer (anyone not in `not_cold_users`).
# - `customer_id` stays as the group index so that `.index` below holds
#   customer ids instead of positional row numbers.
# - max() is used rather than last() so the result does not depend on the
#   row order of `train`.
# - sort_index() gives a deterministic id order for the seeded shuffle.
customer_last_purchase = (
    train[~train.customer_id.isin(not_cold_users)]
    .groupby('customer_id')[['t_dat']]
    .max()
    .to_pandas()
    .sort_index()
)
# Split at the cutoff date. The active group is the exact complement of the
# inactive one, so a customer whose last purchase fell on 2020-06-01 itself
# is not silently dropped from both groups.
is_inactive = customer_last_purchase.t_dat < "2020-06-01"
cold_inactive_users = customer_last_purchase[is_inactive].index
cold_active_users = customer_last_purchase[~is_inactive].index
customer_last_purchase[is_inactive]

In [None]:
# Turn each user-group index into a plain Python list so it can be shuffled
# in place and sliced later.
not_cold_users, cold_inactive_users, cold_active_users = (
    group_index.to_list()
    for group_index in (not_cold_users, cold_inactive_users, cold_active_users)
)

In [None]:
import random
import numpy as np

# Fixed seed so the in-place shuffles below are repeatable.
random.seed(42)

# Shuffle the groups one after another, in this exact order, so the random
# stream is consumed the same way on every run.
for user_group in (not_cold_users, cold_inactive_users, cold_active_users):
    random.shuffle(user_group)

We have three types of users.
1. **not_cold_users**: Those who have purchased a total of 5 or more items so far, regardless of when they last purchased.
2. **cold_inactive_users**: Those who have purchased fewer than 5 items and have not purchased in the last 3 months.
3. **cold_active_users**: Those who have purchased fewer than 5 items and have purchased in the last 3 months.

### Because the three types of users behave very differently, each sample dataset takes the same fraction of every user type, so the ratio between the types is preserved.

In [None]:
# Rank article ids by how many transactions they appear in, most sold first,
# and keep only the ordered ids as a plain Python list.
sales_per_article = train.groupby('article_id')['t_dat'].count()
ranked_articles = sales_per_article.sort_values(ascending=False)
article_count = ranked_articles.index.to_pandas().to_list()

<p style = "font-size:25px; 
font-family: Helvetica; 
font-weight : normal; 
background-color: #036EB7; 
color : #FFFFFF; 
text-align: left; 
padding: 0px 15px; 
border-radius:3px">
	Generate Dataset (tiny, mini, small)
</p>

In [None]:
import random
import pandas as pd
from typing import Tuple

def _head_fraction(items: list, rate: float) -> list:
    """Return the first ``round(len(items) * rate)`` elements of *items*."""
    return items[:round(len(items) * rate)]


def generate_dataset(
    rate: float  # dataset size rate
) -> "Tuple[cudf.DataFrame, cudf.DataFrame, cudf.DataFrame]":
    """Build a down-sampled copy of the transactions, customers and articles.

    The same fraction ``rate`` is taken from each of the three user groups
    and from the popularity-ranked article list, so the proportions between
    the user groups are preserved in the sample.

    Reads the module-level globals ``not_cold_users``,
    ``cold_inactive_users``, ``cold_active_users`` (lists of customer ids),
    ``article_count`` (list of article ids, most sold first) and the
    ``train``, ``customers`` and ``articles`` tables.

    Args:
        rate: Fraction of each user group and of the article list to keep;
            must lie in ``[0, 1]``.

    Returns:
        ``(new_train, new_customer, new_articles)``: the transactions whose
        customer AND article were both selected, the selected customers, and
        the selected articles.

    Raises:
        ValueError: If ``rate`` is outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be between 0 and 1, got {rate!r}")

    # Sample every user group from ITS OWN list; slicing `not_cold_users`
    # three times would leave the cold users out of the sample entirely.
    new_users = (
        _head_fraction(not_cold_users, rate)
        + _head_fraction(cold_inactive_users, rate)
        + _head_fraction(cold_active_users, rate)
    )
    # `article_count` is ordered by sales, so this keeps the top sellers.
    selected_article_ids = _head_fraction(article_count, rate)

    keep_row = train.customer_id.isin(new_users) & train.article_id.isin(selected_article_ids)
    new_train = train[keep_row]
    new_customer = customers[customers.customer_id.isin(new_users)]
    new_articles = articles[articles.article_id.isin(selected_article_ids)]

    return new_train, new_customer, new_articles

In [None]:
# Build the three samples: 0.2% (tiny), 1% (mini) and 5% (small).
tiny_train, tiny_customer, tiny_articles = generate_dataset(rate=0.002)
mini_train, mini_customer, mini_articles = generate_dataset(rate=0.01)
small_train, small_customer, small_articles = generate_dataset(rate=0.05)

In [None]:
import os

# One output directory per sample size. exist_ok=True lets this cell be
# re-run without failing with FileExistsError when the directories already
# exist.
for dataset_dir in ('tiny', 'mini', 'small'):
    os.makedirs(dataset_dir, exist_ok=True)

In [None]:
# (table, destination) pairs, listed in the order the files are written.
# Every sample directory also receives the unmodified sample_submission.
csv_exports = [
    (tiny_train, 'tiny/transactions_train.csv'),
    (tiny_customer, 'tiny/customers.csv'),
    (tiny_articles, 'tiny/articles.csv'),
    (sample_submission, 'tiny/sample_submission.csv'),
    (mini_train, 'mini/transactions_train.csv'),
    (mini_customer, 'mini/customers.csv'),
    (mini_articles, 'mini/articles.csv'),
    (sample_submission, 'mini/sample_submission.csv'),
    (small_train, 'small/transactions_train.csv'),
    (small_customer, 'small/customers.csv'),
    (small_articles, 'small/articles.csv'),
    (sample_submission, 'small/sample_submission.csv'),
]

# Write each table without its row index.
for table, destination in csv_exports:
    table.to_csv(destination, index=False)

In [None]:
# Shell commands (IPython `!` syntax): recursively pack each sample
# directory into a zip archive of the same name.
!zip -r tiny.zip tiny/
!zip -r mini.zip mini/
!zip -r small.zip small/