# Overview

### Objective of the Notebook: 
Create customer-level aggregates that capture key attributes of each customer's past purchases, for use in further EDA or modeling.

### References: 
- https://www.kaggle.com/cdeotte/recommend-items-purchased-together-0-021
- https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635


# Data Loading and Memory Reduction

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import cudf
import dask_cudf


import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib.image as mpimg
import scipy

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the raw transactions on the GPU with cudf.
transactions = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
# Replace the string customer_id with an int64 built from its last 16 characters,
# parsed as hexadecimal (this section's heading lists memory reduction as the goal).
transactions['customer_id'] = transactions['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# Store article_id as int32.
transactions['article_id'] = transactions.article_id.astype('int32')
# Parse the transaction date column into datetimes.
transactions.t_dat = cudf.to_datetime(transactions.t_dat)
# Persist the converted table; this happens before priceK is added below,
# so the parquet file does not contain that column.
transactions.to_parquet('train.pqt',index=False)
print(transactions.shape)
# price multiplied by 1000, stored as a separate column used by the later aggregations.
transactions['priceK'] = transactions['price'] * 1000
transactions.head()

In [None]:
# Upper customer_id boundary of each decile: the 0.1, 0.2, ..., 1.0 quantiles.
# These boundaries are used later to split the enriched transactions into
# ten chunks by customer_id.
customer_id_decile = [
    transactions.customer_id.quantile((step + 1) * 0.1)
    for step in range(10)
]
customer_id_decile

# Load articles dataset and incorporate into transactions

In [None]:
# Load the article metadata with cudf; it is later left-joined onto the
# transactions on article_id.
articles = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
# List the available columns, then preview the first rows.
print(articles.columns)
articles.head()

In [None]:
# transactions = transactions.sample(10000)

In [None]:
###########3
# # dask_cudf merge is not working as expected. reference: https://github.com/rapidsai/cudf/issues/2694 
#########

# dask_transactions = dask_cudf.from_cudf(transactions,npartitions=20)
# dask_articles = dask_cudf.from_cudf(articles,npartitions=20)
# transactionsEnriched = dask_transactions.merge(dask_articles,on='article_id',how='left')
# transactionsEnriched.compute().head(2)

In [None]:
# Enrich every transaction with its article attributes using CPU dask
# dataframes (the dask_cudf merge attempted in the cell above is disabled).
#
# `import dask` alone does not load the `dask.dataframe` submodule, so
# `dask.dataframe.from_pandas` only resolves if some other package has
# already imported it. Import the submodule explicitly instead of relying on
# that side effect. The plain `dask` name stays bound as before.
import dask
import dask.dataframe as dd

# Move both tables from GPU to host memory and split them into 20 partitions.
dask_transactions = dd.from_pandas(transactions.to_pandas(), npartitions=20)
dask_articles = dd.from_pandas(articles.to_pandas(), npartitions=20)

# Lazy left join: every transaction is kept, article columns are attached
# where article_id matches. Nothing is computed until .compute() is called.
transactionsEnriched = dask_transactions.merge(dask_articles, on='article_id', how='left')


In [None]:
# import shutil
# try:
#     shutil.rmtree('df_transactionsEnriched.pqt')
# except:
#     print('na')
# transactionsEnriched.to_parquet('df_transactionsEnriched.pqt')
# transactionsEnriched.to_csv('df_transactionsEnriched.csv')

In [None]:
# df_transactionsEnriched = pd.DataFrame(transactionsEnriched)

In [None]:
# df_transactionsEnriched = pd.read_parquet('df_transactionsEnriched.pqt')
# # df_transactionsEnriched = pd.read_csv('df_transactionsEnriched.csv')
# df_transactionsEnriched.head()

# Too big to be read as regular pandas

In [None]:
# Split the (lazy) enriched transactions into ten chunks keyed 0..9, one per
# customer_id decile. Chunk 0 takes everything up to the first boundary; each
# later chunk takes the half-open interval (previous boundary, own boundary].
transactionsEnrichedDict = {}
for i in range(10):
    in_chunk = transactionsEnriched.customer_id <= customer_id_decile[i]
    if i != 0:
        above_previous = transactionsEnriched.customer_id > customer_id_decile[i - 1]
        in_chunk = above_previous & in_chunk
    transactionsEnrichedDict[i] = transactionsEnriched[in_chunk]

In [None]:
# Article attributes for which the per-customer mode is computed.
category_cols = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_no',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# Aggregation spec for groupby('customer_id').agg(...): number of purchased
# articles, mean and total priceK, then the mode of every category column.
dictAggr = {'article_id': 'count', 'priceK': ['mean', 'sum']}
dictAggr.update({name: pd.Series.mode for name in category_cols})

print(dictAggr)
    
def countItem(group, columnCat, columnCat_item):
    """Count the rows of ``group`` whose ``columnCat`` equals ``columnCat_item``.

    The count is taken over the non-null ``customer_id`` values of the
    matching rows.
    """
    is_match = group[columnCat] == columnCat_item
    return group.loc[is_match, 'customer_id'].count()

def countItemDict(group, columnCat, columnCat_itemlist):
    """Map each item of ``columnCat_itemlist`` to its row count in ``group``.

    For every item, the value is the number of non-null ``customer_id``
    entries among the rows where ``columnCat`` equals that item. Items that
    never occur map to 0.
    """
    return {
        wanted: group.loc[group[columnCat] == wanted, 'customer_id'].count()
        for wanted in columnCat_itemlist
    }



In [None]:
def _count_purchases_per_category(df, columnCat, columnCat_itemlist):
    """Build a per-customer table of purchase counts for the given categories.

    Parameters:
        df: pandas DataFrame of enriched transactions; must contain
            'customer_id' and the column named by ``columnCat``.
        columnCat: name of the categorical column to count over.
        columnCat_itemlist: the category values that each get their own
            'numpurchased_<value>' column.

    Returns:
        DataFrame with 'customer_id', the raw per-customer count dict in
        'numpurch_dict' (dropped by a later cell), and one
        'numpurchased_<value>' column per listed category.
    """
    counts = (
        df.groupby(['customer_id'])
        .apply(lambda grp: countItemDict(grp, columnCat, columnCat_itemlist))
        .reset_index()
    )
    counts.columns = ['customer_id', 'numpurch_dict']
    # Expand the dict column into one numeric column per category.
    for item in columnCat_itemlist:
        counts['numpurchased_' + item] = [row[item] for row in counts.numpurch_dict]
    return counts


transactionsSample = transactionsEnrichedDict[0]
dict_CustomerArticleAttributes = {}
dict_df_NumPurch_index_group_name = {}
dict_df_NumPurch_garment_group_name = {}

# Process one customer_id decile at a time: each chunk is materialised from
# dask into pandas, aggregated, and the three result tables are stored under
# the chunk number.
for i in range(10):

    df_transactionsEnriched = transactionsEnrichedDict[i].compute()

    # Per-customer purchase count, mean/total priceK and most frequent value
    # of every category column.
    CustomerArticleAttributes = df_transactionsEnriched.groupby('customer_id').agg(dictAggr).reset_index()
    # Flatten the two-level column index produced by agg, e.g.
    # ('priceK', 'mean') -> 'priceK mean' and ('customer_id', '') -> 'customer_id'.
    CustomerArticleAttributes.columns = [' '.join(col).strip() for col in CustomerArticleAttributes.columns.values]
    column_renames = {'article_id count': 'count'}
    column_renames.update({col + ' mode': 'mostfreq_' + col for col in category_cols})
    CustomerArticleAttributes.rename(columns=column_renames, inplace=True)

    # Purchases per 'index_group_name' value. Missing values (possible after
    # the left merge) are dropped: they cannot be turned into a column name.
    index_groups = df_transactionsEnriched['index_group_name'].dropna().unique()
    df_NumPurch_index_group_name = _count_purchases_per_category(
        df_transactionsEnriched, 'index_group_name', index_groups
    )

    # Purchases per 'garment_group_name', restricted to the 15 most frequent
    # groups. The ranking is computed once per chunk.
    # NOTE(review): the top-15 list is derived per chunk, so different chunks
    # may end up with slightly different column sets.
    top_garment_groups = (
        df_transactionsEnriched.groupby(['garment_group_name'])['customer_id']
        .agg('count')
        .sort_values(ascending=False)
        .head(15)
        .index
    )
    df_NumPurch_garment_group_name = _count_purchases_per_category(
        df_transactionsEnriched, 'garment_group_name', top_garment_groups
    )

    dict_CustomerArticleAttributes[i] = CustomerArticleAttributes
    dict_df_NumPurch_index_group_name[i] = df_NumPurch_index_group_name
    dict_df_NumPurch_garment_group_name[i] = df_NumPurch_garment_group_name


In [None]:
# Sanity check: show the first two rows of the index_group_name counts for chunk 1.
dict_df_NumPurch_index_group_name[1].head(2)

## Combine across chunks

In [None]:
# Stack the ten per-chunk results into one table each.
#
# pd.concat is used instead of DataFrame.append: append returns a new frame
# rather than modifying in place (and was removed in pandas 2.0), so calling
# it without using the result keeps only chunk 0. The garment table is also
# built from its own per-chunk dict rather than from the index_group_name one.
#
# NOTE(review): the category lists are derived per chunk, so a category
# column that is absent from some chunk is filled with NaN for that chunk's
# customers after concatenation.
chunk_ids = range(10)
CustomerArticleAttributes = pd.concat(
    [dict_CustomerArticleAttributes[i] for i in chunk_ids], ignore_index=True
)
df_NumPurch_index_group_name = pd.concat(
    [dict_df_NumPurch_index_group_name[i] for i in chunk_ids], ignore_index=True
)
df_NumPurch_garment_group_name = pd.concat(
    [dict_df_NumPurch_garment_group_name[i] for i in chunk_ids], ignore_index=True
)
CustomerArticleAttributes.shape

## Merge into single customer table

In [None]:
# Drop the helper column holding the raw per-customer count dicts; only the
# expanded numpurchased_* columns are carried into the merge.
df_NumPurch_garment_group_name=df_NumPurch_garment_group_name.drop('numpurch_dict',axis=1)
df_NumPurch_index_group_name=df_NumPurch_index_group_name.drop('numpurch_dict',axis=1)
CustomerArticleAttributes.rename(columns={'count':'totalpurchase'},inplace=True)

# Load the customer table and encode customer_id with the same
# last-16-hex-characters -> int64 conversion applied to the transactions.
customers = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
customers['customer_id'] = customers['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# NOTE(review): not the last expression of the cell, so this result is discarded.
customers.head()

# Cast every column from position 4 onwards to str before handing the frame to cudf.
# NOTE(review): positional slice — presumably it skips customer_id,
# totalpurchase, 'priceK mean' and 'priceK sum' and leaves the mostfreq_*
# columns; confirm against the actual column order.
col_objects = CustomerArticleAttributes.columns[4:]
for col in col_objects:
    CustomerArticleAttributes[col] = CustomerArticleAttributes[col].astype(str)
    
# Move the three pandas tables to the GPU.
cudf_NumPurch_garment_group_name = cudf.DataFrame(df_NumPurch_garment_group_name)
cudf_NumPurch_index_group_name = cudf.DataFrame(df_NumPurch_index_group_name)
cudf_CustomerArticleAttributes = cudf.DataFrame(CustomerArticleAttributes)

# Left-join everything onto customers, so customers without a match in the
# aggregate tables are kept.
customersEnriched = (customers.merge(cudf_NumPurch_garment_group_name,on='customer_id',how='left')
                     .merge(cudf_NumPurch_index_group_name,on='customer_id',how='left')
                     .merge(cudf_CustomerArticleAttributes,on='customer_id',how='left')
                    )
# Write the enriched customer table as CSV (parquet export is disabled below).
customersEnriched.to_csv('customersEnriched.csv')
# os.remove('./customersEnriched.pqt')
# customersEnriched.to_parquet('customersEnriched.pqt')

# #####################################
# Previous version of the code, kept for reference to show the workflow before the steps above were combined
# #####################################

# Create Customer Attributes based on Purchased Products

In [None]:
# category_cols = [
#        'product_type_name', 'product_group_name', 'graphical_appearance_no',
#        'graphical_appearance_name', 'colour_group_name',
#        'perceived_colour_value_name',
#        'perceived_colour_master_name',
#        'department_name', 'index_name',
#        'index_group_name', 'section_name',
#        'garment_group_name']

# dictAggr = {'article_id':'count',
#             'priceK':['mean','sum']
#            }

# for col in category_cols:
#     dictAggr[col]= pd.Series.mode

# dictAggr

In [None]:
# CustomerArticleAttributes = df_transactionsEnriched.groupby('customer_id').agg(dictAggr).reset_index()

In [None]:
# CustomerArticleAttributes.columns = [' '.join(col).strip() for col in CustomerArticleAttributes.columns.values]
# CustomerArticleAttributes.rename(columns={'article_id count':'count'},inplace=True)
# for col in category_cols:
#     CustomerArticleAttributes.rename(columns={col+' mode':'mostfreq_'+col},inplace=True)
# CustomerArticleAttributes.head()
# # CustomerArticleAttributes.sort_values(('article_id','count'),ascending=False).head()

## Purchase of Specific Category Flags

### Create and test the function

In [None]:
# # Do one example
# columnCat = 'index_group_name'
# columnCat_item = 'Ladieswear'
# columnCat_itemlist = ['Ladieswear','Menswear']

# def countItem(group, columnCat, columnCat_item):
#     dfOutput = group[group[columnCat]==columnCat_item]['customer_id'].count()
#     return dfOutput
# def countItemDict(group, columnCat, columnCat_itemlist):
#     Output = {}
#     for columnCat_item in columnCat_itemlist:
#         Output[columnCat_item] = group[group[columnCat]==columnCat_item]['customer_id'].count()
#     return Output

In [None]:
# dfColAggr = df_transactionsEnriched.groupby(['customer_id']).apply(lambda grp: countItemDict(grp,columnCat,columnCat_itemlist)).reset_index()
# dfColAggr.columns=['customer_id','index_group_name_dict']
# dfColAggr.head(2)

### Create for index_group_name

In [None]:
# df_transactionsEnriched.groupby('index_group_name')['customer_id'].agg('count').sort_values(ascending=False)

In [None]:
# columnCat_itemlist = df_transactionsEnriched['index_group_name'].unique()
# columnCat = 'index_group_name'
# df_NumPurch_index_group_name = df_transactionsEnriched.groupby(['customer_id']).apply(lambda grp: countItemDict(grp,columnCat,columnCat_itemlist)).reset_index()
# df_NumPurch_index_group_name.columns = ['customer_id','numpurch_dict']
# df_NumPurch_index_group_name.head()

In [None]:
# for item in columnCat_itemlist:
#     colname = 'numpurchased_'+item
#     df_NumPurch_index_group_name[colname] = [row[item] for row in df_NumPurch_index_group_name.numpurch_dict]
# df_NumPurch_index_group_name.head(2)

### Create for garment_group_name

In [None]:
# df_transactionsEnriched.groupby(['garment_group_name','index_group_name'])['customer_id'].agg('count').head(20)

In [None]:
# df_transactionsEnriched.groupby(['garment_group_name'])['customer_id'].agg('count').sort_values(ascending=False).head(20)

In [None]:
# # Get Top 15 group names
# df_transactionsEnriched.groupby(['garment_group_name'])['customer_id'].agg('count').sort_values(ascending=False).head(15).index

In [None]:
# columnCat = 'garment_group_name'
# columnCat_itemlist = df_transactionsEnriched.groupby(['garment_group_name'])['customer_id'].agg('count').sort_values(ascending=False).head(15).index
# df_NumPurch_garment_group_name = df_transactionsEnriched.groupby(['customer_id']).apply(lambda grp: countItemDict(grp,columnCat,columnCat_itemlist)).reset_index()
# df_NumPurch_garment_group_name.columns = ['customer_id','numpurch_dict']
# df_NumPurch_garment_group_name.head()

In [None]:
# for item in columnCat_itemlist:
#     colname = 'numpurchased_'+item
#     df_NumPurch_garment_group_name[colname] = [row[item] for row in df_NumPurch_garment_group_name.numpurch_dict]
# df_NumPurch_garment_group_name.head(2)

In [None]:
# df_NumPurch_garment_group_name.sort_values('numpurchased_Jersey Fancy',ascending=False).head(3)

# Merge into original customer attributes

### Recap of the tables to be merged
- df_NumPurch_garment_group_name
- df_NumPurch_index_group_name
- CustomerArticleAttributes

In [None]:
# df_NumPurch_garment_group_name=df_NumPurch_garment_group_name.drop('numpurch_dict',axis=1)
# df_NumPurch_index_group_name=df_NumPurch_index_group_name.drop('numpurch_dict',axis=1)
# CustomerArticleAttributes.rename(columns={'count':'totalpurchase'},inplace=True)


In [None]:
# df_NumPurch_garment_group_name=df_NumPurch_garment_group_name.drop('numpurch_dict',axis=1)
# df_NumPurch_garment_group_name.head(2)

In [None]:
# df_NumPurch_index_group_name=df_NumPurch_index_group_name.drop('numpurch_dict',axis=1)
# df_NumPurch_index_group_name.head(2)

In [None]:
# CustomerArticleAttributes.rename(columns={'count':'totalpurchase'},inplace=True)
# CustomerArticleAttributes.head(2)

In [None]:
# customers = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
# customers['customer_id'] = customers['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# customers.head()

In [None]:
# print(CustomerArticleAttributes.columns)
# print(CustomerArticleAttributes.columns[4:])

# col_objects = CustomerArticleAttributes.columns[4:]
# for col in col_objects:
#     CustomerArticleAttributes[col] = CustomerArticleAttributes[col].astype(str)

In [None]:
# cudf_NumPurch_garment_group_name = cudf.DataFrame(df_NumPurch_garment_group_name)
# cudf_NumPurch_index_group_name = cudf.DataFrame(df_NumPurch_index_group_name)
# cudf_CustomerArticleAttributes = cudf.DataFrame(CustomerArticleAttributes)

In [None]:
# customersEnriched = (customers.merge(cudf_NumPurch_garment_group_name,on='customer_id',how='left')
#                      .merge(cudf_NumPurch_index_group_name,on='customer_id',how='left')
#                      .merge(cudf_CustomerArticleAttributes,on='customer_id',how='left')
#                     )
# customersEnriched[customersEnriched.numpurchased_Shoes>0].head()

In [None]:
# customersEnriched.to_csv('customersEnriched.csv')
# os.remove('./customersEnriched.pqt')
# customersEnriched.to_parquet('customersEnriched.pqt')

# Save as csv and parquet