# H&M EDA & Customer Clustering by Kmeans (modified)

## Information

- This notebook is based on [H&M EDA & Customer Clustering by Kmeans](https://www.kaggle.com/code/hirotakanogami/h-m-eda-customer-clustering-by-kmeans).
- It runs on CPU only (no GPU / cuDF required).

In [None]:
import sys, warnings, time, os, copy, gc, re, random, pickle#, cudf
warnings.filterwarnings('ignore')
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', None)
# pd.set_option("display.max_colwidth", 10000)
import seaborn as sns
sns.set()
from pandas.io.json import json_normalize
from pprint import pprint
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()
from collections import Counter
from datetime import datetime, timedelta
#import cudf

from sklearn.cluster import KMeans
from sklearn import preprocessing

## Customer Clustering

In [None]:
# Notebook-level debug switch. The cells visible here do not read it;
# Clustering_HandM.clustering() takes its own DEBUG argument.
DEBUG = False
# Directory that holds the competition CSV files.
PATH_INPUT = r'../input/h-and-m-personalized-fashion-recommendations/'
# Alternative: CSV files located in the current working directory.
#PATH_INPUT = r'./'

In [None]:
# Load the customer table; its columns are inspected and encoded further down.
customers = pd.read_csv(PATH_INPUT + 'customers.csv')

In [None]:
class Clustering_HandM():
    """Preprocessing and k-means clustering helpers for the H&M customers table."""

    def customers_preprocessing(self, customers, dropcol=None, **kwargs):
        """Return a cleaned copy of ``customers`` with numeric feature columns.

        Args:
            customers: DataFrame holding the raw customer attributes.
            dropcol: Column labels to drop before encoding. ``None`` (the
                default) drops ``['postal_code']``.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            A new DataFrame (the input is not modified) in which, for each of
            these columns that is present:

            * ``fashion_news_frequency``: NaN/'NONE'/'None' -> 0,
              'Monthly' -> 1, 'Regularly' -> 2
            * ``club_member_status``: NaN -> 0, 'PRE-CREATE' -> 1,
              'ACTIVE' -> 2, 'LEFT CLUB' -> -1
            * ``age``: NaN -> -1
            * ``FN`` / ``Active``: NaN -> 0
        """
        # A None sentinel replaces the former mutable list default.
        if dropcol is None:
            dropcol = ['postal_code']

        # drop() returns a new frame, so the assignments below never touch
        # the caller's DataFrame.
        customers = customers.drop(dropcol, axis=1)
        customers_col = list(customers.columns)

        if 'fashion_news_frequency' in customers_col:
            # 'NONE' and 'None' are two spellings of the same category.
            customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace(
                {np.nan: 0, 'NONE': 0, 'None': 0, 'Monthly': 1, 'Regularly': 2})

        if 'club_member_status' in customers_col:
            customers['club_member_status'] = customers['club_member_status'].replace(
                {np.nan: 0, 'PRE-CREATE': 1, 'ACTIVE': 2, 'LEFT CLUB': -1})

        if 'age' in customers_col:
            # -1 marks an unknown age.
            customers['age'] = customers['age'].fillna(-1)

        if 'FN' in customers_col:
            customers['FN'] = customers['FN'].fillna(0)

        if 'Active' in customers_col:
            customers['Active'] = customers['Active'].fillna(0)

        # Report remaining nulls for every input; this used to be printed only
        # when an 'Active' column happened to exist.
        print(f'###NULL DESCRIPTION###\n{customers.isnull().sum()}')

        return customers

    def clustering(self, df, predcol, usecol, normmethod='StandardScaler', clusters=12, DEBUG=False):
        """Cluster the rows of ``df`` with k-means on the ``usecol`` features.

        Args:
            df: DataFrame containing at least the ``usecol`` columns (and the
                ``predcol`` columns when ``DEBUG`` is true).
            predcol: Identifier column labels; only used to build the debug
                frame.
            usecol: Feature column labels passed to k-means.
            normmethod: 'StandardScaler' or 'minMax'. Any other value leaves
                the features unscaled.
            clusters: Number of clusters.
            DEBUG: When true, also return the scaled feature frame.

        Returns:
            ``(df_pred, distortion)`` where ``df_pred`` is ``df`` plus a
            ``pred`` column with the cluster label and ``distortion`` is the
            fitted model's ``inertia_``. With ``DEBUG`` a third element is
            returned: the ``predcol`` columns next to the scaled features.
        """
        X = np.array(df[usecol])

        if normmethod == 'StandardScaler':
            X = preprocessing.StandardScaler().fit_transform(X)
        elif normmethod == 'minMax':
            X = preprocessing.MinMaxScaler().fit_transform(X)
        print(f'NormarlizationMethod:{normmethod}')

        km = KMeans(n_clusters=clusters,
                    init='k-means++',
                    random_state=2022)
        km.fit(X)
        distortion = km.inertia_
        print('Distortion: %.2f' % distortion)

        # Reuse df's index: concat(axis=1) aligns on index labels, so a default
        # RangeIndex here would misalign (and NaN-pad) any df that was
        # filtered or re-indexed beforehand.
        labels = pd.DataFrame(km.labels_, columns=['pred'], index=df.index)
        df_pred = pd.concat([df, labels], axis=1)

        if DEBUG:
            df_norm = pd.DataFrame(X, columns=usecol, index=df.index)
            df_norm = pd.concat([df[predcol], df_norm], axis=1)
            return df_pred, distortion, df_norm
        return df_pred, distortion
        

### Handling missing values and encoding categorical strings as integers

In [None]:
# Show the distinct raw values of each feature column before encoding.
for col_name in ('FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age'):
    print(f"{col_name}:", customers[col_name].unique())

In [None]:
# Identifier column and the feature columns handed to k-means.
predcol = ['customer_id']
usecol = ['club_member_status', 'fashion_news_frequency', 'age', 'FN', 'Active']

# Encode the categorical columns and fill missing values.
clst = Clustering_HandM()
customers = clst.customers_preprocessing(customers)

Reference: <https://qiita.com/deaikei/items/11a10fde5bb47a2cf2c2>

### Determining K, the number of clusters, with the elbow method

In [None]:
# Fit k-means for k = 1..11 and keep each fit's distortion for the elbow plot.
kx = list(range(1, 12))
distortions = []
for k in kx:
    print(f'---- K = {k} ----')
    dfCustomers, dist = clst.clustering(customers, predcol=predcol, usecol=usecol, clusters=k)
    distortions.append(dist)

In [None]:
# Elbow method: distortion (k-means inertia) against the number of clusters.
plt.plot(kx, distortions)
plt.scatter(kx, distortions)
plt.xlabel("k: number of clusters")
plt.ylabel("Distortion")
# Actually call show(); the bare attribute `plt.show` only evaluated to the
# function object and never rendered the figure explicitly.
plt.show()

### Clustering after optimizing K

In [None]:
# Number of clusters used for the final segmentation
# (presumably read off the elbow plot above - confirm).
K_NUMBER = 5

In [None]:
# Final clustering: dfCustomers is the customer table plus a 'pred' cluster label.
dfCustomers, dist = clst.clustering(customers, predcol=predcol, usecol=usecol, clusters=K_NUMBER)

In [None]:
# Bucket ages into (-1, 19], (19, 29], ..., (69, 119] to inspect each cluster's age mix.
# NOTE(review): pd.cut bins are right-closed by default, so the -1 written for
# missing ages during preprocessing falls outside the first bin and becomes NaN
# here - confirm that excluding unknown ages is intended.
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
dfCustomers['age_bins'] = pd.cut(dfCustomers['age'], listBin)

In [None]:
# Number of customers per (cluster label, age bin).
pd.crosstab(dfCustomers['pred'], dfCustomers['age_bins'])

In [None]:
# age_bins was only needed for the crosstab; remove it again.
dfCustomers = dfCustomers.drop(['age_bins'], axis=1)

In [None]:
# Load the transaction log (date, customer and article only) and index it by
# purchase date so that date ranges can be selected with .loc.
dfTransactions = pd.read_csv(
    PATH_INPUT + 'transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'article_id': 'int32', 't_dat': 'string', 'customer_id': 'string'},
)
dfTransactions = dfTransactions.assign(
    t_dat=pd.to_datetime(dfTransactions['t_dat'])
).set_index('t_dat')
dfTransactions.head()

In [None]:
# Transactions from 2020-09-01 through 2020-09-21; label slicing on the t_dat
# index includes both end dates.
dfRecent = dfTransactions.loc['2020-09-01' : '2020-09-21']
dfRecent

In [None]:
#dfRecent = dfRecent.to_pandas()
# dfRecent = dfRecent.merge(dfCustomers[['customer_id', 'age_bins']], on='customer_id', how='inner')
# Attach each buyer's cluster label; the inner join keeps only transactions
# whose customer_id also appears in dfCustomers.
dfRecent = dfRecent.merge(dfCustomers[['customer_id', 'pred']], on='customer_id', how='inner')
dfRecent

In [None]:
# Count the recent purchases of every article within each cluster.
dfRecent = (
    dfRecent
    .groupby(['pred', 'article_id'])
    .count()
    .reset_index()
    .rename(columns={'customer_id': 'counts'})
)
# Cluster labels that actually occur in the recent window.
listUniBins = dfRecent['pred'].unique().tolist()
display(dfRecent, listUniBins)

In [None]:
# For every cluster, collect the (up to) 100 most purchased article ids.
dict100 = {}
for uniBin in listUniBins:
    # dfTemp = dfRecent[dfRecent['age_bins'] == uniBin]
    dfTemp = dfRecent.loc[dfRecent['pred'] == uniBin].sort_values(by='counts', ascending=False)
    dict100[uniBin] = dfTemp['article_id'].head(100).values.tolist()

# One row per cluster, with the id list stored in a single 'top100' cell.
df100 = pd.DataFrame([dict100]).T.rename(columns={0: 'top100'})

In [None]:
# Display the per-cluster top-100 table.
df100

In [None]:
# Pairwise overlap between the clusters' top-100 lists, as a fraction of 100.
top_sets = {label: set(articles) for label, articles in df100['top100'].items()}
cluster_labels = list(df100.index)
for index in cluster_labels:
    df100[index] = [len(top_sets[index] & top_sets[x]) / 100 for x in cluster_labels]

# Only the overlap matrix is plotted, so the raw id lists are dropped.
df100 = df100.drop(columns='top100')
plt.figure(figsize=(10, 6))
sns.heatmap(df100, annot=True, cbar=False)

In [None]:
# Display the customer table with its cluster labels.
dfCustomers

In [None]:
# All cluster labels in ascending order; the loops further down handle one label at a time.
listUniBins = dfCustomers['pred'].sort_values().unique().tolist()
print(listUniBins)

In [None]:
# Read the transactions again, this time keeping t_dat as a plain string column
# (dfTransactions above was re-indexed by a parsed date).
df_org  = pd.read_csv(PATH_INPUT + 'transactions_train.csv',
                        usecols= ['t_dat', 'customer_id', 'article_id'], 
                        dtype={'article_id': 'int32', 't_dat': 'string', 'customer_id': 'string'})

In [None]:
# Display the raw transactions.
df_org

In [None]:
# Training cutoff = latest transaction date minus 7 days (the last week is held out).
# t_dat is a string column here, so max() is a lexicographic maximum; this
# assumes ISO 'YYYY-MM-DD' values, which fromisoformat() below also requires.
last_ts = df_org['t_dat'].max()
last_ts = (datetime.fromisoformat(last_ts) - timedelta(days=7)).strftime("%Y-%m-%d")
print(f'last day for train: {last_ts}')

In [None]:
# Split on the cutoff by comparing the date strings: rows up to and including
# the cutoff form the training set, later rows the validation set.
df_train = df_org[df_org['t_dat'] <= last_ts]
df_val = df_org[df_org['t_dat'] > last_ts]
# Rebind last_ts to the latest date actually present in the training split.
last_ts = df_train['t_dat'].max()
print(last_ts)

In [None]:
# Display both splits.
display(df_train, df_val)

In [None]:
# df_val was obtained by filtering df_org; take an explicit copy before adding
# or overwriting columns so the assignment never targets a slice of df_org
# (this is the pattern behind pandas' SettingWithCopyWarning).
df_val = df_val.copy()
# Article ids as zero-padded 10-character strings, the same format the
# prediction strings use.
df_val["article_id"] = df_val["article_id"].astype(str).str.zfill(10)
# Ground truth per customer: the list of articles bought in the validation week.
df_val2 = df_val.groupby(['customer_id'])["article_id"].apply(list)
df_val2

### Define Weight Parameters

In [None]:
# Weighting function: weight of a purchase as a function of its age in days,
#   y = a / sqrt(x) + b * exp(-c * x) - d
a, b, c, d = 2.5e4, 1.5e6, 3e-1, 1e3
#a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
# Start at 1 day: x = 0 would divide by zero in a / sqrt(x), and the prediction
# loops clamp the day difference to a minimum of 1 anyway.
x = np.arange(1, 100)
y = a / np.sqrt(x) + b * np.exp(-c * x) - d
plt.plot(x, y)
plt.xlabel("delta [days]")
plt.ylabel("Weight")
# Call show(); the bare attribute `plt.show` did nothing.
plt.show()

## Prediction for validation

### mAP function

In [None]:
def apk(actual, predicted, k=12):
    """Return the average precision at ``k`` for one prediction list.

    Args:
        actual: Collection of relevant items (order is irrelevant).
        predicted: Ordered sequence of predicted items; only the first ``k``
            are scored.
        k: Maximum number of predictions to score.

    Returns:
        The average precision as a float; ``0.0`` when ``actual`` is empty.
    """
    # Nothing can be hit without ground truth, so decide this before looping.
    if not actual:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    # Runs at most k times.
    for i, p in enumerate(predicted):
        # Count p only when it is relevant and has not already appeared
        # earlier in the prediction list (repeats earn no extra credit).
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """Return the mean of ``apk`` over paired ``actual`` / ``predicted`` lists.

    Args:
        actual: Sequence of ground-truth collections, one per sample.
        predicted: Sequence of prediction lists, paired with ``actual`` by
            position.
        k: Cut-off passed through to ``apk``.

    Returns:
        The mean average precision; ``0.0`` when there are no pairs (instead
        of NaN plus a NumPy warning from averaging an empty list).
    """
    scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    if not scores:
        return 0.0
    return np.mean(scores)

### prediction for validation data

In [None]:
score_k = []
num_customers = []
N = 12  # number of articles recommended per customer; matches the k=12 default of apk/mapk
# N ten-character article ids joined by single spaces (131 characters for N = 12).
max_pred_len = 11 * N - 1
for uniBin in listUniBins:
    # Customers that belong to this k-means cluster.
    if pd.isna(uniBin):
        dfCustomersTemp = dfCustomers[dfCustomers['pred'].isnull()]
    else:
        dfCustomersTemp = dfCustomers[dfCustomers['pred'] == uniBin]
    dfCustomersTemp = dfCustomersTemp.drop(['pred'], axis=1)

    # Keep only this cluster's training transactions (the merge also attaches age).
    # merge() returns a new frame, so no defensive copy of df_train is needed.
    df = df_train.merge(dfCustomersTemp[['customer_id', 'age']], on='customer_id', how='inner')
    print(f'The shape of scope transaction for {uniBin} is {df.shape}. \n')

    # Memory trick: replace the long hex customer id by an int64 built from its last 16 hex digits.
    # https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635
    df['customer_id'] = df['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
    df['t_dat'] = pd.to_datetime(df['t_dat'])
    last_ts = df['t_dat'].max()

    # ldbw: the Tuesday used as the sales-aggregation date of each purchase.
    # Monday maps to the next day, Tuesday to itself and Wednesday..Sunday to
    # the following Tuesday, i.e. (1 - dayofweek) mod 7 days ahead.
    # pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit`
    # argument is deprecated in recent pandas.
    days_to_tuesday = (1 - df['t_dat'].dt.dayofweek) % 7
    df['ldbw'] = df['t_dat'] + pd.to_timedelta(days_to_tuesday, unit='D')

    # Popularity table without customer information: units sold per
    # (aggregation date, article).
    weekly_sales = (
        df.groupby(['ldbw', 'article_id'])['t_dat']
        .count()
        .rename('count')
        .reset_index()
    )

    # Attach each purchase's own-week sales count.
    df = df.merge(weekly_sales, on=['ldbw', 'article_id'], how='left')

    # Attach 'count_targ': the article's sales count in the week whose
    # aggregation date equals last_ts.
    target_week = weekly_sales.loc[weekly_sales['ldbw'] == last_ts, ['article_id', 'count']]
    df = df.merge(target_week, on='article_id', suffixes=("", "_targ"))

    # Safeguard only: the inner merge above already removed articles without
    # target-week sales. Assigning the result (instead of a chained
    # fillna(inplace=True)) also works under pandas Copy-on-Write.
    df['count_targ'] = df['count_targ'].fillna(0)
    del weekly_sales, target_week

    # Ratio of target-week sales to the sales of the purchase's own week.
    df['quotient'] = df['count_targ'] / df['count']

    # General (fallback) recommendation: the N articles with the largest summed quotient.
    target_sales = df.groupby('article_id')['quotient'].sum()
    general_pred = ['0' + str(article_id) for article_id in target_sales.nlargest(N).index.tolist()]
    general_pred_str = ' '.join(general_pred)
    del target_sales

    # Age of every purchase in whole days relative to last_ts; 0 (the last day) is raised to 1.
    days_ago = ((last_ts - df['t_dat']) / np.timedelta64(1, 'D')).astype(int).clip(lower=1)
    # Time-decay weight, floored at 0.
    #a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    weight = (a / np.sqrt(days_ago) + b * np.exp(-c * days_ago) - d).clip(lower=0)

    # Score per (customer, article): sum of quotient * weight over all purchases.
    tmp = df[['customer_id', 'article_id']].copy()
    tmp['value'] = df['quotient'] * weight
    tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'}).reset_index()

    # Keep positive scores only, then each customer's N best dense ranks.
    tmp = tmp.loc[tmp['value'] > 0].copy()
    tmp['rank'] = tmp.groupby("customer_id")["value"].rank("dense", ascending=False)
    tmp = tmp.loc[tmp['rank'] <= N]

    # One space-separated prediction string per customer, best score first.
    # ' '.join avoids handing the builtin sum to agg(), which pandas warns about.
    purchase_df = tmp.sort_values(['customer_id', 'value'], ascending=False).reset_index(drop=True)
    purchase_df['prediction'] = '0' + purchase_df['article_id'].astype(str)
    purchase_df = purchase_df.groupby('customer_id')['prediction'].agg(' '.join).reset_index()

    # Customers that bought something in the validation week. The result of
    # reset_index() was previously discarded; assigning it makes customer_id
    # an ordinary column for the merges below.
    sub = df_val.groupby(["customer_id"]).count().reset_index()

    numCustomers = sub.shape[0]

    # Restrict to the customers of the current cluster.
    sub = sub.merge(dfCustomersTemp[['customer_id', 'age']], on='customer_id', how='inner')

    # Same int64 id encoding as used for df above.
    sub['customer_id2'] = sub['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

    sub = sub.merge(purchase_df, left_on='customer_id2', right_on='customer_id', how='left',
                    suffixes=('', '_ignored'))

    # Customers without a personal prediction get the general one; everyone is
    # padded with the general list and cut to N ids.
    sub['prediction'] = sub['prediction'].fillna(general_pred_str)
    sub['prediction'] = sub['prediction'] + ' ' + general_pred_str
    sub['prediction'] = sub['prediction'].str.strip()
    sub['prediction'] = sub['prediction'].str[:max_pred_len]
    sub = sub[['customer_id', 'prediction']].copy()
    sub['prediction2'] = sub['prediction'].apply(lambda x: x.split())

    # Score the predictions against the articles actually bought.
    sub2 = sub.merge(df_val2, on='customer_id', how='left')
    val_items = sub2["article_id"].tolist()
    outputs = sub2["prediction2"].tolist()
    score = mapk(val_items, outputs)
    print("mAP Score on Validation set:", score)
    print(f'prediction for {uniBin}. The shape is {sub.shape}. \n')

    score_k.append(score)
    num_customers.append(sub.shape[0])

    print('-'*50)
print('Finished.\n')
print('='*50)

### Calculate mAP score for the validation data

In [None]:
# Overall validation score: the per-cluster mAP values averaged with each
# cluster's number of scored customers as weight.
score_list = np.array(score_k)
num_list = np.array(num_customers)

total = (score_list * num_list).sum()
count = num_list.sum()
print("mAP Score on Validation set:", total/count)

In [None]:
# Drop the last cluster's work frames before the submission pass.
del df, sub

## Create submission data

In [None]:
# The customer list to predict for is the same for every cluster, so read it
# once instead of once per loop iteration.
dfSample = pd.read_csv(PATH_INPUT + 'sample_submission.csv',
                       usecols=['customer_id'],
                       dtype={'customer_id': 'string'})
# Total number of customers; checked after the per-cluster files are merged.
numCustomers = dfSample.shape[0]
# N ten-character article ids joined by single spaces (131 characters for N = 12).
max_pred_len = 11 * N - 1

for uniBin in listUniBins:
    # Customers that belong to this k-means cluster.
    if pd.isna(uniBin):
        dfCustomersTemp = dfCustomers[dfCustomers['pred'].isnull()]
    else:
        dfCustomersTemp = dfCustomers[dfCustomers['pred'] == uniBin]
    dfCustomersTemp = dfCustomersTemp.drop(['pred'], axis=1)

    # Keep only this cluster's transactions (the merge also attaches age).
    # merge() returns a new frame, so no defensive copy of df_org is needed.
    df = df_org.merge(dfCustomersTemp[['customer_id', 'age']], on='customer_id', how='inner')
    print(f'The shape of scope transaction for {uniBin} is {df.shape}. \n')

    # Memory trick: replace the long hex customer id by an int64 built from its last 16 hex digits.
    # https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635
    df['customer_id'] = df['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
    df['t_dat'] = pd.to_datetime(df['t_dat'])
    last_ts = df['t_dat'].max()

    # ldbw: the Tuesday used as the sales-aggregation date of each purchase.
    # Monday maps to the next day, Tuesday to itself and Wednesday..Sunday to
    # the following Tuesday, i.e. (1 - dayofweek) mod 7 days ahead.
    # pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit`
    # argument is deprecated in recent pandas.
    days_to_tuesday = (1 - df['t_dat'].dt.dayofweek) % 7
    df['ldbw'] = df['t_dat'] + pd.to_timedelta(days_to_tuesday, unit='D')

    # Popularity table without customer information: units sold per
    # (aggregation date, article).
    weekly_sales = (
        df.groupby(['ldbw', 'article_id'])['t_dat']
        .count()
        .rename('count')
        .reset_index()
    )

    # Attach each purchase's own-week sales count.
    df = df.merge(weekly_sales, on=['ldbw', 'article_id'], how='left')

    # Attach 'count_targ': the article's sales count in the week whose
    # aggregation date equals last_ts.
    target_week = weekly_sales.loc[weekly_sales['ldbw'] == last_ts, ['article_id', 'count']]
    df = df.merge(target_week, on='article_id', suffixes=("", "_targ"))

    # Safeguard only: the inner merge above already removed articles without
    # target-week sales. Assigning the result (instead of a chained
    # fillna(inplace=True)) also works under pandas Copy-on-Write.
    df['count_targ'] = df['count_targ'].fillna(0)
    del weekly_sales, target_week

    # Ratio of target-week sales to the sales of the purchase's own week.
    df['quotient'] = df['count_targ'] / df['count']

    # General (fallback) recommendation: the N articles with the largest summed quotient.
    target_sales = df.groupby('article_id')['quotient'].sum()
    general_pred = ['0' + str(article_id) for article_id in target_sales.nlargest(N).index.tolist()]
    general_pred_str = ' '.join(general_pred)
    del target_sales

    # Age of every purchase in whole days relative to last_ts; 0 (the last day) is raised to 1.
    days_ago = ((last_ts - df['t_dat']) / np.timedelta64(1, 'D')).astype(int).clip(lower=1)
    # Time-decay weight, floored at 0.
    #a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    weight = (a / np.sqrt(days_ago) + b * np.exp(-c * days_ago) - d).clip(lower=0)

    # Score per (customer, article): sum of quotient * weight over all purchases.
    tmp = df[['customer_id', 'article_id']].copy()
    tmp['value'] = df['quotient'] * weight
    tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'}).reset_index()

    # Keep positive scores only, then each customer's N best dense ranks.
    tmp = tmp.loc[tmp['value'] > 0].copy()
    tmp['rank'] = tmp.groupby("customer_id")["value"].rank("dense", ascending=False)
    tmp = tmp.loc[tmp['rank'] <= N]

    # One space-separated prediction string per customer, best score first.
    # ' '.join avoids handing the builtin sum to agg(), which pandas warns about.
    purchase_df = tmp.sort_values(['customer_id', 'value'], ascending=False).reset_index(drop=True)
    purchase_df['prediction'] = '0' + purchase_df['article_id'].astype(str)
    purchase_df = purchase_df.groupby('customer_id')['prediction'].agg(' '.join).reset_index()

    # Submission customers that belong to the current cluster.
    sub = dfSample.merge(dfCustomersTemp[['customer_id', 'age']], on='customer_id', how='inner')

    # Same int64 id encoding as used for df above.
    sub['customer_id2'] = sub['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

    sub = sub.merge(purchase_df, left_on='customer_id2', right_on='customer_id', how='left',
                    suffixes=('', '_ignored'))

    # Customers without a personal prediction get the general one; everyone is
    # padded with the general list and cut to N ids.
    sub['prediction'] = sub['prediction'].fillna(general_pred_str)
    sub['prediction'] = sub['prediction'] + ' ' + general_pred_str
    sub['prediction'] = sub['prediction'].str.strip()
    sub['prediction'] = sub['prediction'].str[:max_pred_len]
    sub = sub[['customer_id', 'prediction']]

    # One file per cluster; the next cell concatenates them.
    sub.to_csv('submission_' + str(uniBin) + '.csv', index=False)
    print(f'Saved prediction for {uniBin}. The shape is {sub.shape}. \n')
    print('-'*50)
print('Finished.\n')
print('='*50)

In [None]:
# Inspect the frames left over from the last loop iteration (the last cluster processed).
display(df, purchase_df, sub)

### Prediction data for submission

In [None]:
# Read every per-cluster file and stack them with a single concat; growing
# dfSub inside a loop re-copied all accumulated rows on each iteration.
dfSub = pd.concat(
    [pd.read_csv('submission_' + str(uniBin) + '.csv') for uniBin in listUniBins],
    axis=0,
)

# Every customer of the sample submission must appear exactly once overall.
assert dfSub.shape[0] == numCustomers, f'The number of dfSub rows is not correct. {dfSub.shape[0]} vs {numCustomers}.'

dfSub.to_csv('submission.csv', index=False)
print('Saved submission.csv.')