In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os
import gc 
from datetime import datetime
from collections import Counter

warnings.filterwarnings('ignore')

In [None]:
# Load the full transaction log from the competition input directory.
# No dtype / parse_dates arguments are given, so every column keeps whatever
# type pandas infers from the CSV.
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
# Peek at the first rows to see which columns are available.
transactions.head()

In [None]:
# (n_rows, n_columns) of the raw log.
transactions.shape

In [None]:
# Largest 't_dat' value in the raw column (the column has not been converted yet).
transactions['t_dat'].max()

In [None]:
# Smallest 't_dat' value in the raw column.
transactions['t_dat'].min()

# (0) Split dataset by transaction year

In [None]:
# Peel the log apart year by year, newest first. After each slice is taken its
# rows are dropped from `transactions` IN PLACE, so the next '>=' filter only
# sees older rows. The statement order therefore matters, and re-running this
# cell without reloading `transactions` yields empty 2020/2019 slices.
# NOTE(review): 't_dat' is compared against string literals here; this relies
# on the column holding ISO 'YYYY-MM-DD' text at this point — confirm.
transactions_2020 = transactions[transactions['t_dat']>='2020-01-01']
transactions.drop(index=transactions_2020.index,inplace=True)

transactions_2019 = transactions[transactions['t_dat']>='2019-01-01']
transactions.drop(index=transactions_2019.index,inplace=True)

In [None]:
# Whatever is left after the two drops becomes the 2018 slice; the original
# name is deleted so the only references left are the per-year frames.
transactions_2018 = transactions.copy()
del transactions

In [None]:
# Ask the garbage collector to reclaim the memory released above.
gc.collect()

In [None]:
# Sanity check: (latest, earliest) 't_dat' in the 2019 slice.
transactions_2019['t_dat'].max(),transactions_2019['t_dat'].min()

In [None]:
# Sanity check: (latest, earliest) 't_dat' in the 2020 slice.
transactions_2020['t_dat'].max(),transactions_2020['t_dat'].min()

In [None]:
# Sanity check: (latest, earliest) 't_dat' in the 2018 slice.
transactions_2018['t_dat'].max(),transactions_2018['t_dat'].min()

In [None]:
# Turn the 't_dat' column of every yearly slice into datetime64 values so that
# date arithmetic (recency, month extraction) works further down.
for yearly_transactions in (transactions_2018, transactions_2020, transactions_2019):
    yearly_transactions['t_dat'] = pd.to_datetime(yearly_transactions['t_dat'])

In [None]:
# Give each yearly slice a fresh 0..n-1 index; the labels inherited from the
# original frame are discarded.
for yearly_transactions in (transactions_2018, transactions_2019, transactions_2020):
    yearly_transactions.reset_index(drop=True, inplace=True)

# (1) Create RFM Features

In [None]:
class RFM():
    """Quartile-based Recency / Frequency / Monetary segmentation of customers.

    After ``fit`` the instance exposes:

    * ``recency_df``   - CustomerID, LastPurshaceDate, Recency (whole days)
    * ``frequency_df`` - CustomerID, Frequency (number of transactions)
    * ``monetary_df``  - CustomerID, Monetary (summed amount)
    * ``quantiles``    - 25% / 50% / 75% quantiles of Recency, Frequency, Monetary
    * ``rfm_segmentation`` - one row per customer (indexed by CustomerID) with the
      raw values, the three quartile scores and the concatenated ``RFMScore``.
    """

    # Columns that are scored; quantiles are computed on exactly these.
    _SCORED_COLUMNS = ['Recency', 'Frequency', 'Monetary']

    def __init__(self):
        self.recency_df = None
        self.frequency_df = None
        self.monetary_df = None
        self.quantiles = None
        self.rfm_segmentation = None

    def RScore(self,x,p,d):
        """Score a recency value: the smaller ``x`` is, the higher the score.

        ``d[p]`` must hold the 0.25 / 0.50 / 0.75 quantiles of metric ``p``.
        Returns 4 for the lowest quartile down to 1 for the highest.
        """
        if x <= d[p][0.25]:
            return 4
        elif x <= d[p][0.50]:
            return 3
        elif x <= d[p][0.75]:
            return 2
        else:
            return 1

    def FMScore(self,x,p,d):
        """Score a frequency/monetary value: the larger ``x`` is, the higher the score.

        ``d[p]`` must hold the 0.25 / 0.50 / 0.75 quantiles of metric ``p``.
        Returns 1 for the lowest quartile up to 4 for the highest.
        """
        if x <= d[p][0.25]:
            return 1
        elif x <= d[p][0.50]:
            return 2
        elif x <= d[p][0.75]:
            return 3
        else:
            return 4

    def fit(self,data,cusomer_id_col,date_column,amount_col,trans_id_col='index',reference_date=None):
        """Compute the RFM table and quartile scores for ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            One row per transaction. It is not modified.
        cusomer_id_col : str
            Column identifying the customer.
        date_column : str
            Column with the transaction date (datetime or parseable by
            ``pd.to_datetime``).
        amount_col : str
            Column with the transaction amount; cast to float.
        trans_id_col : str, default 'index'
            Column whose non-null values are counted per customer. With the
            default, the column produced by ``reset_index()`` is used; if the
            input index is named (so no ``'index'`` column exists) the rows per
            customer are counted instead.
        reference_date : datetime-like, optional
            Date that Recency is measured from. Defaults to the current time,
            which makes Recency change from run to run; pass a fixed date for
            reproducible values.

        Returns
        -------
        RFM
            ``self``, to allow chaining.
        """
        # reset_index() already returns a new frame, so the caller's data is safe.
        data = data.reset_index()
        data[amount_col] = data[amount_col].astype(float)
        data[date_column] = pd.to_datetime(data[date_column])
        now = pd.Timestamp(datetime.now() if reference_date is None else reference_date)

        self.recency_df = data.groupby(by=cusomer_id_col, as_index=False)[date_column].max()
        self.recency_df.columns = ['CustomerID','LastPurshaceDate']
        # Vectorised day difference instead of a Python-level apply per customer.
        self.recency_df['Recency'] = (now - self.recency_df['LastPurshaceDate']).dt.days

        if trans_id_col == 'index' and trans_id_col not in data.columns:
            # A named input index leaves no 'index' column behind; every row is
            # one transaction, so counting rows gives the same frequency.
            self.frequency_df = data.groupby(by=cusomer_id_col, as_index=False).size()
        else:
            self.frequency_df = data.groupby(by=cusomer_id_col, as_index=False)[trans_id_col].count()
        self.frequency_df.columns = ['CustomerID','Frequency']

        self.monetary_df = data.groupby(by=cusomer_id_col,as_index=False).agg({amount_col: 'sum'})
        self.monetary_df.columns = ['CustomerID','Monetary']

        temp_df = self.recency_df.merge(self.frequency_df,on='CustomerID')
        rfm_df = temp_df.merge(self.monetary_df,on='CustomerID')
        rfm_df.set_index('CustomerID',inplace=True)

        # Restrict to the scored columns so the datetime column never takes part
        # in the quantile computation, whatever the pandas default is.
        self.quantiles = rfm_df[self._SCORED_COLUMNS].quantile(q=[0.25,0.5,0.75])

        self.rfm_segmentation = rfm_df
        self.rfm_segmentation['R_Quartile'] = self.rfm_segmentation['Recency'].apply(self.RScore, args=('Recency',self.quantiles,))
        self.rfm_segmentation['F_Quartile'] = self.rfm_segmentation['Frequency'].apply(self.FMScore, args=('Frequency',self.quantiles,))
        self.rfm_segmentation['M_Quartile'] = self.rfm_segmentation['Monetary'].apply(self.FMScore, args=('Monetary',self.quantiles,))

        # Three-character segment label, e.g. '444' for the best customers.
        self.rfm_segmentation['RFMScore'] = self.rfm_segmentation.R_Quartile.map(str) \
                            + self.rfm_segmentation.F_Quartile.map(str) \
                            + self.rfm_segmentation.M_Quartile.map(str)
        return self

In [None]:
# One independent RFM model per yearly slice.
rfm_2018 = RFM()
rfm_2019 = RFM()
rfm_2020 = RFM()

In [None]:
# Fit on the 2018 slice; %time reports how long the fit takes.
%time rfm_2018.fit(transactions_2018,cusomer_id_col='customer_id',date_column='t_dat',amount_col='price')

In [None]:
# Fit on the 2019 slice.
%time rfm_2019.fit(transactions_2019,cusomer_id_col='customer_id',date_column='t_dat',amount_col='price')

In [None]:
# Fit on the 2020 slice.
%time rfm_2020.fit(transactions_2020,cusomer_id_col='customer_id',date_column='t_dat',amount_col='price')

In [None]:
# Generator of single-entry dicts {0: 0}, {1: 1}, ... {9: 9}.
# NOTE(review): `gen` is not referenced anywhere else in the visible notebook;
# this looks like leftover scratch code — confirm and delete the cell.
gen = ({key:value} for key,value in enumerate(range(10)))

In [None]:
# Short aliases for the fitted per-customer RFM tables (no copy is made).
rfm_segmentation_2020 = rfm_2020.rfm_segmentation
rfm_segmentation_2019 = rfm_2019.rfm_segmentation
rfm_segmentation_2018 = rfm_2018.rfm_segmentation

In [None]:
# Re-index each yearly slice by customer so later cells can join/group on the
# index. set_index returns a new frame, hence the *_copy names.
transactions_2018_copy = transactions_2018.set_index('customer_id')
transactions_2019_copy = transactions_2019.set_index('customer_id')
transactions_2020_copy = transactions_2020.set_index('customer_id')

In [None]:
# Sanity check: re-indexing must not change the number of rows.
transactions_2018_copy.shape[0]==transactions_2018.shape[0]

In [None]:
# The integer-indexed originals are no longer needed; drop the names so their
# memory can be reclaimed. Cells above that use them cannot be re-run after this.
del transactions_2018
del transactions_2019
del transactions_2020

# (2) Get last purchased item

In [None]:
def get_last_purchased_item(transactions_df,last_perchased_date):
    """Attach each customer's last purchase date to every transaction row.

    Exploratory version: performs an index-on-index left join of
    ``last_perchased_date`` onto ``transactions_df`` and returns the joined
    frame unchanged. A later cell redefines this name with an extra ``year``
    argument.
    """
    joined = transactions_df.join(last_perchased_date)
    return joined

In [None]:
last_puchased_item_2018_test =  get_last_purchased_item(transactions_2018_copy,rfm_segmentation_2018[['LastPurshaceDate']])

In [None]:
last_puchased_item_2018_test.sample(10000).reset_index().groupby('index').agg({"article_id":'count'}).value_counts()

In [None]:
last_puchased_item_2018_test.sample(10000).reset_index().groupby('index').agg({"article_id":'count'}).value_counts()

In [None]:
last_puchased_item_2018_test.sample(10000).reset_index().groupby('index').agg({"article_id":'count'}).value_counts()

In [None]:
def get_last_purchased_item(transactions_df,last_perchased_date,year,date_col='t_dat'):
    """Return the article each customer bought on their last purchase date.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Transactions indexed by customer id, with ``date_col`` and
        ``'article_id'`` columns.
    last_perchased_date : pandas.DataFrame or pandas.Series
        Last purchase date per customer, indexed by (unique) customer id. For a
        DataFrame the first column is used.
    year : int or str
        Used only to name the output column.
    date_col : str, default 't_dat'
        Transaction date column; must be comparable with the last purchase
        dates (same dtype).

    Returns
    -------
    pandas.DataFrame
        One row per customer, indexed like ``transactions_df``, with a single
        column ``last_purchesed_item_<year>``. When several articles were
        bought on the last date, the one appearing last in ``transactions_df``
        is kept. Customers without a matching last purchase date are omitted.
    """
    if isinstance(last_perchased_date, pd.DataFrame):
        last_dates = last_perchased_date.iloc[:, 0]
    else:
        last_dates = last_perchased_date

    # Broadcast each customer's last date onto their transaction rows. Comparing
    # the raw arrays avoids aligning two Series on a non-unique index.
    row_last_dates = last_dates.reindex(transactions_df.index).to_numpy()
    on_last_date = transactions_df[date_col].to_numpy() == row_last_dates

    # Keep only rows from the last purchase date, then one row per customer.
    temp_df = transactions_df.loc[on_last_date, ['article_id']]
    temp_df = temp_df[~temp_df.index.duplicated(keep='last')]
    temp_df.columns = [f"last_purchesed_item_{str(year)}"]
    return temp_df

In [None]:
%%time

# Build the per-customer "last purchased item" table for every year; the
# `year` argument only determines the name of the resulting column.
last_puchased_item_2018 =  get_last_purchased_item(transactions_2018_copy,rfm_segmentation_2018[['LastPurshaceDate']],year=2018)
last_puchased_item_2019 =  get_last_purchased_item(transactions_2019_copy,rfm_segmentation_2019[['LastPurshaceDate']],year=2019)
last_puchased_item_2020 =  get_last_purchased_item(transactions_2020_copy,rfm_segmentation_2020[['LastPurshaceDate']],year=2020)

# (3) Get most frequent items per user

In [None]:
def get_most_frequent_item(transactions_df,year):
    """Find the article each customer bought most often.

    ``transactions_df`` must be indexed by customer and contain an
    ``'article_id'`` column. Returns a one-column DataFrame, indexed by
    customer, named ``most_frequent_item_<year>``. Ties are resolved by
    ``Counter.most_common``.
    """
    def _top_article(article_ids):
        # Most common article id within one customer's purchases.
        return Counter(article_ids).most_common(n=1)[0][0]

    per_customer = transactions_df.groupby(transactions_df.index)['article_id']
    top_items = per_customer.agg(_top_article)
    return top_items.to_frame(name=f'most_frequent_item_{str(year)}')

In [None]:
%%time

# Most frequently bought article per customer, computed separately per year.
most_frequent_item_2018 = get_most_frequent_item(transactions_2018_copy,year=2018)
most_frequent_item_2019 = get_most_frequent_item(transactions_2019_copy,year=2019)
most_frequent_item_2020 = get_most_frequent_item(transactions_2020_copy,year=2020)

# (4) Get users monthly money spending

In [None]:
def get_month_breakdown(df1,df2,df3):
    """Total money spent per customer and calendar month, summed over all inputs.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Transactions indexed by customer id with a ``'t_dat'`` column (datetime
        or parseable by ``pd.to_datetime``) and a numeric ``'price'`` column.
        The inputs are not modified.

    Returns
    -------
    pandas.DataFrame
        One row per customer (unnamed index) and exactly twelve columns
        ``1 .. 12`` (columns axis named ``'month'``) in calendar order. A cell
        holds the summed price for that customer in that month of any year;
        months without purchases are 0.
    """
    per_frame_totals = []
    for frame in (df1, df2, df3):
        # Plain array of month numbers so grouping is purely positional and
        # never has to align on the non-unique customer index.
        month = pd.to_datetime(frame['t_dat']).dt.month.to_numpy()
        totals = frame['price'].groupby([frame.index, month]).sum()
        totals.index.names = ['customer', 'month']
        per_frame_totals.append(totals)

    # The same (customer, month) pair can occur in several inputs: add them up.
    combined = pd.concat(per_frame_totals).groupby(level=['customer', 'month']).sum()

    df = combined.unstack('month', fill_value=0.0)
    # Always expose all twelve months in calendar order, even if some month
    # never occurs or the inputs start mid-year.
    df = df.reindex(columns=range(1, 13), fill_value=0.0)
    df.columns.name = 'month'
    df.index.name = None
    return df

In [None]:
%%time

# Customer x month spending table built from all three yearly frames.
transactions_month_view = get_month_breakdown(transactions_2018_copy,transactions_2019_copy,transactions_2020_copy)

# (5) Simple EDA

In [None]:
# NOTE(review): N is not used anywhere else in the visible notebook — confirm
# whether it is still needed.
N = 100_000
# Global matplotlib style for every figure below.
plt.style.use('fivethirtyeight')

# Column names produced by get_last_purchased_item / get_most_frequent_item,
# listed in the order 2018, 2019, 2020.
col_names_l = ("last_purchesed_item_2018","last_purchesed_item_2019","last_purchesed_item_2020")
col_names_f = ("most_frequent_item_2018","most_frequent_item_2019","most_frequent_item_2020")

# Copies of the per-year tables, in the same 2018, 2019, 2020 order as the
# column-name tuples above, to be handed to the plotting helper.
dfs_l = (last_puchased_item_2018.copy(),last_puchased_item_2019.copy(),last_puchased_item_2020.copy())
dfs_f = (most_frequent_item_2018.copy(),most_frequent_item_2019.copy(),most_frequent_item_2020.copy())

## (a) RFM Features

**Definitions for RFM**

- RECENCY (R): Number of days since the customer's last purchase.
- FREQUENCY (F): Total number of purchases made by the customer.
- MONETARY VALUE (M): Total amount of money the customer spent.

In [None]:
def get_summary():
    """Summarise the RFM tables and transaction spans of 2018, 2019 and 2020.

    Reads the notebook-level ``transactions_<year>_copy`` frames (for the date
    span) and ``rfm_segmentation_<year>`` tables (for everything else).

    Returns
    -------
    pandas.DataFrame
        One row per year ('2018', '2019', '2020') with the columns
        mean_frequency, mean_monetary, median_frequency, median_monetary,
        total_money_spent, cutomer_count, n_days, customers/days, money/days.
        ``n_days`` is the difference in days between the latest and earliest
        't_dat'; the two per-day ratios are NaN when that span is 0.
    """
    yearly_inputs = {
        '2018': (transactions_2018_copy, rfm_segmentation_2018),
        '2019': (transactions_2019_copy, rfm_segmentation_2019),
        '2020': (transactions_2020_copy, rfm_segmentation_2020),
    }

    rfm_info = dict()
    for year, (transactions_df, rfm_df) in yearly_inputs.items():
        n_days = (transactions_df['t_dat'].max() - transactions_df['t_dat'].min()).days
        customer_count = int(rfm_df.shape[0])
        total_money_spent = rfm_df['Monetary'].sum()

        # Key order defines the column order of the returned frame.
        rfm_info[year] = {
            'mean_frequency': rfm_df['Frequency'].mean(),
            'mean_monetary': rfm_df['Monetary'].mean(),
            'median_frequency': rfm_df['Frequency'].median(),
            'median_monetary': rfm_df['Monetary'].median(),
            'total_money_spent': total_money_spent,
            'cutomer_count': customer_count,
            'n_days': n_days,
            # A slice covering a single day has a 0-day span; avoid dividing by it.
            'customers/days': customer_count / n_days if n_days else float('nan'),
            'money/days': total_money_spent / n_days if n_days else float('nan'),
        }

    return pd.DataFrame(rfm_info).T

In [None]:
# Per-year summary table (one row per year).
info = get_summary()

In [None]:
# Render the table with a viridis colour gradient applied along axis='rows'.
info.style.background_gradient(cmap='viridis',axis='rows')

## (b) Last purchased items

In [None]:
# Article metadata; the plotting helper below reads 'article_id' plus the
# chosen descriptive column (e.g. 'prod_name') from this frame.
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')

In [None]:
def plot_product_popularity(product_df,col_names,dfs,product_feature='prod_name'):
    """Draw the 20 most common values of an article attribute for 2018-2020.

    Parameters
    ----------
    product_df : pandas.DataFrame
        Article metadata with an ``'article_id'`` column and ``product_feature``.
    col_names : sequence of three str
        Name of the article-id column in each frame of ``dfs`` (2018, 2019, 2020).
    dfs : sequence of three pandas.DataFrame
        Per-customer tables for 2018, 2019 and 2020.
    product_feature : str, default 'prod_name'
        Attribute the article ids are translated to before counting.

    Shows one figure with three horizontal bar charts and returns nothing.
    """
    # Unpacking enforces that exactly three names / frames were supplied.
    col_name1,col_name2,col_name3 = col_names
    df_2018,df_2019,df_2020 = dfs

    # article_id -> attribute value lookup
    mapper = dict(zip(product_df['article_id'].values,product_df[product_feature].values))

    # Translate every article id to the requested attribute before plotting.
    panels = [
        ('2018', df_2018[col_name1].map(mapper)),
        ('2019', df_2019[col_name2].map(mapper)),
        ('2020', df_2020[col_name3].map(mapper)),
    ]

    _,axi = plt.subplots(1,3,figsize=(15,9))
    for axis, (year_label, feature_values) in zip(axi, panels):
        # Ascending sort + tail(20) puts the most frequent value at the top bar.
        top_counts = feature_values.value_counts().sort_values().tail(20)
        top_counts.plot.barh(ax=axis,title=year_label)

    plt.tight_layout()
    plt.grid(False)
    plt.show()

#### (i) Product name trend

In [None]:
# Last purchased items, counted by product name (the helper's default feature).
plot_product_popularity(articles,col_names=col_names_l,dfs=dfs_l)

#### (ii) Product type name trend

In [None]:
# Same tables, counted by product type.
plot_product_popularity(articles,col_names=col_names_l,dfs=dfs_l,product_feature='product_type_name')

#### (iii) Index group name trend

In [None]:
# Same tables, counted by index group.
plot_product_popularity(articles,col_names=col_names_l,dfs=dfs_l,product_feature='index_group_name')

#### (iv) Section name trend

In [None]:
# Same tables, counted by section.
plot_product_popularity(articles,col_names=col_names_l,dfs=dfs_l,product_feature='section_name')

## (c) Most frequent items

#### (i) Product name trend

In [None]:
# Most frequently bought items, counted by product name (the helper's default feature).
plot_product_popularity(articles,col_names=col_names_f,dfs=dfs_f)

#### (ii) Product type name trend

In [None]:
# Same tables, counted by product type.
plot_product_popularity(articles,col_names=col_names_f,dfs=dfs_f,product_feature='product_type_name')

#### (iii) Index group name trend

In [None]:
# Same tables, counted by index group.
plot_product_popularity(articles,col_names=col_names_f,dfs=dfs_f,product_feature='index_group_name')

#### (iv) Section name trend

In [None]:
# Same tables, counted by section.
plot_product_popularity(articles,col_names=col_names_f,dfs=dfs_f,product_feature='section_name')

## (d) Money spending

### Check sparsity

In [None]:
# Sparsity pattern of the customer x month spending table: each marker is a
# non-zero (month, customer) cell for 200 randomly chosen customers.
# The sample is seeded so the figure is identical on every Restart & Run All.
plt.figure(figsize=(15,20))
plt.spy(transactions_month_view.sample(200, random_state=42).T)
plt.tight_layout()
plt.axis('off')
plt.grid(False)
plt.show()

In [None]:
# Total money spent per calendar month, summed over all customers and years.
plt.figure(figsize=(12,8))
month_abbreviations = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
# Sort by month number and derive each tick label from the column it belongs
# to, so a bar can never be labelled with the wrong month even if the table's
# columns are not stored in 1..12 order or a month is missing.
monthly_totals = transactions_month_view.sum(axis=0).sort_index()
ax = monthly_totals.plot.bar()
ax.set_xticklabels([month_abbreviations[int(month) - 1] for month in monthly_totals.index])
ax.set_ylabel("Total Money Spent")
plt.show()

## (6) Save Datasets

In [None]:
# Persist every derived table as CSV in the working directory, one file per
# table and year. reset_index() turns the customer index into a regular column.
rfm_segmentation_2018.reset_index().to_csv("rfm_features_2018.csv")
rfm_segmentation_2019.reset_index().to_csv("rfm_features_2019.csv")
rfm_segmentation_2020.reset_index().to_csv("rfm_features_2020.csv")

transactions_2018_copy.reset_index().to_csv("transactions_2018.csv")
transactions_2019_copy.reset_index().to_csv("transactions_2019.csv")
transactions_2020_copy.reset_index().to_csv("transactions_2020.csv")

last_puchased_item_2018.reset_index().to_csv("last_purchased_items_2018.csv")
last_puchased_item_2019.reset_index().to_csv("last_purchased_items_2019.csv")
last_puchased_item_2020.reset_index().to_csv("last_purchased_items_2020.csv")

# Each year gets its own file; writing all three to one shared name would
# leave only the last year on disk.
most_frequent_item_2018.reset_index().to_csv("most_frequent_items_2018.csv")
most_frequent_item_2019.reset_index().to_csv("most_frequent_items_2019.csv")
most_frequent_item_2020.reset_index().to_csv("most_frequent_items_2020.csv")