# Quoting and Conclusion

@astrung published a discussion and a notebook about the distribution of users. \
Discussion: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/312653 \
Notebook: https://www.kaggle.com/astrung/eda-extract-user-metadata-to-apply-deep-model/notebook \
Please UPVOTE them too!

Thanks for the great and interesting discussions and notebooks!

I validated this strategy with the hold-out method. \
The validation term is from 2020-09-16 to 2020-09-22 (both included). \
I shift the transaction dates in the training data 1 week later for strict validation. (We cut the last week's transactions, so the number of transactions in September would otherwise be lower.)

The insight about this validation is in this discussion. \
https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/315587

In [None]:
from datetime import timedelta

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the full transaction log and parse the purchase date column.
transactions_csv = '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
df = pd.read_csv(transactions_csv)
df['t_dat'] = pd.to_datetime(df['t_dat'], format="%Y-%m-%d")

# Keep transactions up to and including 2020-09-15; later rows are held out
# for validation.
df = df.loc[df['t_dat'].le(pd.to_datetime('2020-09-15'))]
df.head()

In [None]:
# `t_dat` was already converted to datetime64 when the CSV was loaded, so the
# column is not parsed a second time here.

# Shift transactions for strict validation!
# NOTE: this overwrites `t_dat` in place, so re-running this cell shifts the
# dates by one more week each time. Re-run the loading cell first if needed.
df['t_dat'] = df['t_dat'] + timedelta(weeks=1)

# Month ('01'..'12') and year ('2020', ...) as strings, taken from the
# shifted date. Later cells rely on these being strings.
df['month'] = df['t_dat'].dt.strftime('%m')
df['year'] = df['t_dat'].dt.strftime('%Y')
df.head()

In [None]:
# Restrict to rows whose (shifted) year is 2020; `year` holds strings, hence
# the comparison against the literal '2020'.
in_2020 = df['year'].eq('2020')
df = df.loc[in_2020]
df.shape

In [None]:
# The sample submission is used below as the list of test customers
# (only its `customer_id` column is read by later cells).
df_test_user = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
df_test_user.shape

# Find inactive users

First, let's count the number of transactions in each month for all users.

In [None]:
# Count rows per (customer, month), then pivot the months into columns.
# Months in which a customer has no rows become NaN after `unstack`.
purchases_per_month = df.groupby(['customer_id', 'month'])['price'].count()
df_month_avg_item_per_u = purchases_per_month.unstack().reset_index()
df_month_avg_item_per_u

Then merge with the test data. The test data has more rows than our transaction data, which means some users in the test data don't have any transactions in 2020. Let's check how many such users there are.

In [None]:
# Outer join so that test customers without any 2020 row are kept; their
# month columns come out as NaN.
df_month_avg_item_per_u = df_month_avg_item_per_u.merge(
    df_test_user[['customer_id']],
    how='outer',
    on='customer_id',
)
df_month_avg_item_per_u

In [None]:
# Count the NaN cells in each row. The count runs over every column, so it
# equals the number of empty months only while the other columns have no NaN
# (presumably `customer_id` is never null; verify if the inputs change).
null_cells = df_month_avg_item_per_u.isna()
df_month_avg_item_per_u['num_missing_months'] = null_cells.sum(axis='columns')
df_month_avg_item_per_u

**num_missing_months=9 means the user doesn't have any transactions in 2020 (all 9 months of 2020 in the data). 37% of the users in the test data meet this condition.**

In [None]:
# Customers with all 9 month columns empty, as a count and as a share of the
# test customers.
no_purchase_all_year = df_month_avg_item_per_u['num_missing_months'] == 9
num_missing_year = len(df_month_avg_item_per_u[no_purchase_all_year])
print(num_missing_year)
print(num_missing_year / len(df_test_user))

In [None]:
# Treat a month without any transaction as a count of 0.
df_month_avg_item_per_u = df_month_avg_item_per_u.fillna(value=0)
df_month_avg_item_per_u

**Users who have been inactive for more than 3 consecutive months are still marked as 3**

In [None]:
def cal_inactive_months(x):
    """Return how many of the latest months (Sep, Aug, Jul) had no purchase.

    Parameters
    ----------
    x : mapping-like
        Indexable by the month keys '09', '08' and '07'; each value is the
        transaction count of that month.

    Returns
    -------
    int
        0 if September has purchases, 1 if August is the latest month with
        purchases, 2 if July is, and 3 if all three months are zero.
        4 is returned when a count that gets inspected is neither positive
        nor zero (e.g. NaN or a negative number).
    """
    # Walk backwards from September. The position in this tuple is the number
    # of fully inactive months that precede the first active one.
    for months_inactive, month in enumerate(('09', '08', '07')):
        count = x[month]
        if count > 0:
            return months_inactive
        if not count == 0:
            # Neither > 0 nor == 0: no regular case applies.
            return 4
    return 3

# Apply the classifier row by row to every column except the id and the
# missing-month counter.
month_columns = df_month_avg_item_per_u.columns.difference(['customer_id', 'num_missing_months'])
df_month_avg_item_per_u['lastest_inactive_months'] = (
    df_month_avg_item_per_u[month_columns].apply(cal_inactive_months, axis=1)
)
df_month_avg_item_per_u

**In the cell below, we see that 50% of users are absent for 8 or more months before becoming active again. This is another challenge in our data.**

In [None]:
# Frequency table, summary statistics and histogram of the number of months
# without transactions per customer.
missing_months = df_month_avg_item_per_u['num_missing_months']
print(missing_months.value_counts())
print(missing_months.describe())
missing_months.hist()

**In the following cell, we see that 63% of users were inactive during the most recent 3 months (Sep, Aug, July) before reappearing in the test data.**

In [None]:
# Frequency table, summary statistics and histogram of the number of latest
# inactive months per customer.
latest_inactive = df_month_avg_item_per_u['lastest_inactive_months']
print(latest_inactive.value_counts())
print(latest_inactive.describe())
latest_inactive.hist()

In [None]:
# For each inactivity level, print the number of customers and their share of
# the test customers.
inactive_level = df_month_avg_item_per_u['lastest_inactive_months']
n_test_users = len(df_test_user)

print("Missing 3 months")
num_missing_3months = int((inactive_level == 3).sum())
print(num_missing_3months)
print(num_missing_3months / n_test_users)

print("Missing 2 months")
num_missing_2months = int((inactive_level == 2).sum())
print(num_missing_2months)
print(num_missing_2months / n_test_users)

print("Missing 1 months")
num_missing_1months = int((inactive_level == 1).sum())
print(num_missing_1months)
print(num_missing_1months / n_test_users)

Create a dataframe of inactive users so that it can be merged with other information about the users.

In [None]:
# Turn the two numeric inactivity measures into one label per customer.
# The assignments run in sequence, so a later rule overrides an earlier one
# for any row that matches both.
months_missing = df_month_avg_item_per_u['num_missing_months']
months_inactive = df_month_avg_item_per_u['lastest_inactive_months']

df_month_avg_item_per_u['active_status'] = 'active'
df_month_avg_item_per_u.loc[months_missing == 9, 'active_status'] = 'inactive_in_year'
df_month_avg_item_per_u.loc[
    (months_missing < 9) & (months_inactive == 3), 'active_status'
] = 'inactive_in_3_months_or_more'
df_month_avg_item_per_u.loc[months_inactive == 2, 'active_status'] = 'inactive_in_2_months'
df_month_avg_item_per_u.loc[months_inactive == 1, 'active_status'] = 'inactive_in_1_month'
df_month_avg_item_per_u

In [None]:
# Keep only the activity metadata columns, as an independent copy.
activity_columns = ['customer_id', 'num_missing_months', 'lastest_inactive_months', 'active_status']
df_active_user = df_month_avg_item_per_u.loc[:, activity_columns].copy()
df_active_user

# Find coldstart customer

**First, count the number of transactions per user. We will mark users with fewer than 10 transactions as cold-start users. These users have too little data for accurate recommendations.**

In [None]:
# Number of transaction rows per customer, as a two-column frame.
df_avg_item_per_u = (
    df.groupby(['customer_id'])['price']
    .count()
    .rename('num_transactions')
    .reset_index()
)
df_avg_item_per_u

In the test data, some users don't have any transactions in 2020. Let's add them to our dataframe and set their number of transactions to 0.

In [None]:
# Outer join brings in test customers without transactions; their missing
# count is then replaced by 0.
df_avg_item_per_u = (
    df_avg_item_per_u
    .merge(df_test_user[['customer_id']], how='outer', on='customer_id')
    .fillna(0)
)
df_avg_item_per_u

**In the plots below, we see that most users have a small number of transactions**

In [None]:
# Histogram of transactions per customer.
df_avg_item_per_u['num_transactions'].hist(bins=100)
plt.show()
plt.close()

# Box plot of the same column.
df_avg_item_per_u.boxplot(column='num_transactions')
plt.show()
plt.close()

In [None]:
# Customers per transaction-count bucket, using these bin edges.
transaction_bins = [-1, 0, 10, 100, 1000]
df_avg_item_per_u['num_transactions'].value_counts(bins=transaction_bins)

In [None]:
# Summary statistics of transactions per customer.
df_avg_item_per_u['num_transactions'].describe()

**We mark users with fewer than 10 transactions as cold-start users**

In [None]:
# Every customer starts as 'cold_start'; those with at least 10 transactions
# are relabelled 'non_cold_start'.
has_enough_history = df_avg_item_per_u['num_transactions'] >= 10
df_avg_item_per_u['cold_start_status'] = 'cold_start'
df_avg_item_per_u.loc[has_enough_history, 'cold_start_status'] = 'non_cold_start'

df_coldstart_user = df_avg_item_per_u.copy()
df_coldstart_user

# Find the monthly transaction frequency of each user

In [None]:
# Rebuild the per-customer, per-month count table from the transactions.
# This replaces the earlier frame of the same name; inactive months are NaN
# again.
monthly_counts = df.groupby(['customer_id', 'month'])['price'].count()
df_month_avg_item_per_u = monthly_counts.unstack().reset_index()
df_month_avg_item_per_u

In [None]:
def find_active_month(x):
    """Return the non-NaN monthly counts of one row as a float array.

    Parameters
    ----------
    x : pandas.Series
        One row of the monthly count table. The first element is skipped
        (it is expected to be the customer id) and the remaining elements
        are converted to float.

    Returns
    -------
    numpy.ndarray
        The float values after the first element, with NaN entries removed.
    """
    monthly_counts = np.array(x.values[1:], dtype=float)
    is_active = np.logical_not(np.isnan(monthly_counts))
    return monthly_counts[is_active]
# Per customer, collect the counts of the months that have transactions.
# NOTE: this adds a column, so re-running the cell without rebuilding the
# table would pass the new column to `find_active_month` as well.
active_month_counts = df_month_avg_item_per_u.apply(find_active_month, axis=1)
df_month_avg_item_per_u['transactions_in_active_month'] = active_month_counts
df_month_avg_item_per_u

In [None]:
# Average the per-month counts of each customer over their active months.
df_month_avg_item_per_u['mean_transactions_in_active_month'] = (
    df_month_avg_item_per_u['transactions_in_active_month']
    .apply(lambda counts: counts.mean())
)
df_month_avg_item_per_u

On average, each user buys only about 4 items a month (about 1 item a week). This is another challenge.

In [None]:
# Summary statistics and histogram of the mean transactions per active month.
mean_per_active_month = df_month_avg_item_per_u['mean_transactions_in_active_month']
print(mean_per_active_month.describe())
mean_per_active_month.hist(bins=100)

# Create dataframe for all metadata for user: active status/cold start status

In [None]:
# Keep only the id and the frequency feature, as an independent copy.
frequency_columns = ['customer_id', 'mean_transactions_in_active_month']
df_transaction_frequent = df_month_avg_item_per_u.loc[:, frequency_columns].copy()
df_transaction_frequent

In [None]:
# Combine the activity, cold-start and frequency tables on `customer_id`.
# Outer joins keep customers that appear in only one of the tables.
result = (
    df_active_user
    .merge(df_coldstart_user, how='outer', on='customer_id')
    .merge(df_transaction_frequent, how='outer', on='customer_id')
)
result

In [None]:
# Shape of the subset that is both active and not cold-start.
is_active = result['active_status'] == 'active'
is_warm = result['cold_start_status'] == 'non_cold_start'
result[is_active & is_warm].shape

In [None]:
# Write the combined per-customer metadata to CSV in the working directory,
# without the DataFrame index.
result.to_csv('metadata_customer_id_fold1.csv', index=False)

In [None]:
# (rows, columns) of the combined metadata table.
result.shape

In [None]:
# Share of all customers in `result` that are both active and not cold-start.
active_and_warm = (result.active_status == 'active') & (result.cold_start_status == 'non_cold_start')
active_and_warm_pct = len(result[active_and_warm]) / len(result) * 100
print(f"active & non cold start user: {active_and_warm_pct:.2f}%")

# Validation for This Strategy

In [None]:
# Read the raw (un-shifted) transactions again and keep the hold-out period,
# i.e. everything from 2020-09-16 onwards.
valid_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
# Explicit format, matching how the training transactions are parsed.
valid_df['t_dat'] = pd.to_datetime(valid_df['t_dat'], format="%Y-%m-%d")
valid_df = valid_df[valid_df['t_dat'] >= pd.to_datetime('2020-09-16')]

# One row per customer who bought something in the hold-out period, with the
# metadata computed from the training period attached.
valid_user = valid_df[['customer_id']].drop_duplicates()
# The join key is named explicitly instead of being inferred from the shared
# column names.
valid_user = valid_user.merge(result, on='customer_id', how='left')

In [None]:
# Show floats with two decimals from here on (global pandas display option).
pd.options.display.float_format = '{:.2f}'.format

# For each metadata column, compute the percentage of validation users per
# value. The denominator is the number of validation users, so rows with a
# missing value count towards the total but not towards any value.
dist_parts = []
for c in ['lastest_inactive_months', 'active_status', 'cold_start_status']:
    percent = valid_user[c].value_counts().sort_index() / len(valid_user) * 100
    # Build the frame with named columns rather than reordering the output of
    # `reset_index()` by position, since those column names vary by pandas
    # version.
    part = pd.DataFrame({
        'column': c,
        'value': percent.index,
        'percent': percent.to_numpy(),
    })
    dist_parts.append(part.set_index(['column', 'value']))

# Concatenate once at the end instead of growing a frame from an empty
# DataFrame inside the loop.
valid_user_dist = pd.concat(dist_parts, axis=0)
valid_user_dist

In [None]:
# Share of validation-period buyers that are both active and not cold-start.
# The message is kept on one source line: a backslash continuation inside the
# string literal would put the next line's indentation into the printed text.
valid_active_and_warm = (
    (valid_user['active_status'] == 'active')
    & (valid_user['cold_start_status'] == 'non_cold_start')
)
valid_active_and_warm_pct = len(valid_user[valid_active_and_warm]) / len(valid_user) * 100
print(f"active & non cold start user (by all users who bought any item in valid term): {valid_active_and_warm_pct:.2f}%")