# EDA TOC
1. Shape of Ratings - how many are users and items?
2. Outlier buyers - these may be test buyers
3. Behavior in terms of time - how many users buy items in the next 7 days?
4. Behavior in terms of tops - what are the most bought items? item types? What are the items most bought one time only?
5. Buyer archetypes - what do they buy?
    - Loyal customer - >24 items (75 pct), what are the items being bought?
    - See table:
|                 | 8 items (50 pct)  | 3 items (25 pct) |   |   |
|-----------------|-------------------|------------------|---|---|
| Short Intervals | Obsessed Customer | Surge Customer   |   |   |
| Long Intervals  | Regular Customer  | Repeat Customer  |   |   |
|                 |                   |                  |   |   |


In [None]:
import numpy as np
import pandas as pd
from recommender_utils import RecommenderUtils
import seaborn as sns
from matplotlib import pyplot as plt

# Never truncate wide cell contents (e.g. long description text) when displaying frames.
pd.set_option("display.max_colwidth", None)


# DATASET LOCATIONS
INPUT_DIR = "/kaggle/input/h-and-m-personalized-fashion-recommendations"
METADATA_ITEMS_FILE = INPUT_DIR + "/articles.csv"
METADATA_USERS_FILE = INPUT_DIR + "/customers.csv"
METADATA_TRANS_FILE = INPUT_DIR + "/transactions_train.csv"
IMAGES_DIR = INPUT_DIR + "/images"
SUBMISSIONS_SAMPLE_FILE = INPUT_DIR + "/sample_submission.csv"

# IMPORTANT COLUMNS / GROUPS
USER_ID = "customer_id"
ITEM_ID = "article_id"
RATING = "price"
ITEM_CATEGORICAL_COLS = [
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
]
ITEM_TEXT_COLS = [
    "product_type_name",
    "prod_name",
    "department_name",
    "detail_desc",
]
USER_CATEGORICAL_COLS = [
    "club_member_status",
    "fashion_news_frequency",
    "postal_code",
]
USER_BOOLEAN_COLS = ["FN", "Active"]
USER_NUMERICAL_COLS = [
    "age",
    "min_purchase_interval",
    "median_purchase_interval",
    "max_purchase_interval",
]

In [None]:
# Read the three raw tables: article metadata, customer metadata and the
# transaction log.
df_items, df_users, df_txn = (
    pd.read_csv(csv_path)
    for csv_path in (METADATA_ITEMS_FILE, METADATA_USERS_FILE, METADATA_TRANS_FILE)
)

print(f"Item shape: {df_items.shape}, User shape: {df_users.shape}, Transaction Shape: {df_txn.shape}")

# one can transform the ids to categoricals, to save space
# List the available columns of every table under a short heading.
for table_label, table in (("Items", df_items), ("Users", df_users), ("Ratings", df_txn)):
    print(table_label)
    print(table.columns)

In [None]:
# Preview the first three rows of each table, in load order.
for preview_table in (df_items, df_users, df_txn):
    display(preview_table.head(3))

# Shape of Ratings
- There are 1M+ users, 100k items, and 31M ratings, giving a user-item matrix density of about 2e-4 (i.e. the matrix is very sparse).
- Prices range from just above 0.01 to about 0.5, which is a bit odd for raw prices.
- Median of the # of items to users and vice versa seem healthy. As always, one should take care of sparsity. How many users have only one transaction? This can determine the importance of the side information (item and user metadata).
    - One time purchase users are 11% while on the other end, items are 4%. It's not the worst I've seen.
    - These sparse buying users can probably be saved by the item metadata. But it's just a small fraction anyway.
- **[MODELING NOTES] Outliers: the 99th percentile of items per user is 153, and the maximum is 1346 purchases! It may be worth removing these users in the modeling stage.**

In [None]:
# Configure the helper from the shared column-name constants rather than
# repeating the string literals, so the helper and the rest of the notebook
# cannot drift apart on which columns hold the user, item and rating.
utils = RecommenderUtils(user_id=USER_ID, item_id=ITEM_ID, rating=RATING)
# Report the shape of the full transaction table.
utils.print_ratings_shape(df_txn)

**Note:** For quicker EDA, I'll subset only 10% of the transactions for some of the graphs

In [None]:
# Draw a 10% random subset of the transactions; the fixed seed keeps the
# subset identical between runs.
ratings = df_txn.sample(random_state=42, frac=0.1)
# Same shape report as for the full table, now on the subset.
utils.print_ratings_shape(ratings)

In [None]:
# Summary statistics of the rating column on the sampled transactions.
display(ratings[RATING].describe())
# `sns.distplot` is deprecated. `histplot` with a KDE overlay on a density
# scale draws the same kind of figure (normalised histogram + density curve).
# bins=50 keeps the bar count bounded on this large sample.
sns.histplot(ratings[RATING], kde=True, stat="density", bins=50)

In [None]:
# Number of distinct items bought by each user.
item_per_user = df_txn.groupby(USER_ID)[ITEM_ID].nunique()
# Number of distinct users who bought each item.
user_per_item = df_txn.groupby(ITEM_ID)[USER_ID].nunique()

fig = plt.figure(figsize=(10, 8))
# (subplot position, counts, summary-table column label, title template)
panels = (
    (211, item_per_user, "Number of items per user", "Median number of items per user: {:.2f}"),
    (212, user_per_item, "Number of users per item", "Median number of users per item: {:.2f}"),
)
for position, counts, label, title_template in panels:
    ax = fig.add_subplot(position)
    display(counts.describe().to_frame(label))
    # `sns.distplot(..., kde=False)` is deprecated; `histplot` draws the same
    # count histogram. bins=50 bounds the bar count on these long-tailed data.
    sns.histplot(counts, bins=50, ax=ax)
    ax.set_title(title_template.format(counts.median()))

fig.tight_layout()

In [None]:
# Users whose distinct-item count is exactly one, and items whose
# distinct-user count is exactly one.
one_time_buyers = item_per_user.loc[item_per_user.eq(1)]
one_time_purchases = user_per_item.loc[user_per_item.eq(1)]

# Express both groups as a fraction of all users / all items.
total_users, total_items = len(item_per_user), len(user_per_item)
pct_one_time_buyers = len(one_time_buyers) / total_users
pct_one_time_purchases = len(one_time_purchases) / total_items

print(f"(users) One time buyers: {len(one_time_buyers)} ({pct_one_time_buyers:.4f})")
print(f"(items) One time purchases: {len(one_time_purchases)} ({pct_one_time_purchases:.4f})")

In [None]:
# 99th percentile of the per-user item counts.
outlier_cutoff = item_per_user.quantile(0.99)
print(f"Outlier customers (99 pct): {outlier_cutoff}")

## Cold start users
Warning: some customers have no transactions at all (cold start), so the side information is REALLY IMPORTANT for them!

In [None]:
# there are cold-start users??
# Customers present in the user table that also appear in the transaction log.
# Despite its name, `num_users_with_txn` holds the set of ids, not a count.
known_user_ids = set(df_users[USER_ID])
num_users_with_txn = known_user_ids & set(df_txn[USER_ID])
# Every remaining row of the user table has no transaction.
num_cold_start = len(df_users) - len(num_users_with_txn)
print(f"Num cold start users!: {num_cold_start}")

# Behavior in terms of time
- The data has two years of purchasing behavior.
- ~25% of the users buy items within 7 days. Almost 60% buy within 32 days (a complete month cycle). This is a frequent buying pattern! Fast fashion?
    - **Recommenders can really boost the bottom line!**
- **[MODELING NOTES] Include average purchasing interval for customers**

In [None]:
# Parse the transaction date column into datetimes, in place, so the `.dt`
# accessor and the date arithmetic in the following cells work.
df_txn["t_dat"] = pd.to_datetime(df_txn["t_dat"])

In [None]:
# Number of transactions recorded on each date, drawn as a line chart on a
# dedicated large axes, followed by summary statistics of the date column.
daily_txn_counts = df_txn["t_dat"].value_counts()
_, ax = plt.subplots(figsize=(15, 10))
daily_txn_counts.plot(ax=ax)
display(df_txn["t_dat"].describe())

In [None]:
# Transactions per "YYYY-MM" label, sorted by label so that the table and the
# line chart read chronologically.
vc_year_months = (
    df_txn["t_dat"]
    .dt.strftime("%Y-%m")
    .value_counts()
    .sort_index()
)
display(vc_year_months)

_, ax = plt.subplots(figsize=(15, 10))
vc_year_months.plot(ax=ax)

In [None]:
# computing purchase lags
start_of_observation = df_txn["t_dat"].min()
df_txn["offset_purchase_dat"] = df_txn["t_dat"] - start_of_observation
df_txn['prev_purchase_offset'] = df_txn.groupby(USER_ID)['offset_purchase_dat'].shift()

# deduplicate same day purchases per customer
df_removed_same_day_purchases =  df_txn[df_txn["offset_purchase_dat"] != df_txn["prev_purchase_offset"]]
df_removed_same_day_purchases["purchase_lag"] = df_removed_same_day_purchases["offset_purchase_dat"] - df_removed_same_day_purchases["prev_purchase_offset"]

In [None]:
# looks right, yes?
# Spot-check the computed lags on a single customer's rows.
spot_check_customer = "fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b"
df_removed_same_day_purchases.loc[df_removed_same_day_purchases[USER_ID].eq(spot_check_customer)]

In [None]:
# this is the unaveraged lag
# this is the unaveraged lag: one value per (customer, purchase day) row.
display(
    df_removed_same_day_purchases["purchase_lag"].describe(
        percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]
    )
)
plt.figure(figsize=(15, 10))
# Lag in whole days; rows without a previous purchase have no lag and are
# dropped explicitly before plotting.
lag_days = df_removed_same_day_purchases["purchase_lag"].dt.days.dropna()
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure.
sns.histplot(lag_days, kde=True, stat="density", bins=50)

In [None]:
# this is the averaged lag
# this is the averaged lag: mean purchase lag per customer.
average_purchase_lag_per_user = df_removed_same_day_purchases.groupby(USER_ID)["purchase_lag"].mean()
# Wrap the summary in `display`: a bare expression that is not the last
# statement of a cell is evaluated and then silently discarded.
display(
    average_purchase_lag_per_user.describe(
        percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]
    )
)
plt.figure(figsize=(15, 10))
# Mean lag in whole days; customers without any lag value are dropped
# explicitly before plotting.
average_lag_days = average_purchase_lag_per_user.dt.days.dropna()
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of figure.
sns.histplot(average_lag_days, kde=True, stat="density", bins=50)

# Behavior in terms of tops 
- what are the most bought items? item types? What are the items most bought one time only?
    - these are ladieswear mostly
- are there any differences with the once-bought items?
    - Kids' items are mostly bought once. Specials?
- commonalities?
    - dark, trousers, upper garments

In [None]:
# Number of transaction rows per item.
vc_item_id = df_txn[ITEM_ID].value_counts()

# Items whose purchase count lies above the 99th percentile of all counts.
# `rename_axis` + `reset_index(name=...)` names both output columns explicitly,
# so the result no longer depends on what the pandas version in use calls the
# index of `value_counts()` ("index" on older releases, the column name on
# newer ones).
most_bought_items = (
    vc_item_id[vc_item_id > vc_item_id.quantile(0.99)]
    .rename_axis(ITEM_ID)
    .reset_index(name="count")
)

# Items that appear in exactly one transaction row.
once_bought_items = (
    vc_item_id[vc_item_id == 1]
    .rename_axis(ITEM_ID)
    .reset_index(name="count")
)

# Attach item metadata. The join key is stated explicitly instead of relying
# on whichever column names the two frames happen to share.
df_items_popular = df_items.merge(most_bought_items, on=ITEM_ID)
df_items_rare = df_items.merge(once_bought_items, on=ITEM_ID)

## Most bought items and their categories

In [None]:
# One horizontal bar chart per categorical item attribute, stacked vertically:
# the five most frequent values among the most-bought items, in reversed order.
fig, axes = plt.subplots(len(ITEM_CATEGORICAL_COLS), 1, figsize=(10, 30), squeeze=False)
for ax, cat_col in zip(axes[:, 0], ITEM_CATEGORICAL_COLS):
    top_values = df_items_popular[cat_col].value_counts().head(5)
    top_values.iloc[::-1].plot.barh(ax=ax)

    ax.set_title(cat_col)
    # Hide the right and top frame lines.
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

fig.tight_layout()

## Once bought items and their categories

In [None]:
# One horizontal bar chart per categorical item attribute, stacked vertically:
# the five most frequent values among the once-bought items, in reversed order.
fig, axes = plt.subplots(len(ITEM_CATEGORICAL_COLS), 1, figsize=(10, 30), squeeze=False)
for ax, cat_col in zip(axes[:, 0], ITEM_CATEGORICAL_COLS):
    top_values = df_items_rare[cat_col].value_counts().head(5)
    top_values.iloc[::-1].plot.barh(ax=ax)

    ax.set_title(cat_col)
    # Hide the right and top frame lines.
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

fig.tight_layout()

# Buyer Archetypes
- WIP!

- what do they buy?
    - Loyal customer - >24 items (75 pct), what are the items being bought?
    - See table:
|                 | 8 items (50 pct)  | 3 items (25 pct) |   |   |
|-----------------|-------------------|------------------|---|---|
| Short Intervals (1-7 days) | Obsessed Customer | Surge Customer   |   |   |
| Long Intervals (30-120 days) | Regular Customer  | Repeat Customer  |   |   |
|                 |                   |                  |   |   |

In [None]:
# define short interval

# Features
## Age is bimodal. How to impute this?
- This is a tough one: short of a multivariate method such as MICE (Multiple Imputation by Chained Equations), no single variable cleanly separates the two age modes.

In [None]:

# Distribution of customer age on a 1% sample. The sample is seeded so the
# figure is the same on every run. `sns.distplot` is deprecated; `histplot`
# with a KDE overlay on a density scale draws the same kind of figure.
sns.histplot(df_users["age"].sample(frac=0.01, random_state=42), kde=True, stat="density")

In [None]:
# Age distribution on a 1% customer sample, one panel per club member status.
# The sample is seeded so the figure is the same on every run.
users_sample = df_users.sample(frac=0.01, random_state=42)
sns.displot(users_sample, x="age", col="club_member_status")

In [None]:
# Age distribution on a 1% customer sample, with panels arranged by the "FN"
# (columns) and "Active" (rows) flags. The sample is seeded so the figure is
# the same on every run.
users_sample = df_users.sample(frac=0.01, random_state=42)
sns.displot(users_sample, x="age", col="FN", row="Active")

## Items

In [None]:
# Show the frequency table of every categorical item attribute.
for categorical_col in ITEM_CATEGORICAL_COLS:
    category_counts = df_items[categorical_col].value_counts()
    display(category_counts)