## Exploratory data analysis of cold-start dataset
 - We will explore the attributes of the newly created train/test datasets and identify the user cold-start problem
 - In particular, we will verify that the datasets still share common traits, as this is a prerequisite for proper modeling:
   - compare unique item counts
   - compare distribution of slate sizes
   - compare distribution of slates per user
   - compare history sizes
   - compare click ratios
   - compare categories and subcategories distribution
 - **Questions**:
   - Does user cold-start occur in the newly created dataset?
   - How many users are affected by user cold-start?
   - Does item cold-start occur in the newly created dataset?

In [None]:
# Pick the data root depending on where the notebook runs.
# Only a failed import means "not in Colab"; a bare `except:` would also hide
# Ctrl-C and Drive mount failures and silently fall back to a wrong BASE_DIR.
try:
    from google.colab import drive
except ImportError:
    # Local run: data is expected one level above the notebook directory.
    BASE_DIR = ".."
    IN_COLAB = False
else:
    # Colab run: mount Google Drive; any mount error is surfaced to the user.
    drive.mount('/content/gdrive')
    BASE_DIR = "/content/gdrive/MyDrive/mlprague2022"
    IN_COLAB = True

import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import plotly.graph_objects as go

In [None]:
# Folder holding the cold-start datasets, resolved relative to BASE_DIR.
OUTPUT_DIR = os.path.join(BASE_DIR, "data/mind_cold_start_datasets_basic/")

# Train and test behaviour logs live side by side in OUTPUT_DIR.
COLD_START_BEHAVIORS_TRAIN, COLD_START_BEHAVIORS_TEST = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in ("behaviors_train.tsv", "behaviors_test.tsv")
)

In [None]:
# Load the tab-separated train behaviours file and display the frame.
behaviors_train = pd.read_csv(
    filepath_or_buffer=COLD_START_BEHAVIORS_TRAIN,
    sep="\t",
)
behaviors_train

In [None]:
# Load the tab-separated test behaviours file and display the frame.
behaviors_test = pd.read_csv(
    filepath_or_buffer=COLD_START_BEHAVIORS_TEST,
    sep="\t",
)
behaviors_test

### Unique slate count

In [None]:
# distinct slate ids as a (train, test) pair
tuple(
    frame["slateid"].nunique()
    for frame in (behaviors_train, behaviors_test)
)

### Unique user count

In [None]:
# distinct user ids as a (train, test) pair
tuple(
    frame["userid"].nunique()
    for frame in (behaviors_train, behaviors_test)
)

In [None]:
# How many user ids appear in both the train and the test dataset?
train_user_ids = set(behaviors_train.userid.unique())
test_user_ids = set(behaviors_test.userid.unique())
len(train_user_ids.intersection(test_user_ids))

### Unique articles count

In [None]:
# unique articles per dataset
def _unique_article_ids(behaviors):
    """Return ``(history_ids, impression_ids)`` for one behaviours frame.

    Both are lists of distinct article ids in order of first appearance.
    Missing cells are dropped *before* splitting: calling ``.split()`` on a
    NaN raises, so dropping only after ``explode()`` is too late.
    """
    history_ids = (
        behaviors.history.dropna()
        .map(lambda cell: cell.split())
        .explode()
        .dropna()  # rows whose history split into an empty list
        .unique()
        .tolist()
    )
    impression_ids = (
        behaviors.impressions.dropna()
        # cut the trailing two characters off every impression token
        .map(lambda cell: [token[:-2] for token in cell.split()])
        .explode()
        .dropna()
        .unique()
        .tolist()
    )
    return history_ids, impression_ids


train_unq_hist_articles, train_uniq_imp_articles = _unique_article_ids(behaviors_train)
train_unq_articles = set(train_unq_hist_articles + train_uniq_imp_articles)

test_unq_hist_articles, test_uniq_imp_articles = _unique_article_ids(behaviors_test)
test_unq_articles = set(test_unq_hist_articles + test_uniq_imp_articles)

In [None]:
# sizes of the unique-article sets as a (train, test) pair
tuple(map(len, (train_unq_articles, test_unq_articles)))

In [None]:
# How many articles appear in both the train and the test dataset?
len(train_unq_articles.intersection(test_unq_articles))

### Unique categories count

In [None]:
# distinct tokens in history_all_categories as a (train, test) pair
tuple(
    frame.history_all_categories.map(lambda x: x.split()).explode().nunique()
    for frame in (behaviors_train, behaviors_test)
)

### Unique subcategories count

In [None]:
# distinct tokens in history_all_subcategories as a (train, test) pair
tuple(
    frame.history_all_subcategories.map(lambda x: x.split()).explode().nunique()
    for frame in (behaviors_train, behaviors_test)
)

In [None]:
def item_len(x):
    """Return the number of whitespace-separated tokens in ``x``.

    A missing value (``None`` / NaN, which is how pandas represents an empty
    cell) counts as zero tokens instead of raising ``AttributeError``.
    """
    if pd.isna(x):
        return 0
    return len(x.split())

In [None]:
# Compare the number of impressions per slate between the two datasets.
bins = range(50)
train_slate_sizes = behaviors_train["impressions"].apply(item_len)
test_slate_sizes = behaviors_test["impressions"].apply(item_len)

# Settings shared by both step curves.
slate_hist_kwargs = dict(bins=bins, cumulative=True, density=True, histtype="step")

ax = train_slate_sizes.plot.hist(
    figsize=(20, 4),
    title="Cumulative distribution of slates w.r.t. their size",
    **slate_hist_kwargs,
)
test_slate_sizes.plot.hist(ax=ax, **slate_hist_kwargs)

plt.legend(["Train dataset", "Test dataset"])
plt.xlabel("Slate size")
plt.show()

In [None]:
# Compare the number of slates per user between the two datasets.
bins = range(10)

ax = behaviors_train.groupby("userid")["slateid"].count().plot.hist(
    bins=bins,
    figsize=(20, 4),
    cumulative=True,
    density=True,
    histtype="step",
    title="Cumulative distribution of users w.r.t. slate count",
)

# Pass the axes explicitly, as the slate-size and clicks plots do, instead of
# depending on whichever axes happen to be current in pyplot.
behaviors_test.groupby("userid")["slateid"].count().plot.hist(
    bins=bins, ax=ax, cumulative=True, density=True, histtype="step"
)

plt.legend(["Train dataset", "Test dataset"])

plt.xlabel("Slate count")
plt.show()

In [None]:
# Compare the number of articles in the history column between the datasets.
bins = range(50)

ax = behaviors_train["history"].apply(item_len).plot.hist(
    bins=bins,
    figsize=(20, 4),
    cumulative=True,
    density=True,
    histtype="step",
    title="Cumulative distribution of history sizes",
)

# Draw the test curve on the same axes explicitly (consistent with the other
# comparison plots) rather than re-binding `ax` to an implicitly chosen one.
behaviors_test["history"].apply(item_len).plot.hist(
    bins=bins, ax=ax, cumulative=True, density=True, histtype="step"
)

plt.legend(["Train dataset", "Test dataset"])

plt.xlabel("History size")
plt.show()

In [None]:
def _clicked_articles(impressions):
    """Join the impression entries ending in "1", minus their last two chars.

    Presumably each entry looks like "<article id>-<0|1>" with 1 marking a
    click -- confirm against the dataset description.
    """
    return " ".join(
        entry[:-2] for entry in impressions.split(" ") if entry.endswith("1")
    )


# Add a "clicks" column to both frames.
behaviors_train["clicks"] = behaviors_train["impressions"].apply(_clicked_articles)
behaviors_test["clicks"] = behaviors_test["impressions"].apply(_clicked_articles)

In [None]:
# Compare the total number of clicks per user between the two datasets.
bins = range(10)


def _clicks_per_user(behaviors):
    """Sum the per-row token counts of the "clicks" column for every user."""
    with_counts = behaviors.assign(clicks_cnt=behaviors["clicks"].apply(item_len))
    return with_counts.groupby("userid")["clicks_cnt"].sum()


ax = _clicks_per_user(behaviors_train).plot.hist(
    bins=bins,
    figsize=(20, 4),
    density=True,
    cumulative=True,
    histtype="step",
    title="Cumulative distribution of users w.r.t. their clicks",
)
_clicks_per_user(behaviors_test).plot.hist(
    bins=bins, ax=ax, density=True, cumulative=True, histtype="step"
)

plt.legend(["Train dataset", "Test dataset"])
plt.xlabel("Clicks")
plt.show()

In [None]:
def compute_cat_hist(df, col):
    """Build a frequency table of the whitespace-separated tokens in ``col``.

    Only the first row of every ``userid`` is considered. The returned frame
    is indexed by token, has a ``cnt`` column (occurrences) and a ``prob``
    column (``cnt`` divided by the total), and is sorted by ``prob`` descending.
    """
    # one row per user, so users with several rows are not over-counted
    per_user_values = df.drop_duplicates(["userid"])[col]
    tokens = per_user_values.apply(lambda value: value.split()).explode()

    table = pd.Series(Counter(tokens)).to_frame("cnt")
    table["prob"] = table["cnt"] / table["cnt"].sum()

    return table.sort_values("prob", ascending=False)

def plot_cat_hist(df_train, df_test, col, title):
    """Show a plotly bar chart comparing token probabilities of ``col``.

    One bar series is drawn per dataset, using the ``prob`` column returned by
    ``compute_cat_hist``; ``title`` becomes the figure title.
    """
    traces = []
    for label, frame in (("Train dataset", df_train), ("Test dataset", df_test)):
        hist = compute_cat_hist(frame, col)
        traces.append(go.Bar(x=hist.index, y=hist["prob"], name=label))

    fig = go.Figure(data=traces)
    fig.update_layout(title_text=title)
    fig.show()

In [None]:
# Category distribution taken from the history_all_categories column.
plot_cat_hist(
    behaviors_train,
    behaviors_test,
    col="history_all_categories",
    title="Distribution of categories among users - test vs train",
)

In [None]:
# Subcategory distribution taken from the history_all_subcategories column.
plot_cat_hist(
    behaviors_train,
    behaviors_test,
    col="history_all_subcategories",
    title="Distribution of subcategories among users - test vs train",
)