# Exploratory Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import timedelta

## Loading data

In [None]:
# Read the semicolon-delimited transactions file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data/transactions_dataset.csv",
    sep=";",
)

## Exploring data

In [None]:
# Preview the first ten rows when the frame is ordered by "date_order".
df.sort_values(by="date_order").iloc[:10]

In [None]:
# Print the full per-column summary (verbose=True forces every column to be listed).
df.info(verbose=True)

In [None]:
# Summary statistics for the columns that describe() covers by default.
df.describe()

Let's check if there is missing data...

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean(axis=0)

In [None]:
# Fraction of rows whose net sales value is negative.
# Taking the mean of the boolean mask yields the share directly: no filtered
# copy of the frame is built, and an empty frame gives NaN instead of raising
# ZeroDivisionError.
(df["sales_net"] < 0).mean()

Let's have a look at the different columns individually...

In [None]:
# Bar chart of the number of rows per order channel.
df["order_channel"].value_counts().plot.bar()
plt.title("Distribution of order channels");

In [None]:
# Number of distinct branches present in the data.
df["branch_id"].nunique()

In [None]:
# Bar chart of the 30 branches with the most rows.
df["branch_id"].value_counts().iloc[:30].plot.bar()
plt.title("Distribution of top 30 branches");

In [None]:
# Number of distinct clients present in the data.
df["client_id"].nunique()

In [None]:
# Bar chart of the 30 clients with the most rows.
df["client_id"].value_counts().iloc[:30].plot.bar()
plt.title("Distribution of top 30 clients");

In [None]:
# Number of distinct products present in the data.
df["product_id"].nunique()

In [None]:
# Bar chart of the 30 products with the most rows.
df["product_id"].value_counts().iloc[:30].plot.bar()
plt.title("Distribution of top 30 products");

## Churn Analysis

In [None]:
# Convert the order date column to pandas datetimes so date arithmetic works below.
df["date_order"] = df["date_order"].pipe(pd.to_datetime)

In [None]:
# Number of rows per client, most frequent first.
df["client_id"].value_counts()

In [None]:
# Keep only rows with strictly positive net sales; copy so df_pos can be
# modified independently of df.
df_pos = df.loc[df["sales_net"].gt(0)].copy()

In [None]:
# Order rows by client, then by order date within each client
df_pos = df_pos.sort_values(by=["client_id", "date_order"])

In [None]:
# Keep one row per (order date, client) so several lines booked on the same
# date count as a single purchase. The explicit .copy() makes df_day an
# independent frame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning (df_pos is created with .copy() for the same reason).
df_day = df_pos.drop_duplicates(["date_order", "client_id"]).copy()

# Time difference between consecutive purchases of the same client.
# The first row of each client has no predecessor and therefore gets NaT.
# Assumes rows are already ordered by date within each client.
df_day["time_since_previous_purchase"] = df_day.groupby("client_id")[
    "date_order"
].diff()

In [None]:
# Mean gap over all consecutive purchase pairs, pooled across every client
# (NaT entries are skipped by mean()).
average_time_between_purchases = df_day.loc[:, "time_since_previous_purchase"].mean()
print(f"Average time between a customer's purchases: {average_time_between_purchases}")

In [None]:
# Per-client mean gap between purchases ...
time_to_buy = df_day.groupby("client_id")["time_since_previous_purchase"].mean()
# ... reduced to its whole-day component. Clients with no gap at all
# (a single purchase row in df_day) end up as NaN.
time_to_buy = time_to_buy.dt.days

In [None]:
# Client ids whose average gap between purchases exceeds 60 days.
# NaN gaps compare as False and are therefore not included.
churn_customers = time_to_buy.loc[time_to_buy.gt(60)].index

In [None]:
# Share of clients for which no average gap could be computed.
time_to_buy.isnull().mean()

In [None]:
# Histogram of the per-client average gap in days, using 100 bins.
time_to_buy.hist(bins=100)

In [None]:
# Cumulative share of clients by average gap between purchases.
# Result columns: "time_since_previous_purchase" (days) and "client_id", which
# holds the running fraction (0..1) of clients whose average gap is at most
# that many days. Clients with a NaN gap are left out by the groupby.
client_percent = (
    time_to_buy.reset_index()
    .groupby("time_since_previous_purchase")["client_id"]
    .count()
    .reset_index()
    .sort_values("time_since_previous_purchase")
    .assign(
        client_id=lambda counts: counts["client_id"].cumsum()
        / counts["client_id"].sum()
    )
)

In [None]:
# Cumulative share of clients (y) against the average gap in days (x).
# The plotted values are fractions between 0 and 1, not 0-100 percentages.
plt.plot("time_since_previous_purchase", "client_id", data=client_percent)
plt.xlabel("Avg. number of days since last purchase")
plt.ylabel("Percentage of clients")

In [None]:
# Build the revenue-by-average-gap tables.
#   revenue_sum     : cumulative revenue (absolute) per average gap in days
#   revenue_precent : the same curve expressed as a fraction of total revenue

# Attach each client's total positive revenue to its average purchase gap.
revenue_sum = time_to_buy.reset_index().merge(
    df_pos.groupby("client_id").sales_net.sum(),
    left_on="client_id",
    right_index=True,
    how="outer",
)

# Clients without an average gap (NaN) are placed at the largest observed gap
# so their revenue still counts at the far end of the curve.
# Series.max() skips NaN; the builtin max() is order-dependent when NaN is
# present and returns NaN if the first value is NaN, which would leave the
# gaps unfilled and let the groupby below silently drop those clients.
# Only the gap column is filled, so no other column can receive this value.
revenue_sum["time_since_previous_purchase"] = revenue_sum[
    "time_since_previous_purchase"
].fillna(revenue_sum["time_since_previous_purchase"].max())

# Revenue per gap value; groupby returns the gap values in ascending order,
# which is the order the cumulative sums below rely on.
revenue_sum = (
    revenue_sum.groupby("time_since_previous_purchase")
    .sales_net.sum()
    .reset_index()
)

revenue_precent = revenue_sum.copy()
revenue_precent["sales_net"] = (
    revenue_precent.sales_net.cumsum() / revenue_precent.sales_net.sum()
)
revenue_sum["sales_net"] = revenue_sum.sales_net.cumsum()

In [None]:
# Cumulative share of revenue (y) against the average gap in days (x).
# The plotted values are fractions between 0 and 1, not 0-100 percentages.
plt.plot(
    "time_since_previous_purchase",
    "sales_net",
    data=revenue_precent,
    label="Revenue",
)
plt.xlabel("Avg. number of days since last purchase")
plt.ylabel("Percentage of revenue")

In [None]:
# Combined figure: cumulative share of revenue and of clients against the
# average gap between purchases, with a marker line at 60 days. The figure is
# saved to ../results/avg_time_since_purchase.png.

# NOTE: rcParams.update changes the font size for every later figure as well.
plt.rcParams.update({"font.size": 18})
plt.figure(figsize=(15, 6))
plt.plot(
    "time_since_previous_purchase",
    "sales_net",
    data=revenue_precent,
    label="Revenue",
    color="#03522D",
)
plt.plot(
    "time_since_previous_purchase",
    "client_id",
    data=client_percent,
    label="Client",
    color="#29BA74",
)

# Height of the revenue curve at the 60-day mark, so the marker line ends on
# the curve. This replaces the literal 0.983666, which appears to have been
# copied from an earlier run and would go stale when the data changes.
# "<= 60" with max() equals the value at exactly 60 when that row exists
# (the cumulative share does not decrease) and still works when it does not.
revenue_share_at_60 = revenue_precent.loc[
    revenue_precent["time_since_previous_purchase"] <= 60, "sales_net"
].max()
plt.vlines(x=60, ymin=0, ymax=revenue_share_at_60, color="grey")

plt.xlabel("Avg. number of days since last purchase")
plt.ylabel("Percentage of revenue or client")
plt.legend()

# Make sure the output directory exists before writing the image.
Path("../results").mkdir(parents=True, exist_ok=True)
plt.savefig(
    Path("../results") / "avg_time_since_purchase.png", transparent=True
)

In [None]:
# Row of the cumulative client curve at an average gap of exactly 60 days.
client_percent.loc[client_percent["time_since_previous_purchase"].eq(60)]

In [None]:
# Estimated number of clients whose average gap between purchases exceeds
# 60 days. The cumulative client share at the 60-day mark is read from
# client_percent instead of the literal 0.757764, which appears to have been
# pasted from the lookup output and would go stale when the data changes.
# "<= 60" with max() equals the value at exactly 60 when that row exists
# (the cumulative share never decreases) and still works when it does not.
# NOTE(review): client_percent only covers clients with a defined average
# gap, while nunique() counts every client in df_pos; confirm this mix is
# intended.
(
    1
    - client_percent.loc[
        client_percent["time_since_previous_purchase"] <= 60, "client_id"
    ].max()
) * df_pos.client_id.nunique()

In [None]:
# Fraction of revenue that lies beyond the 60-day mark: one minus the
# cumulative revenue at 60 days relative to the largest cumulative value.
1 - revenue_sum.loc[
    revenue_sum["time_since_previous_purchase"].eq(60), "sales_net"
].div(revenue_sum["sales_net"].max())

In [None]:
# Absolute revenue that lies beyond the 60-day mark: largest cumulative
# revenue minus the cumulative revenue at 60 days.
revenue_sum["sales_net"].max() - revenue_sum.loc[
    revenue_sum["time_since_previous_purchase"].eq(60), "sales_net"
]