In [None]:
import pandas as pd
from datetime import timedelta

In [None]:
# Load the raw transactions table; the CSV is semicolon-delimited.
df = pd.read_csv("../data/transactions_dataset.csv", sep=";")

In [None]:
# Parse the order date into datetimes so it can be compared against the
# timestamp cut-offs computed in later cells.
df["date_order"] = pd.to_datetime(df["date_order"])

In [None]:
# Length, in days, of each time window; it is passed to timedelta(days=...)
# when deriving test_stamp and train_stamp in later cells.
threshold = 60

In [None]:
# Every distinct client id that appears in the raw transactions.
all_clients = df["client_id"].unique()

In [None]:
# Modelling frame: keep only rows with positive net sales, one-hot encode
# the order channel, count each row as one order, and drop the columns that
# are not aggregated later.
positive_sales = df.loc[df["sales_net"] > 0]
channel_dummies = pd.get_dummies(positive_sales["order_channel"], dtype=int)
df_ml = (
    pd.concat([positive_sales, channel_dummies], axis=1)
    .assign(nr_orders=1)
    .drop(columns=["date_invoice", "branch_id", "order_channel", "product_id"])
)

In [None]:
# One row per (order date, client): sum every remaining column.
df_sum = df_ml.groupby(["date_order", "client_id"], as_index=False).sum()

In [None]:
# The test label window starts `threshold` days before the latest order date.
last_order_date = df_sum["date_order"].max()
test_stamp = last_order_date - timedelta(days=threshold)

In [None]:
# Clients with at least one aggregated order on or after test_stamp.
in_label_window = df_sum["date_order"] >= test_stamp
customers_test = df_sum.loc[in_label_window, "client_id"].unique()

In [None]:
# Test labels: a client is flagged as churned when it does not appear among
# the clients that ordered on or after test_stamp.
y_test = pd.DataFrame({"client_id": all_clients}).assign(
    churn=lambda frame: ~frame["client_id"].isin(customers_test)
)

In [None]:
# The test feature window is the `threshold` days immediately before
# test_stamp: train_stamp <= date_order < test_stamp.
train_stamp = test_stamp - timedelta(days=threshold)
in_feature_window = df_sum["date_order"].ge(train_stamp) & df_sum[
    "date_order"
].lt(test_stamp)
X_test = df_sum.loc[in_feature_window]

In [None]:
# Collapse the window to one row per client by summing every column except
# the order date.
X_test = X_test.drop(columns="date_order").groupby("client_id").sum()

In [None]:
# Preview the first rows only rather than rendering the whole aggregated frame.
df_sum.head()

In [None]:
# Per-client totals over every aggregated order placed before test_stamp
# (the feature window itself is included in these totals).
history_test = df_sum.loc[
    df_sum["date_order"] < test_stamp,
    ["client_id", "sales_net", "quantity", "nr_orders"],
]
all_buys_test = history_test.groupby("client_id").sum()
all_buys_test.columns = ["perc_sales_net", "perc_quantity", "perc_nr_orders"]

# Attach the totals, then overwrite each perc_* column with the ratio
# "amount in the feature window / total before test_stamp".
X_test = pd.merge(
    X_test, all_buys_test, left_index=True, right_index=True, how="left"
)
X_test = X_test.assign(
    perc_sales_net=lambda frame: frame["sales_net"] / frame["perc_sales_net"],
    perc_quantity=lambda frame: frame["quantity"] / frame["perc_quantity"],
    perc_nr_orders=lambda frame: frame["nr_orders"] / frame["perc_nr_orders"],
).drop(columns=["nr_orders"])

In [None]:
# Orders before test_stamp, ordered by client and then by order date.
avg_buy_time_test = df_sum.loc[df_sum["date_order"] < test_stamp].sort_values(
    ["client_id", "date_order"]
)
df_day = avg_buy_time_test.drop_duplicates(["date_order", "client_id"])

# Gap between each order date and the same client's previous order date.
df_day = df_day.assign(
    time_since_previous_purchase=df_day.groupby("client_id")["date_order"].diff()
)

# Mean gap per client, reduced to whole days via `.dt.days`.
mean_gap_test = df_day.groupby("client_id")["time_since_previous_purchase"].mean()
time_to_buy = mean_gap_test.dt.days

# Any value still missing after the left merge is replaced with 0.
X_test = pd.merge(
    X_test, time_to_buy, left_index=True, right_index=True, how="left"
).fillna(0)

In [None]:
# Join the test features (indexed by client id) with their churn labels.
test = pd.merge(
    X_test, y_test, left_index=True, right_on="client_id", how="left"
)

In [None]:
# Share of rows in the test set labelled as churned.
test["churn"].mean()

In [None]:
# Training labels: a client is flagged as churned when it has no aggregated
# order in the window that follows the training features,
# train_stamp <= date_order < test_stamp.
# The buyers are read straight from df_sum rather than from X_test.index, so
# this cell does not depend on the state the feature cells leave X_test in
# and mirrors how y_test is built from customers_test.
customers_train = df_sum.loc[
    (df_sum["date_order"] >= train_stamp) & (df_sum["date_order"] < test_stamp),
    "client_id",
].unique()

y_train = pd.DataFrame({"client_id": all_clients})
y_train["churn"] = ~y_train["client_id"].isin(customers_train)

In [None]:
# The training feature window is the `threshold` days immediately before
# train_stamp; collapse it to one row per client by summing every column
# except the order date.
train_window_start = train_stamp - timedelta(days=threshold)
in_train_window = df_sum["date_order"].ge(train_window_start) & df_sum[
    "date_order"
].lt(train_stamp)
X_train = (
    df_sum.loc[in_train_window]
    .drop(columns="date_order")
    .groupby("client_id")
    .sum()
)

In [None]:
# Per-client totals over every aggregated order placed before train_stamp
# (the training feature window itself is included in these totals).
history_train = df_sum.loc[
    df_sum["date_order"] < train_stamp,
    ["client_id", "sales_net", "quantity", "nr_orders"],
]
all_buys_train = history_train.groupby("client_id").sum()
all_buys_train.columns = ["perc_sales_net", "perc_quantity", "perc_nr_orders"]

# Attach the totals, then overwrite each perc_* column with the ratio
# "amount in the feature window / total before train_stamp".
X_train = pd.merge(
    X_train, all_buys_train, left_index=True, right_index=True, how="left"
)
X_train = X_train.assign(
    perc_sales_net=lambda frame: frame["sales_net"] / frame["perc_sales_net"],
    perc_quantity=lambda frame: frame["quantity"] / frame["perc_quantity"],
    perc_nr_orders=lambda frame: frame["nr_orders"] / frame["perc_nr_orders"],
).drop(columns=["nr_orders"])

In [None]:
# Orders before train_stamp, ordered by client and then by order date.
avg_buy_time_train = df_sum.loc[
    df_sum["date_order"] < train_stamp
].sort_values(["client_id", "date_order"])
df_day = avg_buy_time_train.drop_duplicates(["date_order", "client_id"])

# Gap between each order date and the same client's previous order date.
df_day = df_day.assign(
    time_since_previous_purchase=df_day.groupby("client_id")["date_order"].diff()
)

# Mean gap per client, reduced to whole days via `.dt.days`.
mean_gap_train = df_day.groupby("client_id")["time_since_previous_purchase"].mean()
time_to_buy = mean_gap_train.dt.days

# Any value still missing after the left merge is replaced with 0.
X_train = pd.merge(
    X_train, time_to_buy, left_index=True, right_index=True, how="left"
).fillna(0)

In [None]:
# Join the training features (indexed by client id) with their churn labels.
train = pd.merge(
    X_train, y_train, left_index=True, right_on="client_id", how="left"
)

In [None]:
# Share of rows in the training set labelled as churned.
train["churn"].mean()

In [None]:
# (rows, columns) of the assembled training frame, labels included.
train.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# The model sees every column except the client identifier and the label.
train_features = train.drop(columns=["client_id", "churn"])
train_labels = train["churn"]

# Alternative baseline: LogisticRegression(random_state=0) fitted on the same
# features and labels.
clf = RandomForestClassifier(random_state=0)
clf.fit(train_features, train_labels)

In [None]:
# Predict churn for the test window using the same feature columns as in
# training (client id and label removed).
test_features = test.drop(columns=["client_id", "churn"])
preds = clf.predict(test_features)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy of the hard predictions against the test labels.
balanced_accuracy_score(y_true=test["churn"], y_pred=preds)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve

# Confusion matrix of the test labels against the hard predictions.
cm = confusion_matrix(y_true=test["churn"], y_pred=preds)

In [None]:
# Render the confusion matrix; the trailing semicolon hides the repr of the
# object returned by plot().
disp = ConfusionMatrixDisplay(cm)
disp.plot();

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Score the test window with the probability the classifier assigns to the
# second entry of clf.classes_ (column 1 of predict_proba).
# NOTE(review): this is the churn=True class only if both classes were
# present in the training labels - confirm.
churn_scores = clf.predict_proba(test.drop(columns=["client_id", "churn"]))[
    :, 1
]
fpr, tpr, _ = roc_curve(test["churn"], churn_scores)

# ROC curve of the fitted model.
plt.plot(fpr, tpr, label="Model")

# Add a random prediction line (diagonal line) as the chance-level reference.
random_line = np.linspace(0, 1, num=100)
plt.plot(
    random_line,
    random_line,
    linestyle="--",
    label="Random Prediction Line",
    color="red",
)

plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("ROC curve")
# Without an explicit legend call the `label=` values above are never drawn.
plt.legend()
plt.show()

In [None]:
# Preview the first rows only rather than rendering the whole training frame.
train.head()

In [None]:
# Preview the first rows only rather than rendering the whole test frame.
test.head()