In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from dotenv import load_dotenv
import mlflow
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data Splits

In [15]:
# Unprocessed (unscaled) data
raw_train_df = pd.read_csv("train/splits/train_split.csv")
raw_test_df = pd.read_csv("train/splits/test_split.csv")
# Stored under a raw_* name (like the other raw splits) so that the processed
# validation split, which is loaded into `val_df`, does not overwrite it.
raw_val_df = pd.read_csv("train/splits/val_split.csv")

In [16]:
#processed data ready for ML
train_df = pd.read_csv("train/splits/train_processed.csv")
test_df = pd.read_csv("train/splits/test_processed.csv")
val_df = pd.read_csv("train/splits/val_processed.csv")

In [18]:
# Preview the first rows of the unprocessed (unscaled) training split
raw_train_df.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,num_25_mean,num_50_mean,num_50_min,num_50_max,...,membership_expire_day_of_week_mean,membership_expire_month_mean,membership_expire_month_count,last_transaction_date,is_churn,registration_year,registration_month,last_transaction_year,last_transaction_month,usage_period_days
0,P7W1hJVPC9+lZf5WLhskxKvQQEJJdpKkKVu4v7RA0zQ=,13.0,29.0,female,9.0,2004-03-26,2.86,0.0,0.0,0.0,...,3.0,4.0,1.0,2017-03-01,0.0,2004.0,3.0,2017.0,3.0,4723.0
1,yKDUvz1yO1xZ/fwG0bMyIa5K+w7+kIAc+qV3vZKYz9E=,22.0,36.0,female,9.0,2004-03-26,2.0,0.5,0.0,1.0,...,5.0,4.5,2.0,2017-03-31,0.0,2004.0,3.0,2017.0,3.0,4753.0
2,STy1UqGkW8U4LNr6usNMy5hpCt1fZs96eLFHmhnb0tE=,10.0,32.0,female,9.0,2004-03-26,3.0,0.5,0.0,1.0,...,6.0,4.0,1.0,2017-03-31,0.0,2004.0,3.0,2017.0,3.0,4753.0
3,35OTDI3ToY0rj1BtWTbDJ9YV8yDygsv1sNM08SQ7pQc=,22.0,33.0,female,9.0,2004-03-26,4.17,0.86,0.0,3.0,...,6.0,4.0,1.0,2017-03-31,0.0,2004.0,3.0,2017.0,3.0,4753.0
4,9CqDxOPBQwLzxCOywM3l763Yq4yFQhPhFFk79rPs5x0=,5.0,0.0,male,9.0,2004-03-26,4.11,3.83,0.0,19.0,...,0.0,4.0,1.0,2017-03-17,0.0,2004.0,3.0,2017.0,3.0,4739.0


In [19]:
# Preview the first rows of the processed training split
train_df.head()

Unnamed: 0,city,bd,gender,registered_via,num_25_mean,num_50_mean,num_50_min,num_50_max,num_50_std,num_75_mean,...,transaction_day_of_year_max,membership_expire_day_of_week_mean,membership_expire_month_mean,membership_expire_month_count,registration_year,registration_month,last_transaction_year,last_transaction_month,usage_period_days,is_churn
0,13.0,0.753517,-1,9.0,-0.370597,-0.832318,-0.099333,-0.669389,-0.776265,-0.802106,...,-0.654925,-0.149502,-0.180628,-0.166229,-2.975821,-1.001783,0.061397,-0.036571,3.109596,0.0
1,22.0,1.115001,-1,9.0,-0.49873,-0.5103,-0.099333,-0.569047,-0.481149,-0.973001,...,0.307792,0.842618,0.72069,0.488523,-2.975821,-1.001783,0.061397,-0.036571,3.137803,0.0
2,10.0,0.908439,-1,9.0,-0.349738,-0.5103,-0.099333,-0.569047,-0.535184,-0.973001,...,0.307792,1.338677,-0.180628,-0.166229,-2.975821,-1.001783,0.061397,-0.036571,3.137803,0.0
3,22.0,0.96008,-1,9.0,-0.175418,-0.278446,-0.099333,-0.368361,-0.335669,-0.301627,...,0.307792,1.338677,-0.180628,-0.166229,-2.975821,-1.001783,0.061397,-0.036571,3.137803,0.0
4,5.0,-0.744061,1,9.0,-0.184357,1.634343,-0.099333,1.237121,1.622076,2.420491,...,-0.141476,-1.637681,-0.180628,-0.166229,-2.975821,-1.001783,0.061397,-0.036571,3.12464,0.0


## Evaluation of Churn Predictions

To define a business metric for the performance of this model, I assume the following:
1) If we know that a customer is about to leave our service, we will offer them a promo of an **80% discount for 1 year**,
   which costs us about **120 USD**.

2) We really care about those who have already been using our service for a long time, so we want to be **accurate** in our predictions about **long-term clients**. Let's assume that if we lose such a client, **we lose about 1/4 of their LTV**.


### Business metric

To evaluate the model, I define the following LOSS formula.
We weight misclassifications according to the potential expenses of our company.


$$
\begin{aligned}
&\text{Let's define the following variables:} \\
&\hat{y} : \text{Predicted churn labels (1 for churn, 0 for no churn)} \\
&y : \text{Actual churn labels (1 for churn, 0 for no churn)} \\
&LTV : \text{Lifetime value of a customer} \\
&C_{\text{promo}} : \text{Cost of the promo offer (\$120)} \\
&\alpha : \text{Fraction of LTV lost if a long-term client churns (0.25)} \\
&X_{\text{long}} : \text{Indicator function for long-term clients} \\
\\
&\text{The business loss formula can be written as follows:} \\
&\text{Business Loss} = \sum_{i=1}^{n} \left( \hat{y}_i (1 - y_i) C_{\text{promo}} + y_i (1 - \hat{y}_i) \alpha \cdot LTV_i \cdot X_{\text{long}, i} \right) \\
\\
&\text{This formula comprises two components:} \\
&\text{1. False Positives Cost: } \hat{y}_i (1 - y_i) C_{\text{promo}} \\
&\quad \text{When we predict a customer will churn (} \hat{y}_i = 1 \text{) but they do not churn (} y_i = 0 \text{), we incur the cost of the promo offer (} C_{\text{promo}} \text{).} \\
&\text{2. False Negatives Cost: } y_i (1 - \hat{y}_i) \alpha \cdot LTV_i \cdot X_{\text{long}, i} \\
&\quad \text{When we fail to predict a churn (} \hat{y}_i = 0 \text{) but the customer churns (} y_i = 1 \text{), we lose a fraction (} \alpha \text{) of the customer's LTV if they are a long-term client (} X_{\text{long}, i} = 1 \text{).} \\
\\
&\text{To further clarify:} \\
&\hat{y}_i (1 - y_i) \text{ identifies false positives (cases where the model incorrectly predicts churn).} \\
&y_i (1 - \hat{y}_i) \text{ identifies false negatives (cases where the model fails to predict churn).}
\end{aligned}
$$


In [28]:
# Promo expense per client: 80% of the mean of the "actual_amount_paid_mean"
# column in the raw training split.
promo_cost = 0.8 * raw_train_df["actual_amount_paid_mean"].mean()
print(
    "Expenses on Promo for client: ",
    np.round(promo_cost, 2),
)

Expenses on Promo for client:  119.77


In [30]:
def calculate_business_loss(y_pred, y_test, LTV, promo_cost=120, alpha=0.25, is_long_term=None):
    """
    Calculate the business loss based on predictions and actual values.

    The loss is the sum of two misclassification costs:
      * false positives (churn predicted, customer did not churn):
        ``promo_cost`` per customer;
      * false negatives (churn not predicted, customer churned):
        ``alpha * LTV`` per customer, multiplied by the ``is_long_term``
        indicator when one is given.

    All inputs are converted to flat NumPy arrays and matched by position,
    so pandas Series with a non-default index (e.g. after a shuffle or a
    split) are handled correctly instead of being looked up by label.

    Args:
        y_pred (array-like): Predicted churn labels (1 for churn, 0 for no churn).
        y_test (array-like): Actual churn labels (1 for churn, 0 for no churn).
        LTV (array-like): Lifetime values of customers.
        promo_cost (float): Cost of the promo offer.
        alpha (float): Fraction of LTV lost if a long-term client churns.
        is_long_term (array-like, optional): Per-customer indicator
            (1 for long-term client, 0 otherwise). When None (default),
            every customer counts towards the false-negative cost.

    Returns:
        float: Total business loss.

    Raises:
        ValueError: If the inputs do not all have the same number of elements.
    """
    # np.asarray drops any pandas index, so everything below is positional.
    predicted = np.asarray(y_pred, dtype=float).ravel()
    actual = np.asarray(y_test, dtype=float).ravel()
    ltv = np.asarray(LTV, dtype=float).ravel()
    if is_long_term is None:
        # No indicator supplied: weight every customer with 1.
        long_term = np.ones_like(ltv)
    else:
        long_term = np.asarray(is_long_term, dtype=float).ravel()

    if not (predicted.shape == actual.shape == ltv.shape == long_term.shape):
        raise ValueError(
            "y_pred, y_test, LTV and is_long_term must have the same length; got "
            f"{predicted.size}, {actual.size}, {ltv.size} and {long_term.size}"
        )

    # predicted * (1 - actual) is 1 only for false positives.
    false_positives_cost = np.sum(predicted * (1 - actual)) * promo_cost
    # actual * (1 - predicted) is 1 only for false negatives.
    false_negatives_cost = np.sum(actual * (1 - predicted) * alpha * ltv * long_term)

    return float(false_positives_cost + false_negatives_cost)