# Logistic Regression

Find out whether a user is going to churn within the next 120 days using Logistic Regression.

In [97]:
import pandas as pd
from datetime import timedelta

## Load Dataset

In [98]:
df_fct = (
    pd.read_csv('data/fct_sales.csv')
    .assign(
        purchase_timestamp=lambda x: pd.to_datetime(x['purchase_timestamp'], format='%Y-%m-%d %H:%M:%S'),
        invoice_id=lambda x: x['invoice_id'].astype('str'),
        customer_id=lambda x: x['customer_id'].astype('str'),
        date_id=lambda x: pd.to_datetime(x['date_id'])
    )
)
df_fct.info()
display(df_fct.sample(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779495 entries, 0 to 779494
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   invoice_id          779495 non-null  object        
 1   product_id          779495 non-null  object        
 2   customer_id         779495 non-null  object        
 3   date_id             779495 non-null  datetime64[ns]
 4   order_amt           779495 non-null  int64         
 5   product_price       779495 non-null  float64       
 6   purchase_timestamp  779495 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(1), object(3)
memory usage: 41.6+ MB


Unnamed: 0,invoice_id,product_id,customer_id,date_id,order_amt,product_price,purchase_timestamp
592693,562890,47566,14191,2011-08-10,4,4.95,2011-08-10 12:11:00
186846,513570,20971,18168,2010-06-25,8,1.25,2010-06-25 14:46:00
103019,502895,20695,17448,2010-03-29,3,3.75,2010-03-29 11:49:00
378220,535409,22459,14507,2010-11-26,1,2.55,2010-11-26 11:24:00
169349,511328,22382,13178,2010-06-08,20,1.65,2010-06-08 10:06:00
494445,550350,22669,15491,2011-04-18,5,2.95,2011-04-18 10:04:00
778457,581437,23375,14621,2011-12-08,10,0.82,2011-12-08 16:22:00
153129,509378,22385,12389,2010-05-21,10,1.95,2010-05-21 15:12:00
210362,516613,85150,13426,2010-07-21,6,2.55,2010-07-21 14:48:00
387489,536446,84347,15983,2010-12-01,12,2.55,2010-12-01 12:15:00


In [99]:
df_customer = (
    pd.read_csv('data/dim_customer.csv')
)
df_customer.info()
display(df_customer.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5894 entries, 0 to 5893
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       5894 non-null   int64 
 1   customer_country  5894 non-null   object
dtypes: int64(1), object(1)
memory usage: 92.2+ KB


Unnamed: 0,customer_id,customer_country
0,13085,United Kingdom
1,13078,United Kingdom
2,15362,United Kingdom
3,18102,United Kingdom
4,12682,France


In [100]:
df_product = (
    pd.read_csv('data/dim_product.csv')
)
df_product.info()
display(df_product.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5315 entries, 0 to 5314
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           5315 non-null   object
 1   product_description  5315 non-null   object
dtypes: object(2)
memory usage: 83.2+ KB


Unnamed: 0,product_id,product_description
0,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS
1,79323P,PINK CHERRY LIGHTS
2,79323W,WHITE CHERRY LIGHTS
3,22041,"RECORD FRAME 7"" SINGLE SIZE"
4,21232,STRAWBERRY CERAMIC TRINKET BOX


In [101]:
df_date = (
    pd.read_csv('data/dim_date.csv')
)
df_date.info()
display(df_date.sample(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739 entries, 0 to 738
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date_id      739 non-null    object
 1   year         739 non-null    int64 
 2   month        739 non-null    int64 
 3   day          739 non-null    int64 
 4   day_of_week  739 non-null    int64 
 5   is_weekend   739 non-null    bool  
 6   quarter      739 non-null    int64 
dtypes: bool(1), int64(5), object(1)
memory usage: 35.5+ KB


Unnamed: 0,date_id,year,month,day,day_of_week,is_weekend,quarter
146,2010-04-26,2010,4,26,0,False,2
121,2010-04-01,2010,4,1,3,False,2
190,2010-06-09,2010,6,9,2,False,2
643,2011-09-05,2011,9,5,0,False,3
218,2010-07-07,2010,7,7,2,False,3
125,2010-04-05,2010,4,5,0,False,2
119,2010-03-30,2010,3,30,1,False,1
429,2011-02-03,2011,2,3,3,False,1
329,2010-10-26,2010,10,26,1,False,4
610,2011-08-03,2011,8,3,2,False,3


In [102]:
df_churn = (
    pd.read_csv('data/result_churn.csv')
    .assign(
        customer_id=lambda x: x['customer_id'].astype('str')
    )
)
df_churn.info()
display(df_churn)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               5881 non-null   object
 1   last_purchase_date        5881 non-null   object
 2   days_since_last_purchase  5881 non-null   int64 
 3   is_churned                5881 non-null   bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 143.7+ KB


Unnamed: 0,customer_id,last_purchase_date,days_since_last_purchase,is_churned
0,12346,2011-01-18,326,True
1,12347,2011-12-07,3,False
2,12348,2011-09-25,76,False
3,12349,2011-11-21,19,False
4,12350,2011-02-02,311,True
...,...,...,...,...
5876,18283,2011-12-06,4,False
5877,18284,2010-10-04,432,True
5878,18285,2010-02-17,661,True
5879,18286,2010-08-20,477,True


In [103]:
df_rfm = (
    pd.read_csv('data/result_rfm.csv')
    .assign(
        customer_id=lambda x: x['customer_id'].astype('str'),
        last_purchase_date=lambda x: pd.to_datetime(x['last_purchase_date'])
    )
)
df_rfm.info()
display(df_rfm)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   customer_id         5881 non-null   object        
 1   last_purchase_date  5881 non-null   datetime64[ns]
 2   frequency           5881 non-null   int64         
 3   monetary            5881 non-null   float64       
 4   recency             5881 non-null   int64         
 5   r_rank              5881 non-null   int64         
 6   f_rank              5881 non-null   int64         
 7   m_rank              5881 non-null   int64         
 8   rfm_rank            5881 non-null   int64         
 9   segment             5881 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(6), object(2)
memory usage: 459.6+ KB


Unnamed: 0,customer_id,last_purchase_date,frequency,monetary,recency,r_rank,f_rank,m_rank,rfm_rank,segment
0,12346,2011-01-18,12,77556.46,326,3,1,1,2,Mid-value Customers
1,12347,2011-12-07,8,4921.53,3,1,1,1,1,High-value Customers
2,12348,2011-09-25,5,2019.40,76,2,2,1,2,Mid-value Customers
3,12349,2011-11-21,4,4428.69,19,1,2,1,1,High-value Customers
4,12350,2011-02-02,1,334.40,311,3,3,3,3,Low-value Customers
...,...,...,...,...,...,...,...,...,...,...
5876,18283,2011-12-06,22,2664.90,4,1,1,1,1,High-value Customers
5877,18284,2010-10-04,1,461.68,432,3,3,2,3,Low-value Customers
5878,18285,2010-02-17,1,427.00,661,3,3,3,3,Low-value Customers
5879,18286,2010-08-20,2,1296.43,477,3,2,2,2,Mid-value Customers


## Feature Engineering

In [104]:
df = (
    df_rfm
    .merge(df_churn[['customer_id', 'days_since_last_purchase', 'is_churned']], on='customer_id')
    .merge(
        (
            df_fct
            .groupby('customer_id', as_index=False)
            .agg(
                first_day=('date_id', 'min')
            )
        ),
        on='customer_id'
    )
    .assign(
        customer_age_in_days=lambda x: (
            (x['last_purchase_date'].max() + timedelta(days=1)) -
            (x['first_day'])
        ).dt.days
    )
)

df

Unnamed: 0,customer_id,last_purchase_date,frequency,monetary,recency,r_rank,f_rank,m_rank,rfm_rank,segment,days_since_last_purchase,is_churned,first_day,customer_age_in_days
0,12346,2011-01-18,12,77556.46,326,3,1,1,2,Mid-value Customers,326,True,2009-12-14,726
1,12347,2011-12-07,8,4921.53,3,1,1,1,1,High-value Customers,3,False,2010-10-31,405
2,12348,2011-09-25,5,2019.40,76,2,2,1,2,Mid-value Customers,76,False,2010-09-27,439
3,12349,2011-11-21,4,4428.69,19,1,2,1,1,High-value Customers,19,False,2010-04-29,590
4,12350,2011-02-02,1,334.40,311,3,3,3,3,Low-value Customers,311,True,2011-02-02,311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5876,18283,2011-12-06,22,2664.90,4,1,1,1,1,High-value Customers,4,False,2010-02-19,659
5877,18284,2010-10-04,1,461.68,432,3,3,2,3,Low-value Customers,432,True,2010-10-04,432
5878,18285,2010-02-17,1,427.00,661,3,3,3,3,Low-value Customers,661,True,2010-02-17,661
5879,18286,2010-08-20,2,1296.43,477,3,2,2,2,Mid-value Customers,477,True,2009-12-16,724


## Preprocessing

In [105]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [106]:
df_preprocessing = (
    df
    .assign(
        is_churned=lambda x: x['is_churned'].astype('int')
    )
)

X = (
    df_preprocessing
    .loc[:, ['frequency', 'monetary', 'customer_age_in_days']]
    .to_numpy()
)

y = (
    df_preprocessing
    .loc[:, ['is_churned']]
    .to_numpy()
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, train_size=0.75, random_state=31
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)

## Training

In [107]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=41).fit(X_train, y_train)
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


## Test and Evaluation

In [108]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.77      0.81       802
           1       0.75      0.82      0.79       669

    accuracy                           0.80      1471
   macro avg       0.80      0.80      0.80      1471
weighted avg       0.80      0.80      0.80      1471

