In [1]:
import numpy as np
import scipy as sp
from scipy import linalg
from scipy import optimize
from scipy import interpolate
import sympy as sm
from scipy.special import erfinv
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit, Probit, MNLogit
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)

In [21]:
#importing all datasets(!!please change the path before running the codes)
df_Calls = pd.read_csv('b. CARTIER_CALLS.csv')
df_Clienteling = pd.read_csv('c. CARTIER_CLIENTELING.csv')
df_Livechat = pd.read_csv('d. CARTIER_LIVECHAT.csv')
df_PrevSales = pd.read_csv('e. CARTIER_PREVIOUS_SALES.csv')
df_Sales = pd.read_csv('f. CARTIER_SALES.csv')
df_Wishlist = pd.read_csv('g. CARTIER_WISHLIST.csv')

df_AllSales = pd.read_csv('df_AllSalesLogit.csv') # This is the merged dataset that has been cleaned 

# df_AllSales has been transformed in the following way

    # TransactionCategory is only Sales
    # All Records before 2010 have been dropped
    # nb_days_since_last_sale "NAN" have been recoded to 0 if transactiondate = dateoffirstsale
    # dropped hierarchy variables
    # duplicates have been dropped
    # Calls, Clienteling, Wishlist, Livechat dummy variables have been created
        # =1 if ClientID is present in the corresponding dataset, =0 otherwise
        # We have to create a more accurate dummy that will be =1 if ClientID was contacted between his last sale
        # and the one before that
    #repurchase_long & repurchase_exact4 dummy variable created
        # repurchase_long =1 if nb_days_since_last_sale > 1460
        # repurchase_exact4 =1 if nb_days_since_last_sale is between [1400,1520]


In [6]:
# Looking at how many NAs we have is useful becasuse Logit model will drop all rows with missing values
# Here Wedding Date is missing on 1 407 946 observations (90% of total)

print("Total dataset size is :",df_AllSales.shape[0], " observations")
df_AllSales.isna().sum()

Total dataset size is : 1569700  observations


Unnamed: 0                       0
InvoiceHeader                 5582
Channel                          0
TransactionDate                  0
TransactionDate_FYYYY            0
TransactionCategory              0
ClientID                         0
AgeAtTransaction            638287
Gender                        1813
PersonBirthDate             528936
WeddingDate                1407946
SpokenLanguage                1334
WrittenLanguage               1334
FirstSalesDate                 445
FirstTransactionDate           245
ProductCategory               1177
ProductSubCategory            1177
ProductFunction               1177
Turnover                     74074
quantity                         0
seq_sales_trs                    0
nb_days_since_last_sale      17554
PurchasedMarketA                 0
PurchasedRegionA                 0
ResidencyRegionA                 0
ResidencyMarketA                 0
BoutiqueNameA               513011
ResidencyCountryA                6
nationalityA        

In [22]:
## Create Dummy variables (GENDER)

gender_grp = df_AllSales.groupby(['Gender']).count()
print(gender_grp)

# Here we have 103348 couples
# 811035 Female
# 626660 Male
# 26844 Unknown
#df_AllSales = df_AllSales.assign(Calls=df_AllSales.ClientID.isin(df_Calls.ClientID).astype(int))

#Pre-process "Gender" as the first dummy independent variable
dummy_Gender=pd.get_dummies(df_AllSales['Gender'],prefix='Gender')
print(dummy_Gender)
df_AllSales= pd.concat([df_AllSales, dummy_Gender], axis=1)


         Unnamed: 0  InvoiceHeader  Channel  TransactionDate  \
Gender                                                         
Couple       103348         103270   103348           103348   
Female       811035         807851   811035           811035   
Male         626660         624368   626660           626660   
Unknown       26844          26843    26844            26844   

         TransactionDate_FYYYY  TransactionCategory  ClientID  \
Gender                                                          
Couple                  103348               103348    103348   
Female                  811035               811035    811035   
Male                    626660               626660    626660   
Unknown                  26844                26844     26844   

         AgeAtTransaction  PersonBirthDate  WeddingDate  SpokenLanguage  \
Gender                                                                    
Couple              66191            78083        23871          103345   

In [23]:
#Pre-process "AgeAtTransaction" as a serial independent variable
#data2= data2.dropna(subset=['AgeAtTransaction'])
#data2['AgeAtTransaction']=data2['AgeAtTransaction'].astype(int)
#data2.head()

#Pre-process "Channel" as the second dummy indepedent variable.
dummy_Channel=pd.get_dummies(df_AllSales['Channel'],prefix='Channel')
df_AllSales = pd.concat([df_AllSales, dummy_Channel], axis=1)

df_AllSales.head

<bound method NDFrame.head of          Unnamed: 0                            InvoiceHeader    Channel  \
0            122626                                      NaN   Boutique   
1            134321                                      NaN   Boutique   
2            125277                                      NaN   Boutique   
3             22358  302109220925007-GEMINI-/BIC/AZRIRBARP00   Boutique   
4            110727                                      NaN   Boutique   
...             ...                                      ...        ...   
1569695      357224  S34602000003263-GEMINI-/BIC/AZRIRBAAP00   Boutique   
1569696     1468692     171611808-GEMINIB2B-/BIC/AZ4CRME0100  Wholesale   
1569697       92232     166727100-GEMINIB2B-/BIC/AZ4CRME0400  Wholesale   
1569698      350821       XCN1027212-GEMINI-/BIC/AZRIRBAAP00   Boutique   
1569699      252156  S21003000004080-GEMINI-/BIC/AZRIRBAAP00   Boutique   

        TransactionDate  TransactionDate_FYYYY TransactionCategory  \

In [None]:
#Make variables calls, livechat, clienteling, and wishlist into dummy variables.
#dummy_Calls=pd.get_dummies(data2['Calls'],prefix='Calls')
#data2['Calls_yes']=dummy_Calls['Calls_yes']
#dummy_LiveChat=pd.get_dummies(data2['LiveChat'],prefix='LiveChat')
#data2['LiveChat_yes']=dummy_LiveChat['LiveChat_yes']
#dummy_Wishlist=pd.get_dummies(data2['Wishlist'],prefix='Wishlist')
#data2['Wishlist_yes']=dummy_Wishlist['Wishlist_yes']
#dummy_Clienteling=pd.get_dummies(data2['Clienteling'],prefix='Clienteling')
#data2['Clienteling_yes']=dummy_Clienteling['Clienteling_yes']
#Here 1 means the interaction via calls, livechat or clitenteling exists before the transaction, or the creation of wishlist exists

In [25]:
# Create birthday_occasion dummy variable -> we might want to adjust the range that we consider birthday occasion to <60

YEAR_IN_DAYS = 365.25
TWO_MONTHS_IN_DAYS = 60

# Function to apply to every row
def Birthday_Occasion(b, t):
    delta=(t - b).days % YEAR_IN_DAYS
    return int(delta == 0 or YEAR_IN_DAYS - delta <= TWO_MONTHS_IN_DAYS)


# Convert columns into date objects
df_AllSales['TransactionDate'] = pd.to_datetime(df_AllSales['TransactionDate'], format='%Y-%m-%d')
df_AllSales['PersonBirthDate'] = pd.to_datetime(df_AllSales['PersonBirthDate'], format='%Y-%m-%d')

# Apply function on row and save dataset
df_AllSales['Birthday_Occasion'] = df_AllSales.apply(lambda x: Birthday_Occasion(x.PersonBirthDate, x.TransactionDate), axis=1)

In [26]:
# Create wedding_occasion dummy variable -> we might want to adjust the range that we consider wedding occasion to <60

# Function to apply to every row
def Wedding_Occasion(b, t):
    delta=(t - b).days % YEAR_IN_DAYS
    return int(delta == 0 or YEAR_IN_DAYS - delta <= TWO_MONTHS_IN_DAYS)


# Convert columns into date objects
#df_AllSalesDate['WeddingDate'] = pd.to_datetime(df_AllSalesDate['WeddingDate'], format='%Y-%m-%d')
df_AllSales['WeddingDate']= pd.to_datetime(df_AllSales['WeddingDate'],errors = 'coerce')

#Delete unreasonable dates
df_AllSales.loc[df_AllSales['PersonBirthDate'].dt.year <= 1900, 'PersonBirthDate']=np.nan
df_AllSales.loc[df_AllSales['WeddingDate'].dt.year >= 2030, 'WeddingDate']=np.nan
df_AllSales.loc[df_AllSales['WeddingDate'].dt.year <= 1900, 'WeddingDate']=np.nan

# Apply function on row and save dataset
df_AllSales['Wedding_Occasion'] = df_AllSales.apply(lambda x: Wedding_Occasion(x.WeddingDate, x.TransactionDate), axis=1)

## This is the code to Group the dataset by ClientID
## We will do that once we've further trimmed down the dataset to the 2 last transactions

In [43]:
df_AllSalesClient = df_AllSales.groupby(['ClientID'])[['repurchase_long','repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Birthday_Occasion','Wedding_Occasion', 'Turnover', 'PersonBirthDate', 'WeddingDate','nb_days_since_last_sale']].max()  

In [42]:
print(df_AllSalesClient.head(5))

                    repurchase_long  repurchase_exact4  Gender_Female  \
ClientID                                                                
0011i00000UNT9LAAX                1                  0              0   
0011i00000UNTHbAAP                0                  0              0   
0011i00000UNTM0AAP                0                  0              0   
0011i00000UNTMyAAP                1                  0              0   
0011i00000UNTXwAAP                0                  0              1   

                    Gender_Male  Gender_Couple  Channel_Boutique  Channel_Web  \
ClientID                                                                        
0011i00000UNT9LAAX            1              0                 1            0   
0011i00000UNTHbAAP            1              0                 1            0   
0011i00000UNTM0AAP            1              0                 1            0   
0011i00000UNTMyAAP            1              0                 1            0   
00

## Logit Regression Transaction Level with repurchase_long

In [46]:
#add constant
df_AllSales['intercept']=1.0
df_AllSalesClient['intercept']=1.0

In [30]:
Model1=df_AllSales[['repurchase_long','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept']]

#Since there are too many missing values in wedding date and birth date, we build up two models, one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model1=Model1.dropna()
Inde_var1=Model1.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept']]
#fit the logit model
logit1=sm.Logit(Model1['repurchase_long'],Inde_var1)
result1=logit1.fit()
print(result1.summary())

Optimization terminated successfully.
         Current function value: 0.048321
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:              1495626
Model:                          Logit   Df Residuals:                  1495614
Method:                           MLE   Df Model:                           11
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.003217
Time:                        15:09:11   Log-Likelihood:                -72270.
converged:                       True   LL-Null:                       -72503.
Covariance Type:            nonrobust   LLR p-value:                 4.394e-93
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Gender_Female        0.1210      0.071      1.698      0.090      -0.019       0.261
Gender_Mal

## Logit Regression Transaction Level with repurchase_exact4

In [None]:
Model2=df_AllSales[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept', 'nb_days_since_last_sale']]

#Since there are too many missing values in wedding date and birth date, we build up two models, one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model2=Model2.dropna()
#fit the logit model
logit2=sm.Logit(Model2['repurchase_exact4'],Inde_var1)
result2=logit2.fit()
print(result2.summary())

## Logit regression with Birthday_Occasion 

In [35]:
Model3=df_AllSales[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept','Birthday_Occasion',
                    'PersonBirthDate', 'nb_days_since_last_sale']]

#Since there are too many missing values in wedding date and birth date, we build up two models, one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model3=Model3.dropna()
Inde_var3=Model3.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept','Birthday_Occasion']]
#fit the logit model
logit3=sm.Logit(Model3['repurchase_exact4'],Inde_var3)
result3=logit3.fit()
print(result3.summary())

Optimization terminated successfully.
         Current function value: 0.050486
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:               904939
Model:                          Logit   Df Residuals:                   904926
Method:                           MLE   Df Model:                           12
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.001877
Time:                        15:20:22   Log-Likelihood:                -45687.
converged:                       True   LL-Null:                       -45773.
Covariance Type:            nonrobust   LLR p-value:                 1.994e-30
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Gender_Female         0.1149      0.112      1.025      0.305      -0.105       0.335
Gender_

## Logit regression with Wedding_Occasion

In [36]:
Model4=df_AllSales[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept', 
                    'nb_days_since_last_sale', 'WeddingDate','Wedding_Occasion']]

#Since there are too many missing values in wedding date and birth date, we build up two models, one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model4=Model4.dropna()
Inde_var3=Model4.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept','Wedding_Occasion']]
#fit the logit model
logit4=sm.Logit(Model4['repurchase_exact4'],Inde_var3)
result4=logit4.fit()
print(result4.summary())

Optimization terminated successfully.
         Current function value: 0.048659
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:               124556
Model:                          Logit   Df Residuals:                   124543
Method:                           MLE   Df Model:                           12
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.006958
Time:                        15:33:54   Log-Likelihood:                -6060.7
converged:                       True   LL-Null:                       -6103.2
Covariance Type:            nonrobust   LLR p-value:                 4.692e-13
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Gender_Female       -0.3112      0.215     -1.446      0.148      -0.733       0.111
Gender_Mal

## Logit regression with Both occasions

In [38]:
Model5=df_AllSales[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept','Birthday_Occasion',
                    'PersonBirthDate','Wedding_Occasion', 'WeddingDate','nb_days_since_last_sale']]

#Since there are too many missing values in wedding date and birth date, we build up two models,
#one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model5=Model5.dropna()
Inde_var4=Model5.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept','Wedding_Occasion', 'Birthday_Occasion']]
#fit the logit model
logit5=sm.Logit(Model5['repurchase_exact4'],Inde_var4)
result5=logit5.fit()
print(result5.summary())

Optimization terminated successfully.
         Current function value: 0.048981
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:               116884
Model:                          Logit   Df Residuals:                   116870
Method:                           MLE   Df Model:                           13
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.007168
Time:                        15:36:56   Log-Likelihood:                -5725.1
converged:                       True   LL-Null:                       -5766.4
Covariance Type:            nonrobust   LLR p-value:                 3.461e-12
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Gender_Female        -0.3237      0.220     -1.470      0.142      -0.755       0.108
Gender_

## Client Level Logit model without occasions

In [51]:
Model7=df_AllSalesClient[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept','nb_days_since_last_sale']]

#Since there are too many missing values in wedding date and birth date, we build up two models, one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model7=Model7.dropna()
Inde_var4=Model7.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept']]
#fit the logit model
logit7=sm.Logit(Model7['repurchase_exact4'],Inde_var4)
result7=logit7.fit()
print(result7.summary())

Optimization terminated successfully.
         Current function value: 0.125033
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:               341661
Model:                          Logit   Df Residuals:                   341649
Method:                           MLE   Df Model:                           11
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.006747
Time:                        16:12:10   Log-Likelihood:                -42719.
converged:                       True   LL-Null:                       -43009.
Covariance Type:            nonrobust   LLR p-value:                2.172e-117
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Gender_Female        0.2318      0.084      2.774      0.006       0.068       0.396
Gender_Male

## Client Level Logit model with occasions

In [52]:
Model6=df_AllSalesClient[['repurchase_exact4','Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web',
                    'Channel_CC','Calls','Livechat','Clienteling','Wishlist','Turnover','intercept','Birthday_Occasion',
                    'PersonBirthDate','Wedding_Occasion', 'WeddingDate','nb_days_since_last_sale']]

#Since there are too many missing values in wedding date and birth date, we build up two models,
#one with them and one without.
#extract Indepedent variables into a dataframe for the model without wedding date
Model6=Model6.dropna()
Inde_var4=Model6.loc[:,['Gender_Female','Gender_Male','Gender_Couple','Channel_Boutique','Channel_Web','Channel_CC',
                        'Calls','Clienteling','Livechat','Wishlist','Turnover','intercept','Wedding_Occasion', 'Birthday_Occasion']]
#fit the logit model
logit6=sm.Logit(Model6['repurchase_exact4'],Inde_var4)
result6=logit6.fit()
print(result6.summary())

Optimization terminated successfully.
         Current function value: 0.124845
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:      repurchase_exact4   No. Observations:                28620
Model:                          Logit   Df Residuals:                    28606
Method:                           MLE   Df Model:                           13
Date:                Fri, 06 Jan 2023   Pseudo R-squ.:                0.001735
Time:                        16:12:17   Log-Likelihood:                -3573.1
converged:                       True   LL-Null:                       -3579.3
Covariance Type:            nonrobust   LLR p-value:                    0.4937
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Gender_Female         0.1854      0.297      0.624      0.532      -0.397       0.768
Gender_M

## Descriptive statistics to give overall portrait

In [58]:
df_AllSales.groupby(['repurchase_exact4','Wedding_Occasion'])['ClientID'].count()

repurchase_exact4  Wedding_Occasion
0                  0                   1527527
                   1                     29159
1                  0                     12697
                   1                       317
Name: ClientID, dtype: int64

In [67]:
df_AllSales.groupby(['repurchase_exact4','Wedding_Occasion'])['InvoiceHeader'].count()

repurchase_exact4  Wedding_Occasion
0                  0                   1522089
                   1                     29045
1                  0                     12668
                   1                       316
Name: InvoiceHeader, dtype: int64

In [59]:
df_AllSales.groupby(['repurchase_long','Wedding_Occasion'])['ClientID'].count()

repurchase_long  Wedding_Occasion
0                0                   1433915
                 1                     27391
1                0                    106309
                 1                      2085
Name: ClientID, dtype: int64

In [60]:
df_AllSales.groupby(['repurchase_exact4','Birthday_Occasion'])['ClientID'].count()

repurchase_exact4  Birthday_Occasion
0                  0                    1364771
                   1                     191915
1                  0                      11096
                   1                       1918
Name: ClientID, dtype: int64

In [61]:
df_AllSales.groupby(['repurchase_long','Wedding_Occasion'])['ClientID'].count()

repurchase_long  Wedding_Occasion
0                0                   1433915
                 1                     27391
1                0                    106309
                 1                      2085
Name: ClientID, dtype: int64

In [None]:
df_AllSales.groupby(['repurchase_exact4','Wedding_Occasion'])['ClientID'].count()

In [65]:
#df_AllSales['repurchase_exact4'].hist(by=df_AllSales['Wedding_Occasion'])

In [66]:
df_AllSales.groupby(['repurchase_exact4','Gender'])['ClientID'].count()

repurchase_exact4  Gender 
0                  Couple     102477
                   Female     804203
                   Male       621557
                   Unknown     26641
1                  Couple        871
                   Female       6832
                   Male         5103
                   Unknown       203
Name: ClientID, dtype: int64