In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, ttest_ind, stats
from statsmodels.stats.multitest import multipletests
import statsmodels.api as sm

In [2]:
order_file = pd.read_csv("C:\\Users\\Savannah Truluck\\DATA 5300\\Final Project\\order_file.csv")
penalty_file = pd.read_csv("C:\\Users\\Savannah Truluck\\DATA 5300\\Final Project\\penalty_file.csv")

data = pd.merge(order_file, penalty_file, on="driver.id")

In [5]:
# Number of successes in group 0
x0 = data[(data['penalty.variant'] == 0) & (data['cancel.dummy'] == 0)].shape[0]  
# Total number of observations in group 0
n0 = data[data['penalty.variant'] == 0].shape[0]  
# Number of successes in group 10
x10 = data[(data['penalty.variant'] == 10) & (data['cancel.dummy'] == 0)].shape[0]  
# Total number of observations in group 10
n10 = data[data['penalty.variant'] == 10].shape[0]  
# Number of successes in group 20
x20 = data[(data['penalty.variant'] == 20) & (data['cancel.dummy'] == 0)].shape[0]  
# Total number of observations in group 20
n20 = data[data['penalty.variant'] == 20].shape[0]  

# Proportions
p0 = x0 / n0
p10 = x10 / n10
p20 = x20 / n20

# Wald Method
# Difference in proportions
diff_010 = p0 - p10
diff_1020 = p10 - p20

# Standard error for the Wald method
se_wald_010 = np.sqrt((p0 * (1 - p0) / n0) + (p10 * (1 - p10) / n10))
se_wald_1020 = np.sqrt((p10 * (1 - p10) / n10) + (p20 * (1 - p20) / n20))

# Wald confidence interval
z = norm.ppf(0.975)  # 95% CI
wald_ci_0 = (p0 - z * se_wald_0, p0 + z * se_wald_0)
wald_ci_010 = (diff_010 - z * se_wald_010, diff_010 + z * se_wald_010)
wald_ci_1020 = (diff_1020 - z * se_wald_1020, diff_1020 + z * se_wald_1020)

print(wald_ci_010)
print(wald_ci_1020)

(0.8109388665512268, 0.8131705269975439)
(-0.03570699473884643, -0.03266887578989185)
(-0.05485176258028372, -0.0521579578967178)


Confidence Intervals for groups 0 & 10, and groups 10 & 20:

(-0.03570699473884643, -0.03266887578989185)

(-0.05485176258028372, -0.0521579578967178)

These confidence intervals don't include 0 which implies that there is a statistically significant effect

In [26]:
# Grouping data by penalty and cancellation
data_changed = data.copy()
data_changed.loc[(data['cancel.dummy'] == 1) & (data['penalty.variant'] == 0), 'expected.profit'] = 0
data_changed.loc[(data['cancel.dummy'] == 1) & (data['penalty.variant'] == 10), 'expected.profit'] = 10
data_changed.loc[(data['cancel.dummy'] == 1) & (data['penalty.variant'] == 20), 'expected.profit'] = 20

data_changed = data_changed.dropna()

completed_rides = data[data['cancel.dummy'] == 0]

# P-Values
# mu0 = mu10 vs mu0 != mu10
p1 = ttest_ind(data_changed[data_changed['penalty.variant'] == 0]['expected.profit'],
                data_changed[data_changed['penalty.variant'] == 10]['expected.profit']).pvalue
# mu0 = mu20 vs mu0 != mu20
p2 = ttest_ind(data_changed[data_changed['penalty.variant'] == 0]['expected.profit'],
                data_changed[data_changed['penalty.variant'] == 20]['expected.profit']).pvalue
# mu20 = mu10 vs mu20 != mu10
p3 = ttest_ind(data_changed[data_changed['penalty.variant'] == 20]['expected.profit'],
                data_changed[data_changed['penalty.variant'] == 10]['expected.profit']).pvalue

rides_per_customer = completed_rides.groupby(['driver.id', 'penalty.variant']).size().reset_index(name='completed_rides')

rides_pen0 = rides_per_customer[rides_per_customer['penalty.variant'] == 0]['completed_rides']
rides_pen10 = rides_per_customer[rides_per_customer['penalty.variant'] == 10]['completed_rides']
rides_pen20 = rides_per_customer[rides_per_customer['penalty.variant'] == 20]['completed_rides']

# d0 = d10 vs d0 != d10
p4 = ttest_ind(rides_pen0, rides_pen10).pvalue
# d0 = d20 vs d0 != d20
p5 = ttest_ind(rides_pen0, rides_pen20).pvalue
# d20 = d10 vs d20 != d10
p6 = ttest_ind(rides_pen20, rides_pen10).pvalue

p = np.array([p1, p2, p3, p4, p5, p6])

# Adjust p-values
p_adjusted = multipletests(p, method='fdr_by')[1]

# Results
p

array([7.94651702e-01, 1.95633469e-03, 4.27998865e-03, 3.32477632e-01,
       8.32362470e-06, 3.46634091e-08])

The p-values returned are:
1.00000000e+00, 9.58603996e-03, 1.57289583e-02 
9.77484239e-01, 6.11786415e-05, 5.09552113e-07

The first three refer to the expected profit per penalty variant. The first being between 0 & 10, then 0 & 20, and 20 & 10.
The second three refer to the completed rides per penalty variant, in the same order of penalty comparisons as the first three tests.

This shows us that there is no significant difference for expected profit for penalty groups 0 & 10, although there is a difference between 0 & 20, and 20 & 10.

In terms of completed rides, we see no significant difference for completed rides per penalty variant in groups 0 & 10, although we do see a difference between groups 0 & 20, and groups 20 & 10.

Cancellation Cost Impact will show us whether there is a significant amount of money lost based on the lost revenue due to cancellations. We will look at each group, calculate (cancellations * expected profit) - (cancellations * penalty) as we only want to look at what we would have gained if the order had gone through and factor out what was gained by the penalty.

In [None]:
cost_impact_alt = data.groupby('driver.id').agg(
    cancellations=('cancel.dummy', 'sum'),
    expected_profit=pd.NamedAgg(
        column='expected.profit',
        aggfunc=lambda x: np.sum(x * data.loc[x.index, 'cancel.dummy'])
    ),
    penalty_paid=pd.NamedAgg(
        column='penalty.variant',
        aggfunc=lambda x: np.sum(x * data.loc[x.index, 'cancel.dummy'])
    )
)

cost_impact_alt['cancellation_cost_impact'] = (
    cost_impact_alt['expected_profit'] - cost_impact_alt['penalty_paid']
)

I saved this output as a csv so that I wouldn't have to do the lengthy reprocessing each time 

In [9]:
cost_impact = pd.read_csv("C:\\Users\\Savannah Truluck\\DATA 5300\\Final Project\\costimpact.csv")
cost_impact = cost_impact.drop("Unnamed: 0", axis = 'columns')
cost_impact = cost_impact.dropna(subset=['expected.profit'])

# P-Values
p7 = ttest_ind(cost_impact['expected.profit'][cost_impact['penalty.variant'] == 0],
                     cost_impact['expected.profit'][cost_impact['penalty.variant'] == 10]).pvalue
p8 = ttest_ind(cost_impact['expected.profit'][cost_impact['penalty.variant'] == 0],
                     cost_impact['expected.profit'][cost_impact['penalty.variant'] == 20]).pvalue
p9 = ttest_ind(cost_impact['expected.profit'][cost_impact['penalty.variant'] == 20],
                     cost_impact['expected.profit'][cost_impact['penalty.variant'] == 10]).pvalue

p = [p7, p8, p9]

p_adjusted = multipletests(p, method='bonferroni')[1]
p

[0.7475632007185573, 3.1564385182070245e-05, 6.2251140865013245e-06]

The p-values returned are: 
0.7475632007185573, 3.1564385182070245e-05, 6.2251140865013245e-06

This shows that there is no significant difference between penalty groups 0 and 10 in terms of cost impact, however, for groups 0/20 and 20/10 we are seeing a statistically significant difference

In [13]:
#if cancel is 0 then set retained to 1, if it's not 0 set retained to 0
data['retained'] = np.where(data['cancel.dummy'] == 0, 1, 0)
if data['order.placed.time'].dtype == 'object':
    data['order.placed.time'] = pd.to_datetime(data['order.placed.time'].str.replace(' America/Los_Angeles', ''))
else:
    data['order.placed.time'] = pd.to_datetime(data['order.placed.time'])

data['week'] = (data['order.placed.time'] - data['order.placed.time'].min()).dt.total_seconds() / (7 * 24 * 60 * 60)

long_term_model = sm.GLM(data['retained'], 
                        sm.add_constant(data[['penalty.variant', 'week']]), 
                        family=sm.families.Binomial()).fit()
print(long_term_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               retained   No. Observations:              1402312
Model:                            GLM   Df Residuals:                  1402309
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -5.7983e+05
Date:                Mon, 02 Dec 2024   Deviance:                   1.1597e+06
Time:                        15:26:10   Pearson chi2:                 1.40e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01020
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               1.4588      0.005    2

This analysis was done for the most basic interpretation of retention, if a ride is canceled then the driver is lost, if they complete the order then they are retained. If we look at the constant coefficient, we get the base log-odds retention when all other variables are 0 which returns as 1.4588. When converted to probability gives us approximately 0.81 or 81% baseline retention. The penalty variant shows that the penalty variant is associated with higher retention and for each unit increase in penalty the log-odds of retention increase 0.0354. However, if we look at week, we see a decrease in retention shown by the -0.0092 log-odds coefficient.

In [85]:
driver_stats = data.groupby('driver.id').agg({
    'retained': 'mean', 
    'penalty.variant': 'first', 
    'week': ['min', 'max']
}).reset_index()

driver_model = sm.GLM(
    driver_stats['retained'], 
    sm.add_constant(driver_stats[['penalty.variant', 'week']]),
    family=sm.families.Binomial()
).fit()

print(driver_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   mean   No. Observations:               530756
Model:                            GLM   Df Residuals:                   530752
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.9534e+05
Date:                Mon, 25 Nov 2024   Deviance:                   3.4261e+05
Time:                        19:37:03   Pearson chi2:                 4.23e+05
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01112
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

This test analyses a more complex method of checking for retention. For each driver we look at the mean amount of cancellations vs completions. It also looks at their earliest drive and their latest drive to see how long they were driving for.

If we look at the constant coefficient, we get the base log-odds retention when all other variables are 0 which returns as 1.5889. When converted to probability gives us approximately 0.83 or 83% baseline retention. The penalty variant coefficient shows that the penalty is associated with higher retention and for each unit increase in penalty the log-odds of retention increase 0.0354. Week minimum shows positive retention for drivers who showed up later in the study period, given by the coefficient value of 0.0307, while drivers who had been driving longer seemed to decline in retention, shown by the negative coefficient value -0.0399