# Milestone 5: OLS Model
## Estée Lauder: Average Treatment Effect (ATE) Calculation From OLS Model
## Author: Sahiti Srikakolapu

In [None]:
# Import libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from statsmodels.stats.power import TTestIndPower
import statsmodels.api as sm

In [None]:
# Mount Google Drive when running inside Google Colab.
# The dataset is fetched over HTTPS in the data-loading cell, so the mount is only
# needed for the alternative Drive path. Outside Colab the `google.colab` package
# does not exist, so skip the mount instead of aborting a Restart & Run All.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load dataset
# The experiment results are read straight from a Google Drive download link.
# Alternative source (requires Drive to be mounted in Colab):
#   "/content/drive/MyDrive/Estee Lauder 1A Team Folder/experiment_results_1a.parquet"
DATA_URL = "https://drive.google.com/uc?export=download&id=1qTcf1HueAbq9X2PkAwEBTsLPhhyu30YR"
df = pd.read_parquet(DATA_URL)

# Quick sanity check: confirm the load, report the dimensions, preview the first rows.
# NOTE(review): the preview shows customer names and IDs; consider clearing this
# output before the notebook is shared.
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(df.head())

Dataset loaded successfully!
Shape: (5577, 8)
                                customer_id             name  aov (t-1)  \
18951  be17d8f7-7bb0-40ad-90cd-d4ebb4f5aa51   Christine Sims     134.23   
19248  a4435752-02b4-47c0-8681-9f9b9926dbcc     Billy Orozco      87.62   
323    2b1cfeb3-d356-40bd-93e4-165423c8144e  Stephanie Davis      95.04   
16141  7c961398-d620-43c7-a6e9-60942dd91c78    Angela Miller      52.51   
9276   7e69aa44-a295-49a0-a7f8-9c77bba8d1f8  Michael Ramirez      68.77   

       days_since_last_purchase (t-1)  tenure_in_days(t-1)  \
18951                              34                  102   
19248                               7                   36   
323                                 0                   72   
16141                               4                   18   
9276                               72                   52   

       loyalty_membership  revenue (t)  assignment  
18951                   0   166.233993           1  
19248                   

In [None]:
# Fit OLS Model to Estimate ATE
# Regress revenue on an intercept plus the treatment indicator only, so the slope
# on the indicator is the estimated average treatment effect.

# Treatment indicator and outcome
assignment = df['assignment']
y = df['revenue (t)']

# Design matrix: intercept column plus the treatment indicator
X = sm.add_constant(assignment)

# Ordinary least squares with HC0 robust standard errors
ols_spec = sm.OLS(y, X)
model = ols_spec.fit(cov_type='HC0')

# Print the full model summary; the treatment column is labelled 'T' in the table
banner = "=" * 80
print(banner)
print("OLS Regression Results: Estimating Average Treatment Effect (ATE)")
print(banner)
print(model.summary(xname=['const', 'T']))
print("\n")

OLS Regression Results: Estimating Average Treatment Effect (ATE)
                            OLS Regression Results                            
Dep. Variable:            revenue (t)   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     81.77
Date:                Fri, 31 Oct 2025   Prob (F-statistic):           2.07e-19
Time:                        00:18:04   Log-Likelihood:                -25644.
No. Observations:                5577   AIC:                         5.129e+04
Df Residuals:                    5575   BIC:                         5.131e+04
Df Model:                           1                                         
Covariance Type:                  HC0                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------

In [None]:
# Extract key results
# Look the treatment term up by its label rather than by integer position.
# `series[1]` on a label-indexed Series is deprecated positional access (it emits a
# FutureWarning); the label also names the coefficient we want explicitly and does
# not depend on the column order of the design matrix.
treatment_term = 'assignment'  # name of the treatment column passed to sm.add_constant

ate = model.params[treatment_term]  # Coefficient on assignment (treatment effect)
ci_lower, ci_upper = model.conf_int().loc[treatment_term]  # 95% CI for treatment coefficient
ci_width = ci_upper - ci_lower
p_value = model.pvalues[treatment_term]  # p-value for treatment coefficient
std_error = model.bse[treatment_term]  # Standard error

# Display extracted results
print("="*80)
print("Summary:")
print("="*80)
print("Average Treatment Effect (ATE): $",round(ate,2))
print("95% Confidence Interval: [$",round(ci_lower,2),", ","$",round(ci_upper,2),"]")
print("CI Width: $",round(ci_width,2))
print("Standard Error: $",round(std_error,2))
print("P-value: ",p_value)
print(f"Statistical Significance (α=0.05): {'Yes' if p_value < 0.05 else 'No'}")
print(f"CI Includes Zero: {'Yes' if ci_lower <= 0 <= ci_upper else 'No'}")
print("="*80)

Summary:
Average Treatment Effect (ATE): $ 5.82
95% Confidence Interval: [$ 4.56 ,  $ 7.08 ]
CI Width: $ 2.52
Standard Error: $ 0.64
P-value:  1.5276667805859296e-19
Statistical Significance (α=0.05): Yes
CI Includes Zero: No


  ate = model.params[1]  # Coefficient on assignment (treatment effect)
  p_value = model.pvalues[1]  # p-value for treatment coefficient
  std_error = model.bse[1]  # Standard error


In [None]:
# Additional analysis: Sample statistics
# Split revenue by experiment arm with boolean masks (0 = control, 1 = treatment)
# and compare the raw difference in group means with the regression estimate.
is_control = df['assignment'] == 0
is_treated = df['assignment'] == 1
revenue = df['revenue (t)']

# Group sizes and mean revenue per arm
control_n = is_control.sum()
treatment_n = is_treated.sum()
control_mean = revenue[is_control].mean()
treatment_mean = revenue[is_treated].mean()
mean_gap = treatment_mean - control_mean

print("\nSample Statistics:")
print(f"Control Group Mean Revenue: ${control_mean:.4f} (n={control_n})")
print(f"Treatment Group Mean Revenue: ${treatment_mean:.4f} (n={treatment_n})")
print(f"Difference (Treatment - Control): ${mean_gap:.4f}")
print(f"\nThe ATE from OLS regression (${ate:.4f}) equals the difference in means.")


Sample Statistics:
Control Group Mean Revenue: $117.6066 (n=2782)
Treatment Group Mean Revenue: $123.4253 (n=2795)
Difference (Treatment - Control): $5.8186

The ATE from OLS regression ($5.8186) equals the difference in means.
