In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from statsmodels.regression.linear_model import OLS

In [2]:
class OneHotEncoder(SklearnOneHotEncoder):
    """One-hot encoder that returns a labelled pandas DataFrame.

    Thin wrapper around scikit-learn's ``OneHotEncoder``: ``transform`` yields
    a dense ``DataFrame`` that keeps the index of the input frame and names
    each column ``<source column>_<<category>>``.

    All keyword arguments are forwarded unchanged to the scikit-learn encoder.
    ``fit_flag`` is ``False`` after construction and becomes ``True`` once
    ``fit`` has completed.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are to be encoded.
        y : ignored
            Accepted so the usual ``fit(X, y)`` call works; passed through to
            the parent, which does not use it.
        **kwargs
            Accepted for call compatibility and ignored.

        Returns
        -------
        Whatever the parent ``fit`` returns (the fitted encoder).
        """
        out = super().fit(X, y)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode ``X`` and return the result as a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns the encoder was fitted on; its
            ``columns`` and ``index`` are reused for labelling the output.
        **kwargs
            Accepted for call compatibility and ignored.

        Returns
        -------
        pandas.DataFrame
            Dense indicator columns, indexed like ``X``.
        """
        encoded = super().transform(X)
        # The parent returns a scipy sparse matrix by default but a plain
        # ndarray when configured for dense output; only the former needs
        # (and has) ``toarray``.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        new_columns = self.get_new_columns(X=X)
        return pd.DataFrame(encoded, columns=new_columns, index=X.index)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return its encoded DataFrame (see ``transform``)."""
        self.fit(X, y)
        return self.transform(X)

    def get_new_columns(self, X):
        """Build the output column names for a fitted encoder.

        Names follow ``f'{column}_<{category}>'`` in the order of
        ``X.columns`` and, within a column, of ``self.categories_``.
        A category removed through the encoder's ``drop`` option (recorded in
        ``drop_idx_``) is skipped so the names line up with the columns the
        parent ``transform`` emits.

        NOTE(review): encoders that group infrequent categories
        (``min_frequency`` / ``max_categories``) are not accounted for here.
        """
        # ``drop_idx_`` is absent or None when nothing is dropped; individual
        # entries may also be None (feature kept in full).
        dropped = getattr(self, 'drop_idx_', None)
        new_columns = []
        for i, column in enumerate(X.columns):
            skip = None if dropped is None else dropped[i]
            new_columns.extend(
                f'{column}_<{category}>'
                for j, category in enumerate(self.categories_[i])
                if skip is None or j != skip
            )
        return new_columns

In [3]:
# Load the order-level data; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index written into the CSV -- confirm, and consider index_col=0.
df = pd.read_csv('multiple_promo_df.csv')

In [4]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0.1,Unnamed: 0,gmv,title,delivery_discount,surge_increment,order_id
0,0,22,SALE15,0,0,768977643
1,1,44,LUCKY,1,0,768977644
2,2,26,SUMMER,0,0,768977645


In [5]:
# Fraction of rows whose `title` equals "no_promo".
len(df[df['title'] == 'no_promo']) / len(df)

0.5904545516019529

In [6]:
# Encoder with scikit-learn's default settings (no keyword arguments forwarded).
encoder = OneHotEncoder()

In [7]:
# Double brackets keep `title` as a one-column DataFrame: the encoder's transform
# reads X.columns and X.index to label its output.
encoder.fit_transform(df[['title']]).head(3)

Unnamed: 0,title_<LUCKY>,title_<SALE15>,title_<SORRY>,title_<SUMMER>,title_<TAKE30>,title_<WINTER>,title_<no_promo>
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
# Response variable: the `gmv` column.
Y = df['gmv']

# Design matrix: promo-title dummies with the `no_promo` column removed, plus a
# constant column of ones (`aov`) and the two raw covariates. statsmodels' OLS
# does not add an intercept on its own, so `aov` serves as one; with the
# `no_promo` dummy dropped, each title coefficient is measured relative to
# no-promo orders.
X = (
    encoder.fit_transform(df[['title']])
    .drop(columns='title_<no_promo>')
    .assign(
        aov=1,
        delivery_discount=df['delivery_discount'],
        surge_increment=df['surge_increment'],
    )
)

In [9]:
# Check the design matrix: title dummies, the constant `aov`, then the two covariates.
X.head(3)

Unnamed: 0,title_<LUCKY>,title_<SALE15>,title_<SORRY>,title_<SUMMER>,title_<TAKE30>,title_<WINTER>,aov,delivery_discount,surge_increment
0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,1,1,0
2,0.0,0.0,0.0,1.0,0.0,0.0,1,0,0


In [10]:
# Ordinary least squares of Y on X; the constant column in X is the only intercept term.
estimator = OLS(Y, X).fit()

In [11]:
# Print the plain-text regression table (coefficients, standard errors, fit statistics).
print(estimator.summary())

                            OLS Regression Results                            
Dep. Variable:                    gmv   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     2844.
Date:                Tue, 19 Jul 2022   Prob (F-statistic):               0.00
Time:                        11:36:48   Log-Likelihood:            -1.3137e+06
No. Observations:              369705   AIC:                         2.627e+06
Df Residuals:                  369696   BIC:                         2.628e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
title_<LUCKY>        -4.7214      0.05