In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from statsmodels.regression.linear_model import OLS

In [2]:
class OneHotEncoder(SklearnOneHotEncoder):
    """One-hot encoder that returns a labelled ``pandas.DataFrame``.

    Thin wrapper around scikit-learn's ``OneHotEncoder``: ``transform`` yields
    a dense DataFrame that keeps the index of the input frame and names every
    column ``"<source column>_<<category>>"``.

    The input ``X`` must be a DataFrame (its ``columns`` and ``index`` are
    used to label the result).

    NOTE(review): ``__init__`` only accepts ``**kwargs``; scikit-learn's
    ``get_params``/``clone`` discover parameters from the constructor
    signature, so they presumably will not see the forwarded options --
    confirm before using this class inside cloning meta-estimators.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument to the scikit-learn encoder."""
        super().__init__(**kwargs)
        # Flipped to True by ``fit``.
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Learn the categories of every column of ``X``.

        ``y`` and ``**kwargs`` are accepted only so the encoder can be called
        with the usual ``fit(X, y)`` convention; both are ignored.

        Returns the value returned by the parent ``fit``.
        """
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode ``X`` and return a dense DataFrame indexed like ``X``."""
        encoded = super().transform(X)
        # The parent returns a sparse matrix by default but a plain ndarray
        # when dense output was requested; only densify when necessary.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        return pd.DataFrame(
            encoded,
            columns=self.get_new_columns(X=X),
            index=X.index,
        )

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return its encoded DataFrame (``y`` is ignored)."""
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        """Build the output column names for a fitted encoder.

        One name per (input column, category) pair, in the order of
        ``self.categories_``. Categories removed through the encoder's
        ``drop`` option are skipped so the names line up with the columns
        actually produced by ``transform``.
        """
        # ``drop_idx_`` is None when nothing is dropped; otherwise it holds,
        # per feature, the index of the dropped category (or None).
        drop_idx = getattr(self, 'drop_idx_', None)
        new_columns = []
        for i, column in enumerate(X.columns):
            dropped = None if drop_idx is None else drop_idx[i]
            for j, category in enumerate(self.categories_[i]):
                if dropped is not None and j == dropped:
                    continue
                new_columns.append(f'{column}_<{category}>')
        return new_columns

In [3]:
# Synthetic order data: 90% of orders without a promo code and 10% with the
# 'SALE15' promo code, each group drawn from its own normal distribution.

# Fixed seed so that a fresh "Restart & Run All" regenerates the same dataset.
SEED = 42
np.random.seed(SEED)

dataset_size = 100000
promo_size = int(0.1 * dataset_size)

order_ids = list(range(88977643, 88977643 + dataset_size))

# Order values are truncated to whole numbers (same result as int() per value).
no_promo = np.random.normal(30, 8, dataset_size - promo_size).astype(int).tolist()
no_promo_title = ['no_promo'] * (dataset_size - promo_size)

promo = np.random.normal(29.8, 10, promo_size).astype(int).tolist()
promo_title = ['SALE15'] * promo_size

# Non-promo orders first, promo orders last; both lists share that ordering.
gmv_list = no_promo + promo
titles_list = no_promo_title + promo_title

In [4]:
# Assemble the orders table. Order ids are shuffled first so that they are
# not aligned with the (no-promo first, promo last) ordering of the values.
df = pd.DataFrame({'order_id': order_ids}).sample(frac=1).reset_index(drop=True)

df['order_value'] = gmv_list
df['promo_type'] = titles_list

# Shuffle the rows, keep only orders with a positive value, and renumber the
# index so it stays contiguous after filtering.
df = (
    df.sample(frac=1)
    .query('order_value > 0')
    .reset_index(drop=True)
)

# index=False: the positional index carries no information and would come
# back as an extra unnamed column when the file is read again.
df.to_csv('one_promo_df.csv', index=False)

In [8]:
# Reload the orders table from the CSV file 'one_promo_df.csv'.
# NOTE(review): this cell's execution count (8) is higher than that of the
# cells below it -- re-run the notebook top to bottom to confirm the results.
df = pd.read_csv('one_promo_df.csv')

In [5]:
# Design matrix: one dummy column per promo type, with the 'no_promo' level
# removed so it serves as the baseline, plus a constant column 'aov' of ones
# (the regression intercept).
encoder = OneHotEncoder()
promo_dummies = encoder.fit_transform(df[['promo_type']])

X = promo_dummies.drop(columns=['promo_type_<no_promo>'])
X = X.assign(aov=1)

# Regression target.
Y = df['order_value']

In [6]:
# Ordinary least squares: order value regressed on the design matrix X.
ols_model = OLS(endog=Y, exog=X)
estimator = ols_model.fit()

In [7]:
# Print the text summary of the fitted regression (coefficients, standard
# errors and fit statistics).
print(estimator.summary())

                            OLS Regression Results                            
Dep. Variable:            order_value   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.166
Date:                Tue, 19 Jul 2022   Prob (F-statistic):              0.280
Time:                        09:24:17   Log-Likelihood:            -3.5255e+05
No. Observations:               99969   AIC:                         7.051e+05
Df Residuals:                   99967   BIC:                         7.051e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
promo_type_<SALE15>    -0.0938    