In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the cleaned ad dataset and impute missing values in the
# 'days_since_last_punishment' column with 0. Building the frame in a single
# expression (no in-place mutation) keeps the cell safe to re-run.
# NOTE(review): a filled-in 0 is indistinguishable from a genuine 0; presumably
# NaN means "never punished" -- confirm this encoding is intended.
data = pd.read_excel("../EDA/Datasets/ad-data-cleaned.xlsx").fillna(
    {'days_since_last_punishment': 0}
)

# Data Preprocessing
To build a stochastic optimization model to predict `ad_revenue`, we will use the following features:
1. `queue_market`
2. `punish_num`
3. `avg_ad_revenue`
4. `baseline_st` (derived from `delivery_country`, `product_line`, and `task_type_en`, as seen in EDA)
5. `days_since_last_punishment`

Before training the model, appropriate data preprocessing steps have to be taken:
1. Encoding of Categorical Features
2. Data Splitting into training and test sets
3. Feature Scaling to standardize feature values

In [4]:
# Extract relevant columns.
# Each feature group is declared exactly once so that the column selection
# below and the ColumnTransformer cannot drift out of sync.
categorical_features = ['queue_market']
numeric_features = ['punish_num', 'avg_ad_revenue', 'baseline_st', 'days_since_last_punishment']
features = categorical_features + numeric_features
X = data[features]
y = data['ad_revenue']

# Split data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up preprocessing steps for the features
# - Standardize the numerical columns
# - One-hot encode the 'queue_market' column, dropping the first level to
#   avoid a redundant (perfectly collinear) indicator column.
#   handle_unknown='ignore' makes transform() tolerate a category that was not
#   present in the training split: it is encoded as all zeros (and scikit-learn
#   emits a warning) instead of raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ])

# Fit the preprocessor on the training data only (so no test-set statistics
# leak into the scaler/encoder), then transform both training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Display the resulting matrix shapes as the cell output
X_train_preprocessed.shape, X_test_preprocessed.shape

((31651, 61), (7913, 61))