In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from xgboost import XGBRegressor

In [49]:
# Fill missing values and extract temporal features
def preprocess(df):
    df.drop(['SKU_id'], axis=1,inplace=True)
    df['Promo'].fillna(0.0, inplace=True)
    df['Promo_Price'].fillna(0, inplace=True)
    df.loc[df['Promo_Price'] == 0, 'Promo_Price'] = df['Regular_Price']
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
    df['weekday'] = df['Date'].dt.weekday
    df['monthday'] = df['Date'].dt.day
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    df.drop('Regular_Price',axis=1,inplace=True)
    return df

In [50]:
# Define constants
MONTHS_EXPECTED_LOWER = (1, 2)
MONTHS_EXPECTED_HIGHER = (7, 11)

# Load data
df = pd.read_csv('train_kaggle.csv')
test_data = pd.read_csv('test_kaggle.csv')
df = df[stats.zscore(df.Demand)<3]

df = preprocess(df)

In [51]:
# Split data into train and test sets
target = df['Demand']
features = df.drop(['Demand', 'Date'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [52]:
# Preprocess data using a pipeline
numerical_cols = [ 'weekday', 'monthday', 'Promo_Price', 'month','year']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numerical_cols)], remainder='drop')
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [53]:
# Train XGBoost model
model = XGBRegressor( max_depth=10)
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', predictor=None, ...)

In [54]:
features = df.drop(['Demand', 'Date'], axis=1)
preds = model.predict(X_test)
print(mae(preds,y_test))
features = pipeline.fit_transform(features)
model.fit(features,target)

71.45122228500087


XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', predictor=None, ...)

In [55]:
df=pd.read_csv('test_kaggle.csv')
df = preprocess(df)
ids = df['id']
df.drop('id',axis=1,inplace=True)
target = df['Demand']
features = df.drop(['Demand', 'Date'], axis=1)
features = pipeline.fit_transform(features)
preds = model.predict(features)
res = pd.DataFrame(ids)
res['Demand'] = preds
res.to_csv('testing.csv',index=False)

In [56]:
#Now, let's try the Random forest regressor

from sklearn.ensemble import RandomForestRegressor as rf

In [57]:
model1 = rf(n_estimators=100)
model1.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [58]:
df=pd.read_csv('test_kaggle.csv')
df = preprocess(df)
ids = df['id']
df.drop('id',axis=1,inplace=True)
target = df['Demand']
features = df.drop(['Demand', 'Date'], axis=1)
features = pipeline.fit_transform(features)
preds = model.predict(features)
res = pd.DataFrame(ids)
res['Demand'] = preds
res.to_csv('testing.csv',index=False)
mae(pred,y_test)

71.41930703557786

In [28]:
res

Unnamed: 0,id,Demand
0,0,481.114105
1,1,23.687914
2,2,7.237731
3,3,7.425086
4,4,7.862178
...,...,...
5965,5965,20.543852
5966,5966,13.242611
5967,5967,23.315908
5968,5968,18.678003


In [33]:
features

array([[-1.50612336,  0.8260139 , -1.42810953, -1.47726192,  0.        ],
       [-1.0062592 ,  0.93474787, -0.22429533, -1.47726192,  0.        ],
       [-0.50639505,  1.04348184,  0.12478513, -1.47726192,  0.        ],
       ...,
       [ 0.49333327,  0.17361008,  0.29932535,  0.67692803,  0.        ],
       [ 0.99319743,  0.28234405,  0.29932535,  0.67692803,  0.        ],
       [ 1.49306159,  0.39107802,  0.29932535,  0.67692803,  0.        ]])