In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor



# Ignore Warnings
# Silence library warnings so they do not clutter the cell output.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# pandas / scikit-learn; consider narrowing the filter once the notebook is stable.
import warnings
warnings.simplefilter("ignore")

# Data Loading
# Read the training and test sets from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Save Test IDs
# Keep the 'id' column of the test set; it is needed to build the submission.
test_id = test_df['id']

# Data Preprocessing
# Parse 'date' in BOTH frames so the concatenated column has one datetime
# dtype instead of a mix of Timestamps (train) and raw strings (test).
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

# Replace missing holiday names with the literal string 'None'.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment and leaves the frame untouched
# when pandas Copy-on-Write is active.
train_df['holiday_name'] = train_df['holiday_name'].fillna('None')
test_df['holiday_name'] = test_df['holiday_name'].fillna('None')

# Base features are the test-set columns minus 'id'; columns that exist only
# in the training file are therefore dropped below.
base_features = test_df.drop(columns=['id']).columns

# Restrict both frames to the shared features; the training frame also keeps
# the 'orders' target.
train_df = pd.concat([train_df[base_features], train_df['orders']], axis=1)
test_df = test_df[base_features]

# Merge Data for Preprocessing
# Stack train on top of test so every transformation is applied consistently.
# Test rows have no 'orders' value, so that column is NaN for them.
all_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

# Feature Engineering - Date
# For every date column, derive year / month / day / day-of-week features and
# then drop the raw column. Values that cannot be parsed become NaT
# (errors='coerce') and are encoded as -1 in each derived feature.
date_col = ['date']
for _col in date_col:
    # Use a separate name for the parsed Series: the original code rebound
    # `date_col` here, shadowing the list that is being iterated.
    parsed_dates = pd.to_datetime(all_df[_col], errors='coerce')
    all_df[_col + "_year"] = parsed_dates.dt.year.fillna(-1)
    all_df[_col + "_month"] = parsed_dates.dt.month.fillna(-1)
    all_df[_col + "_day"] = parsed_dates.dt.day.fillna(-1)
    all_df[_col + "_day_of_week"] = parsed_dates.dt.dayofweek.fillna(-1)
    all_df = all_df.drop(columns=[_col])

# Fill missing 'holiday_name' values
# Assign back instead of fillna(..., inplace=True) on a column selection,
# which has no effect on the frame when pandas Copy-on-Write is active.
all_df['holiday_name'] = all_df['holiday_name'].fillna('None')

# Encoding Categorical Features
# One-Hot Encoding for 'holiday_name' column
# The encoder is created with its default (sparse) output and the result is
# densified explicitly. The former `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, while its replacement `sparse_output`
# does not exist before 1.2; `.toarray()` works on every version.
enc = OneHotEncoder()
holiday_encoded = enc.fit_transform(all_df[['holiday_name']]).toarray()
encoded_df = pd.DataFrame(
    holiday_encoded,
    columns=enc.get_feature_names_out(['holiday_name']),
    index=all_df.index,  # keep row alignment explicit for the column-wise concat
)
# Replace the raw column with its indicator columns (appended at the end).
all_df = pd.concat([all_df.drop(columns=['holiday_name']), encoded_df], axis=1)

# Label Encoding for 'warehouse' column
# Map each distinct warehouse value to an integer code.
le = preprocessing.LabelEncoder()
all_df['warehouse'] = le.fit_transform(all_df['warehouse'])

# Split Data Back into Train and Test
# Rows that carry an 'orders' value go to the training frame; rows where
# 'orders' is missing go to the test frame.
has_target = all_df['orders'].notna()
train_df_le = all_df[has_target]
test_df_le = all_df[~has_target]

# Features and Target Separation
# y is the 'orders' column, X is every remaining column.
y = train_df_le['orders']
X = train_df_le.drop('orders', axis=1)

# Train-Test Split
# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=777,
)



# Initialize models
# Candidate regressors, all with default hyper-parameters. Each dictionary
# key is the estimator's class name, which is also the label shown in the
# results table.
models = {
    type(estimator).__name__: estimator
    for estimator in (
        XGBRegressor(),
        HistGradientBoostingRegressor(),
        LGBMRegressor(),
        RandomForestRegressor(),
        BaggingRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(),
        DecisionTreeRegressor(),
        ExtraTreeRegressor(),
        AdaBoostRegressor(),
        KNeighborsRegressor(),
    )
}

# One {'Model', 'MAPE'} record is collected per candidate.
mape_results = []

# Fit every candidate on the training split and score it on the validation split.
for model_name, model in models.items():
    # Standardise the features, then apply the regressor.
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', model),
    ])

    # Pipeline.fit returns the fitted pipeline, so predict can be chained.
    pred_val = pipeline.fit(X_train, y_train).predict(X_val)

    # Validation error as mean absolute percentage error.
    mape = mean_absolute_percentage_error(y_val, pred_val)
    mape_results.append({'Model': model_name, 'MAPE': mape})

# Collect the per-model records into a table.
mape_df = pd.DataFrame(mape_results)

# Rank on the numeric MAPE FIRST. The original formatted the column to
# strings before sorting, which orders lexicographically and misranks any
# value >= 10 (e.g. "10.00000" sorts before "2.00000").
mape_df = mape_df.sort_values(by='MAPE')

# Only after ranking, format MAPE to 5 decimal places for display.
mape_df['MAPE'] = mape_df['MAPE'].apply(lambda x: f"{x:.5f}")

# Print MAPE results
print("MAPE Results for Each Model:")
print(mape_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 5872, number of used features: 10
[LightGBM] [Info] Start training from score 5542.194142
MAPE Results for Each Model:
                            Model     MAPE
0                    XGBRegressor  0.03999
1   HistGradientBoostingRegressor  0.04350
2                   LGBMRegressor  0.04351
3           RandomForestRegressor  0.04492
5             ExtraTreesRegressor  0.04659
4                BaggingRegressor  0.04733
7           DecisionTreeRegressor  0.05810
8              ExtraTreeRegressor  0.06146
6       GradientBoostingRegressor  0.07577
10            KNeighborsRegressor  0.20448
9               AdaBoostRegressor  0.23689


In [9]:
# Re-split the labelled data: 80% training / 20% validation, seed 42.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBRegressor with default hyper-parameters (fit returns the estimator).
model = XGBRegressor().fit(X_train, y_train)

# Score the validation split with mean absolute percentage error.
y_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)

# Report the validation error.
print(f"\nMAPE for XGBRegressor: {mape:.5f}")


MAPE for XGBRegressor: 0.03928


In [10]:
# Importance scores of the most recently fitted model.
feature_importances = model.feature_importances_

# Label the scores with column names when X exposes them; otherwise fall back
# to positional indices.
if hasattr(X, 'columns'):
    features = X.columns
else:
    features = np.arange(X.shape[1])

print("\nFeature Importances:")

# Table of features ordered from most to least important.
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(importance_df)


Feature Importances:
                                              Feature  Importance
0                                           warehouse    0.622150
30                         holiday_name_New Years Day    0.092631
5                                           date_year    0.072311
12                         holiday_name_Christmas Eve    0.047119
6                                          date_month    0.032187
1                                             holiday    0.025168
2                                        shops_closed    0.024899
8                                    date_day_of_week    0.021119
16                   holiday_name_Den ceske statnosti    0.015761
17                        holiday_name_Den osvobozeni    0.013576
7                                            date_day    0.005830
23                               holiday_name_Jan Hus    0.004591
19                         holiday_name_Easter Monday    0.004160
4                                     school_holidays 

In [11]:
# Hyper-parameters used for the tuned model.
# NOTE(review): the search that produced these values is not shown in this
# notebook — confirm their provenance.
best_params = dict(
    colsample_bytree=0.9,
    learning_rate=0.1,
    max_depth=9,
    min_child_weight=5,
    n_estimators=200,
    subsample=0.9,
)

# Fit an XGBRegressor with those settings on the current training split.
model = XGBRegressor(**best_params)
model.fit(X_train, y_train)

# Score the validation split with mean absolute percentage error.
y_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)

# Report the validation error.
print(f"\nMAPE for Optimized XGBRegressor: {mape:.5f}")


MAPE for Optimized XGBRegressor: 0.03730


In [12]:
# Predict orders for the test rows; the (all-NaN) target column is removed
# first so the feature set matches what the model was trained on.
test_features = test_df_le.drop('orders', axis=1)
pred_test = model.predict(test_features)

# Pair each saved test id with its prediction.
submission = pd.DataFrame({'id': test_id, 'orders': pred_test})

# Write the submission file without the DataFrame index.
submission.to_csv('prediction_python_xgboost.csv', index=False)

# Show the submission table.
print(submission)

                        id        orders
0      Prague_1_2024-03-16  10325.419922
1      Prague_1_2024-03-17  10102.911133
2      Prague_1_2024-03-18   9737.808594
3      Prague_1_2024-03-19   9722.055664
4      Prague_1_2024-03-20   9534.790039
..                     ...           ...
392  Budapest_1_2024-05-11   7141.117676
393  Budapest_1_2024-05-12   6339.903809
394  Budapest_1_2024-05-13   6513.333008
395  Budapest_1_2024-05-14   6788.502930
396  Budapest_1_2024-05-15   6627.637207

[397 rows x 2 columns]
