In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from math import sqrt

In [2]:
data = [{'quantity': '5', 'created_at': '2024-01-01', 'positive': 5, 'negative': 0}, {'quantity': '9', 'created_at': '2024-01-02', 'positive': 3, 'negative': 6}, {'quantity': '5', 'created_at': '2024-01-03', 'positive': 3, 'negative': 2}, {'quantity': '5', 'created_at': '2024-01-04', 'positive': 4, 'negative': 1}, {'created_at': '2024-01-05', 'quantity': 0}, {'quantity': '4', 'created_at': '2024-01-06', 'positive': 2, 'negative': 2}, {'quantity': '5', 'created_at': '2024-01-07', 'positive': 1, 'negative': 4}, {'quantity': '4', 'created_at': '2024-01-08', 'positive': 1, 'negative': 3}, {'quantity': '9', 'created_at': '2024-01-09', 'positive': 4, 'negative': 5}, {'quantity': '3', 'created_at': '2024-01-10', 'positive': 3, 'negative': 0}, {'quantity': '1', 'created_at': '2024-01-11', 'positive': 0, 'negative': 1}, {'quantity': '5', 'created_at': '2024-01-12', 'positive': 5, 'negative': 0}, {'quantity': '9', 'created_at': '2024-01-13', 'positive': 9, 'negative': 0}, {'quantity': '6', 'created_at': '2024-01-14', 'positive': 4, 'negative': 2}, {'quantity': '6', 'created_at': '2024-01-15', 'positive': 2, 'negative': 4}, {'quantity': '8', 'created_at': '2024-01-16', 'positive': 8, 'negative': 0}, {'quantity': '5', 'created_at': '2024-01-17', 'positive': 5, 'negative': 0}, {'quantity': '2', 'created_at': '2024-01-18', 'positive': 2, 'negative': 0}, {'quantity': '7', 'created_at': '2024-01-19', 'positive': 5, 'negative': 2}, {'quantity': '5', 'created_at': '2024-01-20', 'positive': 4, 'negative': 1}, {'quantity': '9', 'created_at': '2024-01-21', 'positive': 9, 'negative': 0}, {'quantity': '4', 'created_at': '2024-01-22', 'positive': 2, 'negative': 2}, {'quantity': '6', 'created_at': '2024-01-23', 'positive': 2, 'negative': 4}, {'quantity': '3', 'created_at': '2024-01-24', 'positive': 0, 'negative': 3}, {'quantity': '5', 'created_at': '2024-01-25', 'positive': 4, 'negative': 1}, {'created_at': '2024-01-26', 'quantity': 0}, {'quantity': '6', 'created_at': '2024-01-27', 'positive': 2, 'negative': 4}, {'quantity': '7', 'created_at': '2024-01-28', 'positive': 1, 'negative': 6}, {'quantity': '6', 'created_at': '2024-01-29', 'positive': 4, 'negative': 2}, {'quantity': '5', 'created_at': '2024-01-30', 'positive': 2, 'negative': 3}, {'quantity': '6', 'created_at': '2024-01-31', 'positive': 2, 'negative': 4}, {'quantity': '6', 'created_at': '2024-02-01', 'positive': 0, 'negative': 6}, {'quantity': '5', 'created_at': '2024-02-02', 'positive': 5, 'negative': 0}, {'created_at': '2024-02-03', 'quantity': 0}, {'quantity': '3', 'created_at': '2024-02-04', 'positive': 3, 'negative': 0}, {'quantity': '6', 'created_at': '2024-02-05', 'positive': 6, 'negative': 0}, {'quantity': '7', 'created_at': '2024-02-06', 'positive': 6, 'negative': 1}, {'quantity': '10', 'created_at': '2024-02-07', 'positive': 10, 'negative': 0}, {'quantity': '10', 'created_at': '2024-02-08', 'positive': 3, 'negative': 7}, {'quantity': '7', 'created_at': '2024-02-09', 'positive': 7, 'negative': 0}, {'quantity': '10', 'created_at': '2024-02-10', 'positive': 4, 'negative': 6}, {'quantity': '11', 'created_at': '2024-02-11', 'positive': 11, 'negative': 0}, {'quantity': '7', 'created_at': '2024-02-12', 'positive': 7, 'negative': 0}, {'created_at': '2024-02-13', 'quantity': 0}, {'quantity': '5', 'created_at': '2024-02-14', 'positive': 1, 'negative': 4}, {'quantity': '1', 'created_at': '2024-02-15', 'positive': 1, 'negative': 0}, {'quantity': '7', 'created_at': '2024-02-16', 'positive': 4, 'negative': 3}, {'quantity': '5', 'created_at': '2024-02-17', 'positive': 4, 'negative': 1}, {'quantity': '3', 'created_at': '2024-02-18', 'positive': 0, 'negative': 3}, {'quantity': '5', 'created_at': '2024-02-19', 'positive': 4, 'negative': 1}, {'quantity': '6', 'created_at': '2024-02-20', 'positive': 5, 'negative': 1}, {'quantity': '9', 'created_at': '2024-02-21', 'positive': 7, 'negative': 2}, {'quantity': '6', 'created_at': '2024-02-22', 'positive': 6, 'negative': 0}, {'quantity': '5', 'created_at': '2024-02-23', 'positive': 5, 'negative': 0}, {'quantity': '9', 'created_at': '2024-02-24', 'positive': 6, 'negative': 3}, {'created_at': '2024-02-25', 'quantity': 0}, {'quantity': '5', 'created_at': '2024-02-26', 'positive': 5, 'negative': 0}, {'quantity': '6', 'created_at': '2024-02-27', 'positive': 6, 'negative': 0}, {'quantity': '10', 'created_at': '2024-02-28', 'positive': 10, 'negative': 0}, {'quantity': '6', 'created_at': '2024-02-29', 'positive': 6, 'negative': 0}, {'quantity': '10', 'created_at': '2024-03-01', 'positive': 10, 'negative': 0}, {'quantity': '5', 'created_at': '2024-03-02', 'positive': 4, 'negative': 1}, {'quantity': '8', 'created_at': '2024-03-03', 'positive': 4, 'negative': 4}, {'quantity': '9', 'created_at': '2024-03-04', 'positive': 9, 'negative': 0}, {'quantity': '6', 'created_at': '2024-03-05', 'positive': 1, 'negative': 5}, {'quantity': '6', 'created_at': '2024-03-06', 'positive': 5, 'negative': 1}, {'quantity': '6', 'created_at': '2024-03-07', 'positive': 6, 'negative': 0}, {'quantity': '9', 'created_at': '2024-03-08', 'positive': 9, 'negative': 0}, {'quantity': '1', 'created_at': '2024-03-09', 'positive': 0, 'negative': 1}, {'quantity': '2', 'created_at': '2024-03-10', 'positive': 2, 'negative': 0}, {'quantity': '3', 'created_at': '2024-03-11', 'positive': 2, 'negative': 1}, {'quantity': '6', 'created_at': '2024-03-12', 'positive': 6, 'negative': 0}, {'quantity': '4', 'created_at': '2024-03-13', 'positive': 3, 'negative': 1}, {'quantity': '5', 'created_at': '2024-03-14', 'positive': 4, 'negative': 1}, {'created_at': '2024-03-15', 'quantity': 0}, {'quantity': '5', 'created_at': '2024-03-16', 'positive': 4, 'negative': 1}, {'quantity': '7', 'created_at': '2024-03-17', 'positive': 7, 'negative': 0}, {'quantity': '7', 'created_at': '2024-03-18', 'positive': 6, 'negative': 1}, {'quantity': '1', 'created_at': '2024-03-19', 'positive': 1, 'negative': 0}, {'quantity': '5', 'created_at': '2024-03-20', 'positive': 2, 'negative': 3}, {'quantity': '8', 'created_at': '2024-03-21', 'positive': 1, 'negative': 7}, {'created_at': '2024-03-22', 'quantity': 0}, {'quantity': '11', 'created_at': '2024-03-23', 'positive': 11, 'negative': 0}, {'quantity': '3', 'created_at': '2024-03-24', 'positive': 3, 'negative': 0}, {'quantity': '11', 'created_at': '2024-03-25', 'positive': 5, 'negative': 6}, {'quantity': '1', 'created_at': '2024-03-26', 'positive': 0, 'negative': 1}, {'created_at': '2024-03-27', 'quantity': 0}, {'quantity': '7', 'created_at': '2024-03-28', 'positive': 7, 'negative': 0}, {'quantity': '5', 'created_at': '2024-03-29', 'positive': 4, 'negative': 1}, {'quantity': '6', 'created_at': '2024-03-30', 'positive': 6, 'negative': 0}, {'quantity': '1', 'created_at': '2024-03-31', 'positive': 0, 'negative': 1}, {'quantity': '6', 'created_at': '2024-04-01', 'positive': 6, 'negative': 0}, {'quantity': '3', 'created_at': '2024-04-02', 'positive': 0, 'negative': 3}, {'quantity': '8', 'created_at': '2024-04-03', 'positive': 8, 'negative': 0}, {'quantity': '3', 'created_at': '2024-04-04', 'positive': 2, 'negative': 1}, {'quantity': '5', 'created_at': '2024-04-05', 'positive': 5, 'negative': 0}, {'quantity': '8', 'created_at': '2024-04-06', 'positive': 4, 'negative': 4}, {'quantity': '2', 'created_at': '2024-04-07', 'positive': 2, 'negative': 0}, {'created_at': '2024-04-08', 'quantity': 0}, {'quantity': '8', 'created_at': '2024-04-09', 'positive': 6, 'negative': 2}, {'quantity': '6', 'created_at': '2024-04-10', 'positive': 6, 'negative': 0}, {'quantity': '6', 'created_at': '2024-04-11', 'positive': 5, 'negative': 1}, {'quantity': '9', 'created_at': '2024-04-12', 'positive': 5, 'negative': 4}, {'quantity': '11', 'created_at': '2024-04-13', 'positive': 9, 'negative': 2}, {'quantity': '4', 'created_at': '2024-04-14', 'positive': 4, 'negative': 0}, {'quantity': '10', 'created_at': '2024-04-15', 'positive': 4, 'negative': 6}, {'quantity': '8', 'created_at': '2024-04-16', 'positive': 8, 'negative': 0}, {'quantity': '7', 'created_at': '2024-04-17', 'positive': 7, 'negative': 0}, {'quantity': '11', 'created_at': '2024-04-18', 'positive': 2, 'negative': 9}, {'quantity': '10', 'created_at': '2024-04-19', 'positive': 3, 'negative': 7}, {'quantity': '7', 'created_at': '2024-04-20', 'positive': 6, 'negative': 1}, {'quantity': '8', 'created_at': '2024-04-21', 'positive': 8, 'negative': 0}, {'quantity': '11', 'created_at': '2024-04-22', 'positive': 10, 'negative': 1}, {'quantity': '9', 'created_at': '2024-04-23', 'positive': 8, 'negative': 1}, {'quantity': '7', 'created_at': '2024-04-24', 'positive': 7, 'negative': 0}, {'quantity': '11', 'created_at': '2024-04-25', 'positive': 6, 'negative': 5}, {'quantity': '7', 'created_at': '2024-04-26', 'positive': 3, 'negative': 4}, {'quantity': '5', 'created_at': '2024-04-27', 'positive': 5, 'negative': 0}, {'quantity': '7', 'created_at': '2024-04-28', 'positive': 7, 'negative': 0}, {'quantity': '6', 'created_at': '2024-04-29', 'positive': 0, 'negative': 6}, {'quantity': '6', 'created_at': '2024-04-30', 'positive': 6, 'negative': 0}]

In [3]:
# FEATURES_1 = ['month', 'day_of_month', 'day_of_year', 'week_of_year', 'day_of_week','year','is_wknd','is_month_start','is_month_end']
# TARGET = 'quantity'

In [4]:
# check if data is a list and convert to pandas dataframe
if data and isinstance(data, list):
    print('Data is a list')
    try:
        # convert data to a pandas dataframe
        df = pd.DataFrame(data)
        # convert created_at to datetime value
        df['created_at'] = pd.to_datetime(df['created_at'])
        # set created_at as index
        df.index = df['created_at']
        # drop created_at column because it is now the index
        df.drop('created_at', axis=1, inplace=True)
        print('Dataframe created successfully')
    except KeyError:
        print('KeyError: created_at column not found')
else:
    print('data is not a list')
    

Data is a list
Dataframe created successfully


In [5]:
df.fillna(np.nan, inplace=True)
df['quantity'] = df['quantity'].astype(float)
df['positive'] = df['positive'].astype(float)
# print(df.dtypes)
# df.head()

In [11]:
df.dropna(inplace=True)
df.isnull().sum()

quantity    0
positive    0
negative    0
dtype: int64

In [12]:
min = df.index.min()
max = df.index.max()
print(f'min: {min}, max: {max}')

min: 2024-01-01 00:00:00, max: 2024-04-30 00:00:00


# FEATURE ENGINEERING

In [13]:
def generate_date_features(dataframe: pd.DataFrame):
    df_copy = dataframe.copy()  # Create a copy of the DataFrame to ensure the original remains unchanged

    df_copy['month'] = df_copy.index.month    
    df_copy['day_of_month'] = df_copy.index.day
    df_copy['day_of_year'] = df_copy.index.dayofyear
    df_copy['week_of_year'] = df_copy.index.isocalendar().week
    df_copy['day_of_week'] = df_copy.index.dayofweek
    df_copy['year'] = df_copy.index.year
    df_copy["is_wknd"] = df_copy.index.weekday // 4
    df_copy['is_month_start'] = df_copy.index.is_month_start.astype(int)
    df_copy['is_month_end'] = df_copy.index.is_month_end.astype(int)
    
    return df_copy

user_defined_lag = 7
user_defined_days = 30 # get from the api

def generate_lag_features(dataframe: pd.DataFrame):
    df_copy = dataframe.copy()
    # create lag features
    for i in range(user_defined_days + 1):
        df_copy[f'quantity_lag_{i}'] = df_copy['quantity'].shift(i)
        df_copy[f'positive_lag_{i}'] = df_copy['positive'].shift(i)
        df_copy[f'negative_lag_{i}'] = df_copy['negative'].shift(i)
        # # convert to int
        
    # df_copy['lag_1'] = (df.index - pd.Timedelta(days=7)).map(target_map)
    # df_copy['lag_2'] = (df.index - pd.Timedelta(days=14)).map(target_map)
    # df_copy['lag_3'] = (df.index - pd.Timedelta(days=21)).map(target_map)
    return df_copy

# APPLY FEATURE ENGINEERING

In [14]:
df_w_date_features = generate_date_features(df)
df_w_date_lag_features = generate_lag_features(df_w_date_features)

# DROP MISSING VALUES

In [15]:
print(f'df zero value count: {df_w_date_lag_features.isnull().sum()}')
# count all 0 value in quantity column
print(f'quantity zero value count: {df_w_date_lag_features[df_w_date_lag_features["quantity"] == 0].shape[0]}')

df zero value count: quantity            0
positive            0
negative            0
month               0
day_of_month        0
                   ..
positive_lag_29    29
negative_lag_29    29
quantity_lag_30    30
positive_lag_30    30
negative_lag_30    30
Length: 105, dtype: int64
quantity zero value count: 0


In [16]:
# create date features for the dataframe
# df_w_features = generate_date_features(df)

In [17]:
# df_w_features.dtypes

# GENERATE LAG FEATURES

In [18]:
# user_defined_lag = 7
# user_defined_days = 30 # get from the api
# 
# def generate_lag_features(df):
#     df_copy = df.copy()
#     # create lag features
#     for i in range(user_defined_days + 1):
#         df_copy[f'quantity_lag_{i}'] = df_copy['quantity'].shift(i)
#         df_copy[f'positive_lag_{i}'] = df_copy['positive'].shift(i)
#         df_copy[f'negative_lag_{i}'] = df_copy['negative'].shift(i)
#         # # convert to int
#         
#     # df_copy['lag_1'] = (df.index - pd.Timedelta(days=7)).map(target_map)
#     # df_copy['lag_2'] = (df.index - pd.Timedelta(days=14)).map(target_map)
#     # df_copy['lag_3'] = (df.index - pd.Timedelta(days=21)).map(target_map)
#     return df_copy
#     
# # df_with_lag_features = generate_lag_features(df_w_features)

In [19]:
# print(f'df zero value count: {df_with_lag_features.isnull().sum()}')
# df_with_lag_features.fillna(np.nan, inplace=True)

#convert to int
# df_with_lag_features = df_with_lag_features.astype(int)

# df_with_lag_features.head(10)

# df_with_lag_features

# TIME SERIES CROSS VALIDATION

from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Example data (7 days worth of data)
data = np.random.rand(7)  # Replace this with your actual data

# Number of splits (e.g., 5-fold cross-validation)
n_splits = 5

# Create TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Iterate over the splits and print the train and test indices
for train_index, test_index in tscv.split(data):
    print("Train indices:", train_index)
    print("Test indices:", test_index)
    print("---")


# TRAIN TEST SPLIT

In [20]:
import pandas as pd

def time_based_split(data, split_ratio=0.8):
    """
    Splits a time series dataset into training and testing sets based on a time-based split ratio.

    Args:
        data (pd.Series or pd.DataFrame): The time series data to split.
        split_ratio (float): The proportion of data to use for training (0 to 1). Default is 0.8.

    Returns:
        tuple: A tuple containing the training and testing sets as pandas DataFrames.
    """

    total_length = len(data)
    train_size = int(total_length * split_ratio)
    test_size = total_length - train_size

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    return train, test


In [21]:
# # train test split
# train, test = time_based_split(df, split_ratio=0.8)
# 
# print(f'train shape: {train.shape}, test shape: {test.shape}')
# print(f'train min and max: {train.index.min()}, {train.index.max()}')
# print(f'test min and max: {test.index.min()}, {test.index.max()}')

In [22]:
def handle_train_test_size(df, splitting_method, split_ratio):
    if splitting_method == 'time_based_split':
        if split_ratio > 1 or split_ratio < 0:
            raise ValueError('split_ratio must be between 0 and 1')
        
        train, test = time_based_split(df, split_ratio)
        
        return train, test
    
    elif splitting_method == 'week':
        end_data = df.index.max()
        # subtract 7 days from end_date
        start_data = end_data - pd.Timedelta(days=7)
        
        train = df[df.index < start_data]
        test = df[df.index >= start_data]
        
        return train, test
    
    elif splitting_method == 'month':
        end_data = df.index.max()
        # subtract 30 days from end_date
        start_data = end_data - pd.Timedelta(days=30)
        
        train = df[df.index < start_data]
        test = df[df.index >= start_data]
        
        return train, test
    else:
        raise ValueError('splitting_method must be either time_based_split, week or month')

# TRAIN TEST SPLIT

In [23]:
train, test = handle_train_test_size(df_w_date_lag_features, 'time_based_split', 0.8)
print(f'train shape: {train.shape}, test shape: {test.shape}')
print(f'train min and max: {train.index.min()}, {train.index.max()}')
print(f'test min and max: {test.index.min()}, {test.index.max()}')

train shape: (89, 105), test shape: (23, 105)
train min and max: 2024-01-01 00:00:00, 2024-04-06 00:00:00
test min and max: 2024-04-07 00:00:00, 2024-04-30 00:00:00


# TRAIN TEST SPLIT WEEK METHOD

In [24]:
train, test = handle_train_test_size(df_w_date_lag_features, 'week', 0.8)
print(f'train shape: {train.shape}, test shape: {test.shape}')
print(f'train min and max: {train.index.min()}, {train.index.max()}')
print(f'test min and max: {test.index.min()}, {test.index.max()}')

train shape: (104, 105), test shape: (8, 105)
train min and max: 2024-01-01 00:00:00, 2024-04-22 00:00:00
test min and max: 2024-04-23 00:00:00, 2024-04-30 00:00:00


# TRAIN TEST SPLIT MONTH METHOD

In [25]:
train, test = handle_train_test_size(df_w_date_lag_features, 'month', 0.8)
print(f'train shape: {train.shape}, test shape: {test.shape}')
print(f'train min and max: {train.index.min()}, {train.index.max()}')
print(f'test min and max: {test.index.min()}, {test.index.max()}')

train shape: (82, 105), test shape: (30, 105)
train min and max: 2024-01-01 00:00:00, 2024-03-30 00:00:00
test min and max: 2024-03-31 00:00:00, 2024-04-30 00:00:00


In [26]:
# # train test split
# 
# 
# train = df_with_lag_features[df_with_lag_features.index < '2024-03-01']
# test = df_with_lag_features[df_with_lag_features.index >= '2024-03-01']
# 
# 
# 
# print(f'train shape: {train.shape}, test shape: {test.shape}')
# print(f'train min and max: {train.index.min()}, {train.index.max()}')
# print(f'test min and max: {test.index.min()}, {test.index.max()}')

In [27]:
def create_features_target(dataframe, target):
    DATE_FEATURES = ['month', 'day_of_month', 'day_of_year', 'week_of_year', 'day_of_week','year','is_wknd','is_month_start','is_month_end']
    LAG_FEATURES = [col for col in dataframe.columns if 'quantity_lag' in col or 'positive_lag' in col or 'negative_lag' in col]
    FEATURES = DATE_FEATURES + LAG_FEATURES
    TARGET = target
    return FEATURES, TARGET

In [28]:
FEATURES, TARGET = create_features_target(train, 'quantity')

In [29]:
TARGET

'quantity'

In [82]:
# FEATURES_2 = [col for col in df_with_lag_features.columns if 'quantity_lag' in col or 'positive_lag' in col or 'negative_lag' in col]
# FEATURES_COMBINED = FEATURES_1 + FEATURES_2
print(FEATURES)

['month', 'day_of_month', 'day_of_year', 'week_of_year', 'day_of_week', 'year', 'is_wknd', 'is_month_start', 'is_month_end', 'quantity_lag_0', 'positive_lag_0', 'negative_lag_0', 'quantity_lag_1', 'positive_lag_1', 'negative_lag_1', 'quantity_lag_2', 'positive_lag_2', 'negative_lag_2', 'quantity_lag_3', 'positive_lag_3', 'negative_lag_3', 'quantity_lag_4', 'positive_lag_4', 'negative_lag_4', 'quantity_lag_5', 'positive_lag_5', 'negative_lag_5', 'quantity_lag_6', 'positive_lag_6', 'negative_lag_6', 'quantity_lag_7', 'positive_lag_7', 'negative_lag_7', 'quantity_lag_8', 'positive_lag_8', 'negative_lag_8', 'quantity_lag_9', 'positive_lag_9', 'negative_lag_9', 'quantity_lag_10', 'positive_lag_10', 'negative_lag_10', 'quantity_lag_11', 'positive_lag_11', 'negative_lag_11', 'quantity_lag_12', 'positive_lag_12', 'negative_lag_12', 'quantity_lag_13', 'positive_lag_13', 'negative_lag_13', 'quantity_lag_14', 'positive_lag_14', 'negative_lag_14', 'quantity_lag_15', 'positive_lag_15', 'negative_la

In [32]:
# create X_train, y_train
# X_train = train[FEATURES]
# y_train = train[TARGET]
# 
# # create X_test, y_test
# X_test = test[FEATURES]
# y_test = test[TARGET]

In [33]:
def create_train_data(dataframe, features, target):
    X_train = dataframe[features]
    y_train = dataframe[target]
    return X_train, y_train

def create_test_data(dataframe, features, target):
    X_test = dataframe[features]
    y_test = dataframe[target]
    return X_test, y_test

In [36]:
X_train, y_train = create_train_data(train, FEATURES, TARGET)
X_test, y_test = create_test_data(test, FEATURES, TARGET)

# MODEL TRAINING

In [37]:
# Define the parameter grid
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Initialize the XGBRegressor
reg = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    # objective='reg:squarederror',
    objective='reg:squaredlogerror',
    early_stopping_rounds=50,
    eval_metric='mae',
    n_jobs=-1,
)

tscv = TimeSeriesSplit(n_splits=3)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, cv=tscv, n_jobs=-1)

# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=0)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print(f'Best parameters: {best_params}')

Best parameters: {'learning_rate': 0.3, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 1.0}


In [38]:
# # Train the model using the best parameters
# stopping_round = {'early_stopping_rounds': 50}
# 
# best_params = {
#     **best_params,
#     **stopping_round
# }

print(f'Best parameters: {best_params}')

reg = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    # objective='reg:squarederror',
    objective='reg:squaredlogerror',
    early_stopping_rounds=50,
    eval_metric='mae',
    n_jobs=-1,
    **best_params
)
reg.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=1000
)

Best parameters: {'learning_rate': 0.3, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 1.0}
[0]	validation_0-mae:5.18185	validation_1-mae:6.23745
[83]	validation_0-mae:0.70015	validation_1-mae:1.39678


# MODEL EVALUATION

In [39]:
# print(FEATURES_COMBINED)

NameError: name 'FEATURES_COMBINED' is not defined

In [40]:
# Generate predictions
# y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)

# Calculate metrics for the training set
# mse_train = mean_squared_error(y_train, y_pred_train)
# mae_train = mean_absolute_error(y_train, y_pred_train)
# r2_train = r2_score(y_train, y_pred_train)
# mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
# rmse_train = sqrt(mse_train)

# Calculate metrics for the test set
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
nrmse = rmse_test / (y_test.max() - y_test.min())  # Calculate NRMSE

# Print the metrics
# print(f'Training set metrics: MSE={mse_train}, MAE={mae_train}, R2={r2_train}, MAPE={mape_train}, RMSE={rmse_train}')
print(f'Test set metrics: MSE={mse_test}, MAE={mae_test}, R2={r2_test}, MAPE={mape_test}, RMSE={rmse_test}, NRMSE={nrmse}')

Test set metrics: MSE=3.556197587983379, MAE=1.3210352222124735, R2=0.49197177314523155, MAPE=0.16481359042990607, RMSE=1.8857883200357826, NRMSE=0.18857883200357825


In [138]:
def mode_evaluate(model, X_test, y_test):
    try:
        print('Evaluating model')
        y_pred_test = model.predict(X_test)
        

        # Calculate metrics for the test set
        mse_test = mean_squared_error(y_test, y_pred_test)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        r2_test = r2_score(y_test, y_pred_test)
        mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
        rmse_test = root_mean_squared_error(y_test, y_pred_test)
        nrmse = rmse_test / (y_test.max() - y_test.min())  # Calculate NRMSE
        
        # create a dataframe with index of X_test  and columns of y_pred_test and y_test 
        test_df = pd.DataFrame({'date': X_test.index ,'actual': y_test, 'predicted': y_pred_test}, index=X_test.index)
        
        test_df['date'] = test_df['date'].dt.strftime('%Y-%m-%d')
    
        print(
            f'Test set metrics: MSE={mse_test}, MAE={mae_test}, R2={r2_test}, MAPE={mape_test}, RMSE={rmse_test}, NRMSE={nrmse}')

        accuracy = {
            'mse': mse_test,
            'mae': mae_test,
            'r2': r2_test,
            'mape': mape_test,
            'rmse': rmse_test,
            'nrmse': nrmse,
            'acurracy_test': test_df.to_dict('records'),
        }
        print('Model evaluated successfully')
        return accuracy
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

In [139]:
accuracy = mode_evaluate(reg, X_test, y_test)

Evaluating model
Test set metrics: MSE=3.556197587983379, MAE=1.3210352222124735, R2=0.49197177314523155, MAPE=0.16481359042990607, RMSE=1.8857883200357826, NRMSE=0.18857883200357825
Model evaluated successfully


In [140]:
accuracy

{'mse': 3.556197587983379,
 'mae': 1.3210352222124735,
 'r2': 0.49197177314523155,
 'mape': 0.16481359042990607,
 'rmse': 1.8857883200357826,
 'nrmse': 0.18857883200357825,
 'acurracy_test': [{'date': '2024-03-31',
   'actual': 1.0,
   'predicted': 1.3457766771316528},
  {'date': '2024-04-01', 'actual': 6.0, 'predicted': 6.996027946472168},
  {'date': '2024-04-02', 'actual': 3.0, 'predicted': 3.1805803775787354},
  {'date': '2024-04-03', 'actual': 8.0, 'predicted': 6.990198135375977},
  {'date': '2024-04-04', 'actual': 3.0, 'predicted': 3.2320964336395264},
  {'date': '2024-04-05', 'actual': 5.0, 'predicted': 5.5846734046936035},
  {'date': '2024-04-06', 'actual': 8.0, 'predicted': 6.921599388122559},
  {'date': '2024-04-07', 'actual': 2.0, 'predicted': 2.0965464115142822},
  {'date': '2024-04-09', 'actual': 8.0, 'predicted': 7.148423671722412},
  {'date': '2024-04-10', 'actual': 6.0, 'predicted': 6.97487211227417},
  {'date': '2024-04-11', 'actual': 6.0, 'predicted': 6.828033447265625

In [141]:
y_pred_test

array([1.3457767, 6.996028 , 3.1805804, 6.990198 , 3.2320964, 5.5846734,
       6.9215994, 2.0965464, 7.1484237, 6.974872 , 6.8280334, 6.737056 ,
       7.0969076, 4.463112 , 6.8231735, 7.0931945, 7.0454273, 6.6636167,
       6.9407988, 7.041714 , 6.923392 , 6.6891127, 6.86637  , 7.0969076,
       7.153965 , 6.5652847, 5.5672307, 6.795858 , 6.6636167, 6.923392 ],
      dtype=float32)

In [29]:
# test_w_prediction = test.copy()
# test_w_prediction['quantity_pred'] = reg.predict(X_test)
# 
# # Display the first few rows of the test_w_prediction DataFrame
# test_w_prediction.head()

In [53]:
# plot quanity and quantity_pred
# plt.figure(figsize=(15, 7))
# plt.plot(test_w_prediction.index, test_w_prediction['quantity'], label='quantity')
# plt.plot(test_w_prediction.index, test_w_prediction['quantity_pred'], label='quantity_pred')
# plt.legend()
# plt.show()

# MODEL EVALUATION

# GENERATE FUTURE DATES

In [74]:
max = df.index.max()
# add 1 day to max because we want to predict for the next 7 days, and we have today's sales data
# max = max + pd.Timedelta(days=1)

In [83]:
predictfornext = pd.date_range(max, periods=30, freq='D')

In [84]:
predictfornext

DatetimeIndex(['2024-04-30', '2024-05-01', '2024-05-02', '2024-05-03',
               '2024-05-04', '2024-05-05', '2024-05-06', '2024-05-07',
               '2024-05-08', '2024-05-09', '2024-05-10', '2024-05-11',
               '2024-05-12', '2024-05-13', '2024-05-14', '2024-05-15',
               '2024-05-16', '2024-05-17', '2024-05-18', '2024-05-19',
               '2024-05-20', '2024-05-21', '2024-05-22', '2024-05-23',
               '2024-05-24', '2024-05-25', '2024-05-26', '2024-05-27',
               '2024-05-28', '2024-05-29'],
              dtype='datetime64[ns]', freq='D')

In [154]:
def generate_future_target_dates(last_date_of_data, pred_range: int):
    # Add 1 day to max because we want to predict for the next 7 days, and we have today's sales data
    end_date = last_date_of_data + pd.Timedelta(days=1)
    # create a Datetimeindex from pass variables
    future_dates = pd.date_range(end_date, periods=pred_range, freq='D')
    # Create a single column called date
    future_dates_df = pd.DataFrame(future_dates, columns=['date'])
    # Make the date the first column the index
    future_dates_df.set_index('date', inplace=True)
    # Add quantity column
    future_dates_df['quantity'] = np.nan
    # add isFuture column
    future_dates_df['isFuture'] = True
    # # create date features
    # future_dates_df = create_date_features(future_dates_df)
    return future_dates_df


def generate_future_w_lags_dates(orignal_dataframe, future_dataframe):
    try:
        original_dataframe_copy = orignal_dataframe.copy()
        
        original_dataframe_copy['isFuture'] = False
        
        # get the original dataframe and concat with the future dates dataframe
        future_dates_df = pd.concat([original_dataframe_copy, future_dataframe])
        
        future_dates_df = generate_date_features(future_dates_df)
        future_dates_df = generate_lag_features(future_dates_df)
        
        return future_dates_df
    except Exception as e:
        print(f"An error occurred: {e}")
        return False
    
    
def generate_prediction_for_future_dates(model, future_dates_df, features):

    predict_data = future_dates_df.query('isFuture == True').copy()
    
    predict_data['pred'] = model.predict(predict_data[features])
    
    return predict_data

In [155]:
test_df = generate_future_target_dates(max, pred_range=30)

In [156]:
test_df

Unnamed: 0_level_0,quantity,isFuture
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-05-01,,True
2024-05-02,,True
2024-05-03,,True
2024-05-04,,True
2024-05-05,,True
2024-05-06,,True
2024-05-07,,True
2024-05-08,,True
2024-05-09,,True
2024-05-10,,True


In [157]:
test = generate_future_w_lags_dates(df, test_df)

In [158]:
test

Unnamed: 0,quantity,positive,negative,isFuture,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,negative_lag_27,quantity_lag_28,positive_lag_28,negative_lag_28,quantity_lag_29,positive_lag_29,negative_lag_29,quantity_lag_30,positive_lag_30,negative_lag_30
2024-01-01,5.0,5.0,0.0,False,1,1,1,1,0,2024,...,,,,,,,,,,
2024-01-02,9.0,3.0,6.0,False,1,2,2,1,1,2024,...,,,,,,,,,,
2024-01-03,5.0,3.0,2.0,False,1,3,3,1,2,2024,...,,,,,,,,,,
2024-01-04,5.0,4.0,1.0,False,1,4,4,1,3,2024,...,,,,,,,,,,
2024-01-06,4.0,2.0,2.0,False,1,6,6,1,5,2024,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-26,,,,True,5,26,147,21,6,2024,...,6.0,7.0,7.0,0.0,5.0,5.0,0.0,7.0,3.0,4.0
2024-05-27,,,,True,5,27,148,22,0,2024,...,0.0,6.0,0.0,6.0,7.0,7.0,0.0,5.0,5.0,0.0
2024-05-28,,,,True,5,28,149,22,1,2024,...,,6.0,6.0,0.0,6.0,0.0,6.0,7.0,7.0,0.0
2024-05-29,,,,True,5,29,150,22,2,2024,...,,,,,6.0,6.0,0.0,6.0,0.0,6.0


In [159]:
test

Unnamed: 0,quantity,positive,negative,isFuture,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,negative_lag_27,quantity_lag_28,positive_lag_28,negative_lag_28,quantity_lag_29,positive_lag_29,negative_lag_29,quantity_lag_30,positive_lag_30,negative_lag_30
2024-01-01,5.0,5.0,0.0,False,1,1,1,1,0,2024,...,,,,,,,,,,
2024-01-02,9.0,3.0,6.0,False,1,2,2,1,1,2024,...,,,,,,,,,,
2024-01-03,5.0,3.0,2.0,False,1,3,3,1,2,2024,...,,,,,,,,,,
2024-01-04,5.0,4.0,1.0,False,1,4,4,1,3,2024,...,,,,,,,,,,
2024-01-06,4.0,2.0,2.0,False,1,6,6,1,5,2024,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-26,,,,True,5,26,147,21,6,2024,...,6.0,7.0,7.0,0.0,5.0,5.0,0.0,7.0,3.0,4.0
2024-05-27,,,,True,5,27,148,22,0,2024,...,0.0,6.0,0.0,6.0,7.0,7.0,0.0,5.0,5.0,0.0
2024-05-28,,,,True,5,28,149,22,1,2024,...,,6.0,6.0,0.0,6.0,0.0,6.0,7.0,7.0,0.0
2024-05-29,,,,True,5,29,150,22,2,2024,...,,,,,6.0,6.0,0.0,6.0,0.0,6.0


In [161]:
output = generate_prediction_for_future_dates(reg, test, features=FEATURES)

In [162]:
output

Unnamed: 0,quantity,positive,negative,isFuture,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,quantity_lag_28,positive_lag_28,negative_lag_28,quantity_lag_29,positive_lag_29,negative_lag_29,quantity_lag_30,positive_lag_30,negative_lag_30,pred
2024-05-01,,,,True,5,1,122,18,2,2024,...,3.0,0.0,3.0,6.0,6.0,0.0,1.0,0.0,1.0,6.855209
2024-05-02,,,,True,5,2,123,18,3,2024,...,8.0,8.0,0.0,3.0,0.0,3.0,6.0,6.0,0.0,6.668281
2024-05-03,,,,True,5,3,124,18,4,2024,...,3.0,2.0,1.0,8.0,8.0,0.0,3.0,0.0,3.0,6.974872
2024-05-04,,,,True,5,4,125,18,5,2024,...,5.0,5.0,0.0,3.0,2.0,1.0,8.0,8.0,0.0,6.737056
2024-05-05,,,,True,5,5,126,18,6,2024,...,8.0,4.0,4.0,5.0,5.0,0.0,3.0,2.0,1.0,6.91785
2024-05-06,,,,True,5,6,127,19,0,2024,...,2.0,2.0,0.0,8.0,4.0,4.0,5.0,5.0,0.0,7.030101
2024-05-07,,,,True,5,7,128,19,1,2024,...,8.0,6.0,2.0,2.0,2.0,0.0,8.0,4.0,4.0,6.731345
2024-05-08,,,,True,5,8,129,19,2,2024,...,6.0,6.0,0.0,8.0,6.0,2.0,2.0,2.0,0.0,6.821541
2024-05-09,,,,True,5,9,130,19,3,2024,...,6.0,5.0,1.0,6.0,6.0,0.0,8.0,6.0,2.0,7.024596
2024-05-10,,,,True,5,10,131,19,4,2024,...,9.0,5.0,4.0,6.0,5.0,1.0,6.0,6.0,0.0,7.041678


In [98]:
output = generate_prediction_for_future_dates(reg, test, TARGET, FEATURES)

In [145]:
output

Unnamed: 0,quantity,positive,negative,isFuture,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,quantity_lag_28,positive_lag_28,negative_lag_28,quantity_lag_29,positive_lag_29,negative_lag_29,quantity_lag_30,positive_lag_30,negative_lag_30,pred
2024-05-01,,,,True,5,1,122,18,2,2024,...,3.0,0.0,3.0,6.0,6.0,0.0,1.0,0.0,1.0,6.855209
2024-05-02,,,,True,5,2,123,18,3,2024,...,8.0,8.0,0.0,3.0,0.0,3.0,6.0,6.0,0.0,6.668281
2024-05-03,,,,True,5,3,124,18,4,2024,...,3.0,2.0,1.0,8.0,8.0,0.0,3.0,0.0,3.0,6.974872
2024-05-04,,,,True,5,4,125,18,5,2024,...,5.0,5.0,0.0,3.0,2.0,1.0,8.0,8.0,0.0,6.737056
2024-05-05,,,,True,5,5,126,18,6,2024,...,8.0,4.0,4.0,5.0,5.0,0.0,3.0,2.0,1.0,6.91785
2024-05-06,,,,True,5,6,127,19,0,2024,...,2.0,2.0,0.0,8.0,4.0,4.0,5.0,5.0,0.0,7.030101
2024-05-07,,,,True,5,7,128,19,1,2024,...,8.0,6.0,2.0,2.0,2.0,0.0,8.0,4.0,4.0,6.731345
2024-05-08,,,,True,5,8,129,19,2,2024,...,6.0,6.0,0.0,8.0,6.0,2.0,2.0,2.0,0.0,6.821541
2024-05-09,,,,True,5,9,130,19,3,2024,...,6.0,5.0,1.0,6.0,6.0,0.0,8.0,6.0,2.0,7.024596
2024-05-10,,,,True,5,10,131,19,4,2024,...,9.0,5.0,4.0,6.0,5.0,1.0,6.0,6.0,0.0,7.041678


In [33]:
# future_dates_df_with_lag_features_copy = future_dates_df_with_lag_features.query('isFuture == True').copy()

In [144]:
# predict = test.query('isFuture == True').copy()
# predict

In [35]:
# future_dates_df_with_lag_features_copy['quantity_pred'] = reg.predict(future_dates_df_with_lag_features_copy[FEATURES_COMBINED])

In [143]:
# future_dates_df_with_lag_features_copy

In [142]:
# reg