In [202]:
import pandas as pd
import datetime

import numpy as np
import pandas as pd
import requests
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from datetime import timedelta
from tqdm import tqdm
from tqdm.notebook import trange

try:
    import holidays
except:
    !pip install holidays
    import holidays
try:
    import xgboost as xgb
except:
    !pip install xgboost
    import xgboost as xgb
try:
    from lunardate import LunarDate
except:
    !pip install lunardate
    from lunardate import LunarDate

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from scipy.fft import fft
import warnings

warnings.simplefilter(action='ignore')

# Define url path to CSV
annex1 = 'https://raw.githubusercontent.com/prattapong/Data-Science-Portfolio/main/Projects/Supermaket%20Sales%20Data/data/annex1.csv'
annex2 = 'https://raw.githubusercontent.com/prattapong/Data-Science-Portfolio/main/Projects/Supermaket%20Sales%20Data/data/annex2.csv'
annex3 = 'https://raw.githubusercontent.com/prattapong/Data-Science-Portfolio/main/Projects/Supermaket%20Sales%20Data/data/annex3.csv'
annex4 = 'https://raw.githubusercontent.com/prattapong/Data-Science-Portfolio/main/Projects/Supermaket%20Sales%20Data/data/annex4.csv'

def get_df_from_url(url):
    # Create request response
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Create a file-like object from the raw content
        csv_content = StringIO(response.text)

        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_content)

        # Return DataFrame
        return df
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")

df_item = get_df_from_url(annex1)
df_transaction = get_df_from_url(annex2)
df_wholesale = get_df_from_url(annex3)
df_loss = get_df_from_url(annex4)

def merge_all_df(df_transaction: pd.DataFrame = df_transaction,
                 df_item: pd.DataFrame = df_item,
                 df_wholesale: pd.DataFrame = df_wholesale,
                 df_loss: pd.DataFrame = df_loss):

    df_merge = df_transaction.merge(df_item,
                                    how = 'left',
                                    on = 'Item Code')
    df_merge = df_merge.merge(df_wholesale,
                            how = 'left',
                            on = ['Date', 'Item Code'])
    df_merge = df_merge.merge(df_loss.drop('Item Name', axis = 1),
                            how = 'left',
                            on = 'Item Code')
    
    return df_merge

def create_date_feature(df: pd.DataFrame,
                       date_column: str):
    # Convert the 'Date' column to datetime type
    df['Date'] = pd.to_datetime(df['Date'])
    df['Date'] = df['Date'].astype('datetime64[ns]')

    # Extract date components
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek  # Monday is 0 and Sunday is 6
    df['WeekNumber'] = df['Date'].dt.isocalendar().week
    df['WeekNumber'] = df['WeekNumber'].astype('int64')
    df['Quarter'] = df['Date'].dt.quarter
    
    return df

def is_holiday(date_value):
    china_holidays = holidays.country_holidays('CN')
    value = 1 if date_value in china_holidays else 0
    return value

def create_lag_feature(df: pd.DataFrame,
                       days: int):
    
    df[f'lag_{days}'] = df['Sales'].shift(days)
    df[f'lag_{days}'] = df[f'lag_{days}'].astype('float64')
    
    return df

def get_chinese_new_year_dates(start_year, end_year):
    chinese_new_year_dates = []

    for year in range(start_year, end_year + 1):
        lunar_date = LunarDate(year, 1, 1)
        cny_date = pd.Timestamp(lunar_date.toSolarDate())

        chinese_new_year_dates.append({'Chinese New Year Date': cny_date})

    chinese_new_year_df = pd.DataFrame(chinese_new_year_dates)
    chinese_new_year_df['Last Chinese New Year Date'] = chinese_new_year_df['Chinese New Year Date'].shift(1)
    chinese_new_year_df.dropna(subset=['Last Chinese New Year Date'], inplace = True)
    
    return chinese_new_year_df

def get_chinese_new_year_period(df, days):
    days = 7
    # new_years = df['Chinese New Year Date'].unique()
    # for new_year in new_years:
    new_year_dict = dict(zip(df['Chinese New Year Date'], df['Last Chinese New Year Date']))
    for this_year, last_year in new_year_dict.items():
        for day in range(-days, days + 1):
            if day != 0:
                df = pd.concat([df, pd.DataFrame({'Chinese New Year Date': [this_year + timedelta(days = day)],
                                                  'Last Chinese New Year Date': [last_year + timedelta(days = day)]})],
                               axis = 0,
                               ignore_index = True)
    return df

def create_last_new_year_feature(df: pd.DataFrame,
                                 days_before_after: int):
    
    # Get start and end year to generate Chinese New Year DataFrame from input data
    df['Date'] = df['Date'].astype('datetime64[ns]')
    df['Date'] = pd.to_datetime(df['Date'])
    start_year = df['Date'].min().year
    end_year = df['Date'].max().year
    
    # Get Chinese New Year DataFrame
    df_chinese_new_year = get_chinese_new_year_dates(start_year = start_year,
                                                     end_year = end_year)
    df_chinese_new_year_period = get_chinese_new_year_period(df = df_chinese_new_year,
                                                             days = 7)

    # Cast date column
    df['Date'] = pd.to_datetime(df['Date'])

    # Merge current new year date with last new year date
    df_last_new_year = df.copy()
    df_last_new_year = df_last_new_year.merge(df_chinese_new_year_period,
                                              how = 'left',
                                              left_on = 'Date',
                                              right_on = 'Last Chinese New Year Date')
    # Cleanup DataFrame
    df_last_new_year = df_last_new_year[~df_last_new_year['Last Chinese New Year Date'].isna()][['Chinese New Year Date', 'Category Code', 'Sales']]

    # Change column name before merge avoiding duplicated columns
    df_last_new_year.columns = ['Date', 'Category Code', 'Last New Year Sales']

    # Merge current new year with current new year to get "Last New Year Sales"
    df = df.merge(df_last_new_year,
                  on = ['Date', 'Category Code'],
                  how = 'left')

    return df

df_merge = merge_all_df()
# Create Sales column
df_merge['Sales'] = df_merge['Unit Selling Price (RMB/kg)'] * df_merge['Quantity Sold (kilo)']

# Aggregate as new table
df_agg = df_merge.groupby(['Date', 'Category Code'], as_index = False)['Sales'].sum()
df_agg.head()

Unnamed: 0,Date,Category Code,Sales
0,2020-07-01,1011010101,1503.7896
1,2020-07-01,1011010201,592.53
2,2020-07-01,1011010402,70.2838
3,2020-07-01,1011010501,176.818
4,2020-07-01,1011010504,759.9902


In [209]:
def category_feature_engineering(df_category: pd.DataFrame):
    df_category = create_date_feature(df_category,
                                    date_column = 'Date')
    df_category = create_lag_feature(df = df_category,
                                    days = 364)
    df_category = create_lag_feature(df = df_category,
                                    days = 7)
    df_category = create_lag_feature(df = df_category,
                                    days = 14)
    df_category = create_lag_feature(df = df_category,
                                    days = 28)
    df_category = create_last_new_year_feature(df = df_category,
                                            days_before_after = 7)
    df_category['is_holiday'] = df_category['Date'].apply(is_holiday)

    df_final = df_category.drop(['Date', 'Category Code'], axis = 1)
    df_final['Sales'] = df_final['Sales'].astype('float64')
    df_final['Last New Year Sales'] = df_final['Last New Year Sales'].astype('float64')

    return df_final

def category_date_feature(df_category: pd.DataFrame):
    df_category = create_date_feature(df_category,
                                    date_column = 'Date')
    df_category['is_holiday'] = df_category['Date'].apply(is_holiday)

    return df_category

def category_lag_feature(df_category: pd.DataFrame):
    df_category = create_lag_feature(df = df_category,
                                    days = 364)
    df_category = create_lag_feature(df = df_category,
                                    days = 7)
    df_category = create_lag_feature(df = df_category,
                                    days = 14)
    df_category = create_lag_feature(df = df_category,
                                    days = 28)
    df_category = create_last_new_year_feature(df = df_category,
                                            days_before_after = 7)

    df_final = df_category.drop(['Date', 'Category Code'], axis = 1)

    return df_final

In [210]:
start = dt.date(2024, 1, 1)
end = dt.date(2024, 2, 1)
date_generated = [start + dt.timedelta(days = x) for x in range(0, (end-start).days + 1)]
df_date_future = pd.DataFrame(date_generated, columns = ['Date'])

TypeError: descriptor 'date' for 'datetime.datetime' objects doesn't apply to a 'int' object

In [211]:
categories = ['1011010101', '1011010201', '1011010402', '1011010501', '101101050']

In [7]:
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define parameter lists
n_estimators_list = [5000]
learning_rate_list = [0.001]
max_depth_list = [5]
subsample_list = [0.5]
colsample_bytree_list = [1.0]
early_stopping_rounds_list = [200]

# Calculate the total number of combinations
total_combinations = (
    len(n_estimators_list) *
    len(learning_rate_list) *
    len(max_depth_list) *
    len(subsample_list) *
    len(colsample_bytree_list) *
    len(early_stopping_rounds_list)
)

scaler = StandardScaler()
# categories = [1011010101]
categories = df_agg['Category Code'].unique()
category_best_params = {}
category_best_score = {'Category': [],
                       'RMSE': [],
                       'Normalized RMSE': [],
                       'MAPE': []}
for category in categories:
    print('\n######################################################')
    print(f'################ CATEGORY: {category} ################')
    print('######################################################\n')

    df_category = df_agg[df_agg['Category Code'] == category]

    # Feature Engineering
    df_final = category_feature_engineering(df_category = df_category)

    # Create a progress bar for the overall progress
    overall_progress_bar = tqdm(total = total_combinations,
                                desc = 'Overall Progress',
                                unit = 'combination')
    
    # Loop all parameters
    best_params = {}
    best_score = None
    for n_estimators in n_estimators_list:
        for learning_rate in learning_rate_list:
            for max_depth in max_depth_list:
                for subsample in subsample_list:
                    for colsample_bytree in colsample_bytree_list:
                        for early_stopping_rounds in early_stopping_rounds_list:
                             
                            # Set up XGBRegressor
                            xgb_r = xgb.XGBRegressor(
                                n_estimators = n_estimators,
                                eval_metrics = mean_squared_error,
                                learning_rate = learning_rate,
                                max_depth = max_depth,
                                subsample = subsample,
                                colsample_bytree = colsample_bytree,
                                early_stopping_rounds = early_stopping_rounds,
                                random_state = 244
                            )

                            # Define X, y
                            X = df_final.drop('Sales', axis = 1)
                            y = df_final[['Sales']]

                            rmse_list = []
                            normalized_rmse_list = []
                            mape_list = []
                            # Cross Validate
                            for train_index, test_index in tscv.split(X):
                                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                                # Scale X
                                X_train[X_train.columns] = scaler.fit_transform(X_train)
                                X_test[X_test.columns] = scaler.transform(X_test)

                                # Fit the model
                                xgb_r.fit(
                                    X = X_train,
                                    y = y_train,
                                    eval_set = [(X_train, y_train), (X_test, y_test)],
                                    verbose = False
                                )

                                # Predict
                                y_pred = xgb_r.predict(X_test)

                                # Calculate score
                                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                                normalized_rmse = rmse / (float(y_test.max()) - float(y_test.min()))
                                mape = mean_absolute_percentage_error(y_true = y_test, y_pred = y_pred)

                                # Store RMSE for each fold
                                rmse_list.append(rmse)
                                normalized_rmse_list.append(normalized_rmse)
                                mape_list.append(mape)

                            
                            new_rmse = sum(rmse_list) / len(rmse_list)
                            new_normalized_rmse = sum(normalized_rmse_list) / len(normalized_rmse_list)
                            new_mape = sum(mape_list) / len(mape_list)
                            # print(f'n_estimators: {n_estimators} | learning_rate: {learning_rate} | max_depth: {max_depth} | subsample: {subsample} | colsample_bytree: {colsample_bytree} | early_stopping_rounds: {early_stopping_rounds} >>> RMSE: {new_rmse:.2f} | Normalized RMSE: {new_normalized_rmse:.2f} | MAPE: {new_mape:.2f} %')
                            if best_score is None:
                                best_score = new_rmse
                                best_normalized_rmse = new_normalized_rmse
                                best_mape = new_mape

                                # Update best_params
                                best_params['n_estimators'] = n_estimators
                                best_params['learning_rate'] = learning_rate
                                best_params['max_depth'] = max_depth
                                best_params['subsample'] = subsample
                                best_params['colsample_bytree'] = colsample_bytree
                                best_params['early_stopping_rounds'] = early_stopping_rounds
                            elif new_rmse < best_score:
                                best_score = new_rmse
                                best_normalized_rmse = new_normalized_rmse
                                best_mape = new_mape

                                # Update best_params
                                best_params['n_estimators'] = n_estimators
                                best_params['learning_rate'] = learning_rate
                                best_params['max_depth'] = max_depth
                                best_params['subsample'] = subsample
                                best_params['colsample_bytree'] = colsample_bytree
                                best_params['early_stopping_rounds'] = early_stopping_rounds
                            
                            # Update the overall progress bar
                            overall_progress_bar.update(1)

    # Close the overall progress bar
    overall_progress_bar.close()

    # Print the best parameters and corresponding mean squared error
    print(f'\nCategory {category} Best Parameters: {best_params}')
    print(f'Category {category} Best Score: {best_score}')

    # Store best_params for each category
    category_best_params[category] = best_params
    category_best_score['Category'].append(category)
    category_best_score['RMSE'].append(best_score)
    category_best_score['Normalized RMSE'].append(best_normalized_rmse)
    category_best_score['MAPE'].append(best_mape)

# Summarize
print('\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUMMARY <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
display(pd.DataFrame(category_best_params))
display(pd.DataFrame(category_best_score))


######################################################
################ CATEGORY: 1011010101 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:21<00:00, 21.05s/combination]



Category 1011010101 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010101 Best Score: 316.97717263735973

######################################################
################ CATEGORY: 1011010201 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:19<00:00, 19.46s/combination]



Category 1011010201 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010201 Best Score: 164.16392822078865

######################################################
################ CATEGORY: 1011010402 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:11<00:00, 11.58s/combination]



Category 1011010402 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010402 Best Score: 228.17485568859695

######################################################
################ CATEGORY: 1011010501 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:10<00:00, 10.81s/combination]



Category 1011010501 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010501 Best Score: 101.89716264057903

######################################################
################ CATEGORY: 1011010504 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:11<00:00, 11.52s/combination]



Category 1011010504 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010504 Best Score: 415.45312249214413

######################################################
################ CATEGORY: 1011010801 ################
######################################################



Overall Progress: 100%|██████████| 1/1 [00:15<00:00, 15.24s/combination]


Category 1011010801 Best Parameters: {'n_estimators': 5000, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stopping_rounds': 200}
Category 1011010801 Best Score: 272.9629753738954

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUMMARY <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<





Unnamed: 0,1011010101,1011010201,1011010402,1011010501,1011010504,1011010801
n_estimators,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
learning_rate,0.001,0.001,0.001,0.001,0.001,0.001
max_depth,5.0,5.0,5.0,5.0,5.0,5.0
subsample,0.5,0.5,0.5,0.5,0.5,0.5
colsample_bytree,1.0,1.0,1.0,1.0,1.0,1.0
early_stopping_rounds,200.0,200.0,200.0,200.0,200.0,200.0


Unnamed: 0,Category,RMSE,Normalized RMSE,MAPE
0,1011010101,316.977173,0.114963,0.260083
1,1011010201,164.163928,0.143043,0.601241
2,1011010402,228.174856,0.12793,0.622419
3,1011010501,101.897163,0.120202,0.59592
4,1011010504,415.453122,0.102336,0.407825
5,1011010801,272.962975,0.125069,0.408118


In [212]:
def create_future_date_dataframe(df: pd.DataFrame,
                                 date_column: str,
                                 days: int = 60):
    try:
        start = dt.strptime(df[date_column].max(), '%Y-%m-%d').date() + timedelta(days = 1)
    except:
        start = df[date_column].max().date() + timedelta(days = 1)
    end = start + timedelta(days = days - 1)
    date_generated = [start + timedelta(days = x) for x in range(0, (end - start).days + 1)]
    df_date_future = pd.DataFrame(date_generated, columns = [date_column])
    return df_date_future

df_date_future = create_future_date_dataframe(df = df_category,
                                              date_column = 'Date',
                                              days = 10)
df_date_future.head()

Unnamed: 0,Date
0,2023-07-01
1,2023-07-02
2,2023-07-03
3,2023-07-04
4,2023-07-05


In [218]:
# Break DataFrame to Category
df_category = df_agg[df_agg['Category Code'] == category]
df_category.reset_index(inplace = True, drop = True)

# Feature Engineering
df_train = df_category.copy()
df_train = category_feature_engineering(df_category = df_train)

# Split train and evaluate dataset
train_idx_split = int(df_category.shape[0]*0.8)
eval_idx_split = (df_category.shape[0])
train = df_train.iloc[0:train_idx_split,:]
eval = df_train.iloc[train_idx_split:eval_idx_split,:]

# Create X and y
X_train = train.drop('Sales', axis = 1)
y_train = train[['Sales']]
X_test = eval.drop('Sales', axis = 1)
y_test = eval[['Sales']]

# Train
xgb_r = xgb.XGBRegressor(**category_best_params[category], random_state = 244)
xgb_r.fit(X_train,
          y_train,
          eval_set = [(X_train, y_train), (X_test, y_test)],
          verbose = False)

# Loop through each date
df_future = df_date_future.copy()
df_future['Category Code'] = category
df_future['Sales'] = None
df_new_category = df_category.copy()
for i in range(df_future.shape[0]):
    df_new_category = pd.concat([df_category, df_future.iloc[0:i+1,:]], axis = 0)
    df_new_category = category_feature_engineering(df_category = df_new_category)

    # Predict future date
    X_future = df_new_category.iloc[-1:].drop('Sales', axis =1)
    y_pred = xgb_r.predict(X_future)

    # Update y_pred to the last row of Sales column (new date)
    df_future.iloc[i:i+1, df_future.columns.get_loc('Sales')] = y_pred[0]

# Update df_new_category last loop
df_new_category.iloc[-1, df_new_category.columns.get_loc('Sales')] = y_pred[0]



In [221]:
df_new_category.iloc[-20:-1,:]

Unnamed: 0,Sales,Year,Month,Day,DayOfWeek,WeekNumber,Quarter,lag_364,lag_7,lag_14,lag_28,Last New Year Sales,is_holiday
1075,566.102,2023,6,21,2,25,2,450.1206,635.9628,719.88,656.304,,0
1076,859.332,2023,6,22,3,25,2,488.562,593.5972,625.0968,597.0716,,1
1077,855.5916,2023,6,23,4,25,2,522.9046,900.0056,721.6428,783.0974,,1
1078,810.902,2023,6,24,5,25,2,824.6844,1294.141,1288.7028,1360.7152,,0
1079,522.7804,2023,6,25,6,25,2,829.0606,1301.476,1148.2452,1353.1002,,0
1080,430.5642,2023,6,26,0,26,2,557.3336,630.4718,645.9176,746.1758,,0
1081,642.0896,2023,6,27,1,26,2,512.5522,620.0676,775.7412,771.464,,0
1082,630.4388,2023,6,28,2,26,2,484.8422,566.102,635.9628,722.856,,0
1083,646.1398,2023,6,29,3,26,2,382.3718,859.332,593.5972,706.9904,,0
1084,667.7684,2023,6,30,4,26,2,483.137,855.5916,900.0056,853.786,,0


In [141]:
# start = dt.date(2024, 1, 1)
def create_future_date_dataframe(df: pd.DataFrame,
                                 date_column: str,
                                 days: int = 60):
    start = dt.strptime(df[date_column].max(), '%Y-%m-%d').date() + timedelta(days = 1)
    end = start + timedelta(days = days - 1)
    date_generated = [start + timedelta(days = x) for x in range(0, (end - start).days + 1)]
    df_date_future = pd.DataFrame(date_generated, columns = [date_column])
    return df_date_future

df_date_future = create_future_date_dataframe(df = df_category)

Unnamed: 0,Date
0,2023-07-01
1,2023-07-02
2,2023-07-03
3,2023-07-04
4,2023-07-05
...,...
56,2023-08-26
57,2023-08-27
58,2023-08-28
59,2023-08-29


In [152]:
X_train.dtypes

Year                     int64
Month                    int64
Day                      int64
DayOfWeek                int64
WeekNumber              UInt32
Quarter                  int64
lag_364                float64
lag_7                  float64
lag_14                 float64
lag_28                 float64
Last New Year Sales    float64
is_holiday               int64
dtype: object