In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time Series Libraries
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA

# Machine Learning Frameworks
import xgboost as xgb

In [3]:
train=pd.read_csv("Data/train.csv")
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [4]:
test=pd.read_csv("Data/test.csv")
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [5]:
oil_data = pd.read_csv('Data/oil.csv')
holidays_data = pd.read_csv('Data/holidays_events.csv')
store_data = pd.read_csv('Data/stores.csv')
transactions_data = pd.read_csv('Data/transactions.csv')

In [6]:
oil_data.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [7]:
holidays_data["type"].unique()

array(['Holiday', 'Transfer', 'Additional', 'Bridge', 'Work Day', 'Event'],
      dtype=object)

In [8]:
store_data.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [9]:
transactions_data.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


# Data processing

In [10]:
train['dataset'] = 'train'
test['dataset'] = 'test'

# concatenate the datasets with the added 'dataset' column
df = pd.concat([train, test], axis=0).reset_index(drop=True)

In [11]:
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dataset
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,train
1,1,2013-01-01,1,BABY CARE,0.0,0,train
2,2,2013-01-01,1,BEAUTY,0.0,0,train
3,3,2013-01-01,1,BEVERAGES,0.0,0,train
4,4,2013-01-01,1,BOOKS,0.0,0,train


In [12]:
print(df.columns)
print(oil_data.columns)
print(holidays_data.columns)
print(store_data.columns)
print(transactions_data.columns)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'dataset'], dtype='object')
Index(['date', 'dcoilwtico'], dtype='object')
Index(['date', 'type', 'locale', 'locale_name', 'description', 'transferred'], dtype='object')
Index(['store_nbr', 'city', 'state', 'type', 'cluster'], dtype='object')
Index(['date', 'store_nbr', 'transactions'], dtype='object')


In [13]:
holidays_data.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [14]:
holidays_data[holidays_data['transferred'] == True]

Unnamed: 0,date,type,locale,locale_name,description,transferred
19,2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
72,2013-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
135,2014-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
255,2016-05-24,Holiday,National,Ecuador,Batalla de Pichincha,True
266,2016-07-25,Holiday,Local,Guayaquil,Fundacion de Guayaquil,True
268,2016-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,True
297,2017-01-01,Holiday,National,Ecuador,Primer dia del ano,True
303,2017-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,True
312,2017-05-24,Holiday,National,Ecuador,Batalla de Pichincha,True
324,2017-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,True


In [15]:
holidays_data[holidays_data['type'] == 'Transfer']

Unnamed: 0,date,type,locale,locale_name,description,transferred
20,2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
73,2013-10-11,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
136,2014-10-10,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
256,2016-05-27,Transfer,National,Ecuador,Traslado Batalla de Pichincha,False
265,2016-07-24,Transfer,Local,Guayaquil,Traslado Fundacion de Guayaquil,False
269,2016-08-12,Transfer,National,Ecuador,Traslado Primer Grito de Independencia,False
298,2017-01-02,Transfer,National,Ecuador,Traslado Primer dia del ano,False
304,2017-04-13,Transfer,Local,Cuenca,Fundacion de Cuenca,False
313,2017-05-26,Transfer,National,Ecuador,Traslado Batalla de Pichincha,False
325,2017-08-11,Transfer,National,Ecuador,Traslado Primer Grito de Independencia,False


In [16]:
holidays_data[holidays_data['type'] == 'Transfer']

Unnamed: 0,date,type,locale,locale_name,description,transferred
20,2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
73,2013-10-11,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
136,2014-10-10,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
256,2016-05-27,Transfer,National,Ecuador,Traslado Batalla de Pichincha,False
265,2016-07-24,Transfer,Local,Guayaquil,Traslado Fundacion de Guayaquil,False
269,2016-08-12,Transfer,National,Ecuador,Traslado Primer Grito de Independencia,False
298,2017-01-02,Transfer,National,Ecuador,Traslado Primer dia del ano,False
304,2017-04-13,Transfer,Local,Cuenca,Fundacion de Cuenca,False
313,2017-05-26,Transfer,National,Ecuador,Traslado Batalla de Pichincha,False
325,2017-08-11,Transfer,National,Ecuador,Traslado Primer Grito de Independencia,False


In [17]:
holidays_data.loc[holidays_data['type'] == 'Transfer', 'type'] = 'Holiday'

In [18]:
holidays_data['Holiday summary'] = (
    holidays_data['type'].astype(str) + " " +
    holidays_data['locale'].astype(str) + " " +
    holidays_data['locale_name'].astype(str) + " " 
    # holidays_data['description'].astype(str)
)

In [19]:
holidays_data= holidays_data.drop(columns=['type', 'locale', 'locale_name', 'description'] , errors='ignore' )

In [20]:
holidays_data['Holiday summary'].value_counts()

Holiday summary
Holiday National Ecuador                            68
Event National Ecuador                              56
Additional National Ecuador                         40
Holiday Local Latacunga                             12
Holiday Local Riobamba                              12
Holiday Local Guaranda                              12
Holiday Local Ambato                                12
Holiday Local Cuenca                                 7
Holiday Local Quito                                  7
Holiday Local Ibarra                                 7
Holiday Local Puyo                                   6
Holiday Local Libertad                               6
Holiday Regional Cotopaxi                            6
Holiday Local Manta                                  6
Holiday Local Esmeraldas                             6
Holiday Local Cayambe                                6
Holiday Local El Carmen                              6
Holiday Local Santo Domingo                      

In [21]:
store_data

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [22]:
transactions_data

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [23]:
df = df.merge(oil_data, on='date', how='left')
# Merge holiday data on date
df = df.merge(holidays_data, on='date', how='left')
# Merge store data on store number
df = df.merge(store_data, on='store_nbr', how='left')
# Merge transaction data on store number and date
df = df.merge(transactions_data, on=['date', 'store_nbr'], how='left')

In [24]:
df.sample(10)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dataset,dcoilwtico,transferred,Holiday summary,city,state,type,cluster,transactions
2619764,2575214,2016-12-19,15,PET SUPPLIES,1.0,0,train,52.13,,,Ibarra,Imbabura,C,15,1599.0
2280593,2244953,2016-06-16,48,PREPARED FOODS,96.579,0,train,46.14,,,Quito,Pichincha,A,14,2313.0
1351049,1331447,2015-01-20,17,PREPARED FOODS,46.0,0,train,46.79,,,Quito,Pichincha,C,12,1211.0
1945994,1921046,2015-12-17,10,HOME APPLIANCES,0.0,0,train,34.98,,,Quito,Pichincha,C,15,914.0
2967564,2915886,2017-06-29,24,CELEBRATION,23.0,0,train,44.88,,,Guayaquil,Guayas,D,1,2094.0
36797,36797,2013-01-21,41,BEAUTY,2.0,0,train,,,,Machala,El Oro,D,4,715.0
316570,312829,2013-06-25,36,"LIQUOR,WINE,BEER",17.0,0,train,95.25,False,Holiday Local Machala,Libertad,Guayas,E,10,917.0
1189496,1173458,2014-10-22,34,FROZEN FOODS,112.177,0,train,80.52,,,Guayaquil,Guayas,B,6,2050.0
2931814,2883700,2017-06-11,20,POULTRY,579.486,0,train,,,,Quito,Pichincha,B,6,2144.0
1427292,1407690,2015-03-03,7,DELI,200.192,0,train,50.43,,,Quito,Pichincha,D,8,1741.0


In [25]:
print("\nSummary Statistics for Numerical Columns:")
df.describe()


Summary Statistics for Numerical Columns:


Unnamed: 0,id,store_nbr,sales,onpromotion,dcoilwtico,cluster,transactions
count,3082860.0,3082860.0,3054348.0,3082860.0,2120580.0,3082860.0,2805231.0
mean,1518250.0,27.5,359.0209,2.657692,67.8064,8.481481,1697.071
std,874291.2,15.58579,1107.286,12.36626,25.64571,4.649735,966.8317
min,0.0,1.0,0.0,0.0,26.19,1.0,5.0
25%,761804.8,14.0,0.0,0.0,46.46,4.0,1046.0
50%,1521828.0,27.5,11.0,0.0,53.25,8.5,1395.0
75%,2272940.0,41.0,196.011,0.0,95.72,13.0,2081.0
max,3029399.0,54.0,124717.0,741.0,110.62,17.0,8359.0


In [26]:
df["dcoilwtico"]=df["dcoilwtico"].fillna(method="ffill").fillna(method="bfill")

  df["dcoilwtico"]=df["dcoilwtico"].fillna(method="ffill").fillna(method="bfill")


In [27]:
df["dcoilwtico"].isnull().sum()

0

In [28]:
df['transactions'] = df.groupby('family')['transactions'].transform(
    lambda x: x.fillna(x.median())
)

In [29]:
df["transactions"].isnull().sum()

0

In [30]:
df["is_holiday"]=df["Holiday summary"].fillna('Not Holiday').apply(lambda x: 0 if x == 'Not Holiday' else 1)

In [31]:
df["is_holiday"].unique().sum()

1

In [32]:
df.isnull().sum()

id                       0
date                     0
store_nbr                0
family                   0
sales                28512
onpromotion              0
dataset                  0
dcoilwtico               0
transferred        2578554
Holiday summary    2578554
city                     0
state                    0
type                     0
cluster                  0
transactions             0
is_holiday               0
dtype: int64

In [33]:
df['transactions'] = df.groupby('family')['transactions'].transform(
    lambda x: x.fillna(x.median())
)

In [34]:
df['transactions'].fillna(method='ffill', inplace=True)
df['transactions'].fillna(df['transactions'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transactions'].fillna(method='ffill', inplace=True)
  df['transactions'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transactions'].fillna(df['transactions'].mean(), inplace=True)


In [35]:
df = df.drop(columns=['transferred', 'Holiday summary'], errors='ignore')

**convert date time col**

In [36]:
import datetime as dt

In [37]:
df['date'] = pd.to_datetime(df['date'])
df["year"]=df['date'].dt.year
df["month"]=df["date"].dt.month
df["day"]=df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek

In [38]:
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dataset,dcoilwtico,city,state,type,cluster,transactions,is_holiday,year,month,day,day_of_week
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,train,93.14,Quito,Pichincha,D,13,1395.0,1,2013,1,1,1
1,1,2013-01-01,1,BABY CARE,0.0,0,train,93.14,Quito,Pichincha,D,13,1395.0,1,2013,1,1,1
2,2,2013-01-01,1,BEAUTY,0.0,0,train,93.14,Quito,Pichincha,D,13,1395.0,1,2013,1,1,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,train,93.14,Quito,Pichincha,D,13,1395.0,1,2013,1,1,1
4,4,2013-01-01,1,BOOKS,0.0,0,train,93.14,Quito,Pichincha,D,13,1395.0,1,2013,1,1,1


**encode categorical columns**

In [39]:
df['family'].value_counts()
df.groupby('family', as_index=False).agg(
    sales_count=('sales', 'count'),
    sales_mean=('sales', 'mean')
).sort_values(by='sales_mean', ascending=False)[:10]

Unnamed: 0,family,sales_count,sales_mean
12,GROCERY I,92556,3790.432797
3,BEVERAGES,92556,2394.912701
30,PRODUCE,92556,1355.373698
7,CLEANING,92556,1074.171518
8,DAIRY,92556,711.175991
5,BREAD/BAKERY,92556,464.150612
28,POULTRY,92556,351.078816
24,MEATS,92556,341.965905
25,PERSONAL CARE,92556,271.192381
9,DELI,92556,265.629746


In [40]:
df.groupby('store_nbr', as_index=False).agg(
    sales_count=('sales', 'count'),
    sales_mean=('sales', 'mean')
).sort_values(by='sales_mean', ascending=False)[:10]

Unnamed: 0,store_nbr,sales_count,sales_mean
43,44,56562,1120.118405
44,45,56562,984.565998
46,47,56562,919.777871
2,3,56562,911.098054
48,49,56562,784.039156
45,46,56562,756.775349
47,48,56562,649.584599
50,51,56562,594.106667
7,8,56562,550.264615
49,50,56562,517.551554


In [41]:
df=pd.get_dummies(df,columns=["store_nbr","family"],dtype=int, drop_first=True)

In [42]:
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder




# Function for Target Encoding multiple categorical columns
def target_encoding_multiple_columns(df, target_column, categorical_columns, dataset_column='dataset'):
    
    for col in categorical_columns:
        # Calculate target mean for each category in the column using training data
        target_mean = df[df[dataset_column] == 'train'].groupby(col)[target_column].mean()

        # Map the target mean encoding to the entire dataset
        df[f'{col}_encoded'] = df[col].map(target_mean)

        # Handle missing values for categories not present in training
        df[f'{col}_encoded'].fillna(df[target_column].mean(), inplace=True)

    return df

# Example usage: Apply Target Encoding to multiple columns
categorical_columns = ['month', 'day', 'year', 'day_of_week']
df = target_encoding_multiple_columns(df, target_column='sales', categorical_columns=categorical_columns)


# Drop unnecessary columns after encoding
columns_to_drop = [
    'locale_name', 'description', 'transferred', 'city', 'state', 'family',
    'month', 'day_of_week', 'day', 'year', 'cluster', 'type'
]
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
df.columns

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f'{col}_encoded'].fillna(df[target_column].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f'{col}_encoded'].fillna(df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f'{col}_encoded'].fillna(df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

Index(['id', 'date', 'sales', 'onpromotion', 'dataset', 'dcoilwtico',
       'transactions', 'is_holiday', 'store_nbr_2', 'store_nbr_3',
       'store_nbr_4', 'store_nbr_5', 'store_nbr_6', 'store_nbr_7',
       'store_nbr_8', 'store_nbr_9', 'store_nbr_10', 'store_nbr_11',
       'store_nbr_12', 'store_nbr_13', 'store_nbr_14', 'store_nbr_15',
       'store_nbr_16', 'store_nbr_17', 'store_nbr_18', 'store_nbr_19',
       'store_nbr_20', 'store_nbr_21', 'store_nbr_22', 'store_nbr_23',
       'store_nbr_24', 'store_nbr_25', 'store_nbr_26', 'store_nbr_27',
       'store_nbr_28', 'store_nbr_29', 'store_nbr_30', 'store_nbr_31',
       'store_nbr_32', 'store_nbr_33', 'store_nbr_34', 'store_nbr_35',
       'store_nbr_36', 'store_nbr_37', 'store_nbr_38', 'store_nbr_39',
       'store_nbr_40', 'store_nbr_41', 'store_nbr_42', 'store_nbr_43',
       'store_nbr_44', 'store_nbr_45', 'store_nbr_46', 'store_nbr_47',
       'store_nbr_48', 'store_nbr_49', 'store_nbr_50', 'store_nbr_51',
       'store_nbr

In [43]:
df.to_csv("Data_processed.csv")

# test train split 

In [44]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 5: Split Data Based on Time Series Order

# Separate train and test datasets
train_df = df[df['dataset'] == 'train'].drop(columns=['dataset'], errors='ignore')
test_df = df[df['dataset'] == 'test'].drop(columns=['dataset'], errors='ignore')


# Drop unnecessary columns from both datasets
train_df = train_df.drop(columns=['transactions', 'id'], errors='ignore')
test_df = test_df.drop(columns=['transactions'], errors='ignore')

# Sort training data by date to preserve time series order
train_df = train_df.sort_values(by='date')

# Define the split point for time series validation
split_ratio = 0.99  # Adjust as needed
split_index = int(len(train_df) * split_ratio)

# Create training and validation sets
X_train = train_df.iloc[:split_index].drop(columns=['sales'])
y_train = train_df.iloc[:split_index]['sales']
X_val = train_df.iloc[split_index:].drop(columns=['sales'])
y_val = train_df.iloc[split_index:]['sales']

# Log transform the target variables to stabilize variance
y_train = np.log1p(y_train)  # Log transform training target
y_val = np.log1p(y_val)      # Log transform validation target

# Drop the 'date' column after feature extraction
X_train = X_train.drop(columns=['date'], errors='ignore')
X_val = X_val.drop(columns=['date'], errors='ignore')

X_test = test_df.drop(columns=['sales'], errors='ignore')

In [45]:
X_val

Unnamed: 0,onpromotion,dcoilwtico,is_holiday,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,store_nbr_6,store_nbr_7,store_nbr_8,...,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,month_encoded,day_encoded,year_encoded,day_of_week_encoded
3023080,0,49.72,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,376.414091,345.823234,481.166458,434.785811
3023104,18,49.72,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,376.414091,345.823234,481.166458,434.785811
3023103,0,49.72,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,376.414091,345.823234,481.166458,434.785811
3023102,4,49.72,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,376.414091,345.823234,481.166458,434.785811
3023101,0,49.72,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,376.414091,345.823234,481.166458,434.785811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053153,0,47.57,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,336.992535,348.736707,481.166458,319.920782
3053152,8,47.57,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,336.992535,348.736707,481.166458,319.920782
3053151,0,47.57,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,336.992535,348.736707,481.166458,319.920782
3053162,0,47.57,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,336.992535,348.736707,481.166458,319.920782


# Model Selection and Implementation

In [46]:
from catboost import CatBoostRegressor, Pool

# Assuming train_pool and val_pool are already defined
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)

# Define the CatBoost model with initial parameters
catboost_model = CatBoostRegressor(
    iterations=10000,          
    learning_rate=0.1,        
    depth=8,                  
    loss_function='RMSE',     
    eval_metric='RMSE',       
    random_seed=42,           
    verbose=100,               
    early_stopping_rounds=50  
)

train_loop_count = 100  # Number of training stages
init_model = None  # Start without an initial model

for i in range(train_loop_count):
    print(f"\nTraining Stage {i + 1}/{train_loop_count}\n")
    catboost_model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=50,
        verbose=50,
        use_best_model=True,
        init_model=init_model  # Use the model from the previous stage
    )
    # Update the initial model for the next training stage
    init_model = catboost_model


Training Stage 1/100

0:	learn: 2.5592806	test: 2.4453943	best: 2.4453943 (0)	total: 89.8ms	remaining: 14m 58s
50:	learn: 1.2925017	test: 1.0361521	best: 1.0361521 (50)	total: 1.79s	remaining: 5m 49s
100:	learn: 1.0648979	test: 0.8910148	best: 0.8910148 (100)	total: 3.49s	remaining: 5m 41s
150:	learn: 0.9626668	test: 0.7977793	best: 0.7977793 (150)	total: 5.16s	remaining: 5m 36s
200:	learn: 0.9000886	test: 0.7510573	best: 0.7510573 (200)	total: 6.83s	remaining: 5m 33s
250:	learn: 0.8541511	test: 0.7099348	best: 0.7099348 (250)	total: 8.47s	remaining: 5m 29s
300:	learn: 0.8129829	test: 0.6840703	best: 0.6840703 (300)	total: 10.1s	remaining: 5m 26s
350:	learn: 0.7817460	test: 0.6615091	best: 0.6615091 (350)	total: 11.8s	remaining: 5m 24s
400:	learn: 0.7549967	test: 0.6480726	best: 0.6480726 (400)	total: 13.5s	remaining: 5m 23s
450:	learn: 0.7304251	test: 0.6373942	best: 0.6373942 (450)	total: 15.2s	remaining: 5m 22s
500:	learn: 0.7097392	test: 0.6267962	best: 0.6267962 (500)	total: 16.9

# Submission for forecasting

In [47]:
test_features = test_df.drop(columns=['id', 'date'], errors='ignore')  # Drop unnecessary columns
test_features = test_features.reindex(columns=X_train.columns, fill_value=0)

In [48]:
# Predict using the trained CatBoost model
test_df['sales'] = catboost_model.predict(test_features)

In [49]:
test_df['sales'] = np.expm1(test_df['sales'])

In [50]:
# Create submission file
submission = test_df[['id', 'sales']]  # Include 'id' and the predicted target column
submission.to_csv('submission_catboost.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv


In [60]:
import numpy as np

def rmlse(y_true, y_pred):
    """
    Calculate Root Mean Log Squared Error (RMLSE).

    Parameters:
    y_true (array-like): Actual values (true sales)
    y_pred (array-like): Predicted values (sales predicted by CatBoost model)

    Returns:
    float: RMLSE value
    """
    # Ensure no negative values or NaNs in the data by clipping values to be >= 0
    y_true = np.maximum(y_true, 0)
    y_pred = np.maximum(y_pred, 0)

    # Check if there are any NaN or infinite values
    if np.any(np.isnan(y_true)) or np.any(np.isnan(y_pred)):
        return np.nan
    
    # Add 1 to avoid log(0) issues
    log_true = np.log(y_true + 1)
    log_pred = np.log(y_pred + 1)
    
    # Calculate the squared differences and compute the mean
    mean_squared_log_error = np.mean((log_true - log_pred) ** 2)
    
    # Return the square root of the mean squared log error
    return np.sqrt(mean_squared_log_error)

# Assuming 'test_df['sales']' is the true values and 'catboost_model.predict(test_features)' is the predicted values:
y_true = test_df['sales'].values
y_pred = catboost_model.predict(test_features)

# Calculate RMLSE
rmlse_value = rmlse(y_true, y_pred)
print(f"RMLSE: {rmlse_value}")


RMLSE: 2.906038830530555


In [62]:
from sklearn.metrics import root_mean_squared_error, root_mean_squared_log_error

In [63]:
y_true = test_df['sales'].values
y_pred = catboost_model.predict(test_features)

# Calculate RMLSE
rmlse_value = root_mean_squared_error(y_true, y_pred)
print(f"RMSE: {rmlse_value}")


RMSE: 1245.8210305939576
