<a href="https://colab.research.google.com/github/sidharkal/JOB-A-THON_2022/blob/main/Cat_Boost_Regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Read the competition files: training data, test data and the sample submission.
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
solution = pd.read_csv("/content/sample_sub.csv")

# Parse the timestamp column of both frames into datetime64 so the .dt accessor can be used later.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

train.head()

Unnamed: 0,row_id,datetime,energy
0,1,2008-03-01 00:00:00,1259.985563
1,2,2008-03-01 01:00:00,1095.5415
2,3,2008-03-01 02:00:00,1056.2475
3,4,2008-03-01 03:00:00,1034.742
4,5,2008-03-01 04:00:00,1026.3345


In [4]:
# Summarise column dtypes and non-null counts to spot missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94992 entries, 0 to 94991
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   row_id    94992 non-null  int64         
 1   datetime  94992 non-null  datetime64[ns]
 2   energy    93092 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 2.2 MB


# Missing (NA) values

In [5]:
# Count missing values per column.
train.isnull().sum()

row_id         0
datetime       0
energy      1900
dtype: int64

In [6]:
# (rows, columns) of the training and test frames.
train.shape ,test.shape

((94992, 3), (26304, 2))

# Datetime vs. Energy Plot

In [7]:
# Interactive line chart of the energy column against the timestamp for the training data.
fig = px.line(train, x="datetime", y="energy", title='energy forecasting')
fig.show()

# Feature Engineering

In [8]:
def encode(data, col, max_val):
    """Add sine/cosine encodings of a cyclic column to ``data``.

    The column is scaled to an angle ``2 * pi * value / max_val`` and two new
    columns, ``<col>_sin`` and ``<col>_cos``, are written onto ``data``.

    Parameters
    ----------
    data : DataFrame that holds ``col``; it is modified in place.
    col : name of the column to encode.
    max_val : divisor used to scale values onto the circle (the period).

    Returns
    -------
    The same ``data`` object, now carrying the two extra columns.
    """
    radians = 2 * np.pi * data[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(radians)
    return data

In [9]:
# Calendar features derived from the timestamp of each training row.
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['quarter'] = train['datetime'].dt.quarter
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.dayofweek
train['dayofyear'] = train['datetime'].dt.dayofyear
# Series.dt.weekofyear is deprecated (and removed in pandas 2.0). isocalendar().week
# gives the same ISO week number but as nullable UInt32, so cast back to int64.
train['weekofyear'] = train['datetime'].dt.isocalendar().week.astype('int64')
train['weekday_name'] = train['datetime'].dt.day_name()
# 'Week' previously came from a row-by-row Timestamp.week lookup, which is the same
# ISO week number; reuse the vectorised column instead of ~95k Python-level calls.
train['Week'] = train['weekofyear']
train['is_weekend'] = np.where(train['weekday_name'].isin(['Sunday','Saturday']),1,0)
# Month -> season bucket; months not listed (December and January) fall into bucket 4.
season_by_month = {2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 2, 8: 2, 9: 3, 10: 3, 11: 3}
train['season'] = train['month'].map(lambda m: season_by_month.get(m, 4))

# Sine/cosine encodings of the cyclic columns, added in this fixed order.
# NOTE(review): the period passed here is the column's observed maximum, not the
# cycle length. For zero-based columns (hour 0-23, dayofweek 0-6, season 0-4) the
# first and last value land on the same angle, and the result depends on which
# values happen to be present in this frame. Switching to fixed periods changes
# the feature values, so it has to be done together with the test-set cell.
for col in ['month', 'quarter', 'season', 'Week', 'dayofweek', 'day', 'hour']:
    train = encode(train, col, train[col].max())


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



In [10]:
# Calendar features derived from the timestamp of each test row.
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['quarter'] = test['datetime'].dt.quarter
test['hour'] = test['datetime'].dt.hour
test['dayofweek'] = test['datetime'].dt.dayofweek
test['dayofyear'] = test['datetime'].dt.dayofyear
# Series.dt.weekofyear is deprecated (and removed in pandas 2.0). isocalendar().week
# gives the same ISO week number but as nullable UInt32, so cast back to int64.
test['weekofyear'] = test['datetime'].dt.isocalendar().week.astype('int64')
test['weekday_name'] = test['datetime'].dt.day_name()
test['is_weekend'] = np.where(test['weekday_name'].isin(['Sunday','Saturday']),1,0)
# 'Week' previously came from a row-by-row Timestamp.week lookup, which is the same
# ISO week number; reuse the vectorised column instead of per-row Python calls.
test['Week'] = test['weekofyear']
# Month -> season bucket; months not listed (December and January) fall into bucket 4.
season_by_month = {2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 2, 8: 2, 9: 3, 10: 3, 11: 3}
test['season'] = test['month'].map(lambda m: season_by_month.get(m, 4))

# Sine/cosine encodings of the cyclic columns, added in this fixed order.
# NOTE(review): the period passed here is the maximum observed in the *test* frame,
# not the cycle length. For zero-based columns (hour 0-23, dayofweek 0-6, season 0-4)
# the first and last value land on the same angle, and the encoding can differ from
# the training one whenever the two frames have different maxima. Switching to fixed
# periods changes the feature values, so it has to be done together with the
# training-set cell.
for col in ['month', 'quarter', 'season', 'Week', 'dayofweek', 'day', 'hour']:
    test = encode(test, col, test[col].max())


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



In [11]:
# List the columns now present on the training frame after feature engineering.
train.columns

Index(['row_id', 'datetime', 'energy', 'year', 'month', 'day', 'quarter',
       'hour', 'dayofweek', 'dayofyear', 'weekofyear', 'weekday_name', 'Week',
       'is_weekend', 'season', 'month_sin', 'month_cos', 'quarter_sin',
       'quarter_cos', 'season_sin', 'season_cos', 'Week_sin', 'Week_cos',
       'dayofweek_sin', 'dayofweek_cos', 'day_sin', 'day_cos', 'hour_sin',
       'hour_cos'],
      dtype='object')

In [12]:
# Fill each missing energy reading with the last observed value before it.
# fillna(method='ffill') is deprecated in recent pandas; .ffill() is the equivalent call.
train['energy'] = train['energy'].ffill()

In [13]:
# Model inputs, listed once so the training and test matrices share the same columns and order.
feature_columns = [
    'year', 'month', 'day', 'quarter',
    'hour', 'dayofweek', 'dayofyear', 'weekofyear', 'Week',
    'is_weekend', 'season',
    'month_sin', 'month_cos',
    'quarter_sin', 'quarter_cos',
    'season_sin', 'season_cos',
    'Week_sin', 'Week_cos',
    'dayofweek_sin', 'dayofweek_cos',
    'day_sin', 'day_cos',
    'hour_sin', 'hour_cos',
]

# Training features and target.
X = train[feature_columns]
y = train['energy']

# Test features.
Xtest = test[feature_columns]

In [14]:
# Capture the target name and feature order while X is still a DataFrame.
target = 'energy'
features = X.columns.tolist()

# Test features in the training column order, kept both as a frame and as an array.
X_testt = Xtest[features]
X_test = X_testt.to_numpy()

# NumPy versions of the training data; y is reshaped to a single-column 2-D array.
X, y = X.to_numpy(), y.to_numpy().reshape(-1, 1)

# CatBoostRegressor

In [15]:
# 5-fold cross-validated CatBoost: collects out-of-fold predictions for the training
# rows and averages the per-fold predictions for the test rows.
splits = 5
# NOTE(review): this is a plain shuffled KFold (despite the `skf` name). On an hourly
# time series a shuffled split trains on rows that surround the validation rows in
# time, so the fold scores are likely optimistic for forecasting; consider a
# time-ordered split.
skf = KFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((train.shape[0],))
model_preds = np.zeros((test.shape[0],))

# Loop-invariant selections, taken once instead of on every fold.
X_all = train[features]
y_all = train[target]
X_test_frame = test[features]

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_all, y_all)):
    # KFold yields positional indices, so select rows by position (.iloc), not by label.
    X_train, X_valid = X_all.iloc[train_idx], X_all.iloc[valid_idx]
    y_train, y_valid = y_all.iloc[train_idx], y_all.iloc[valid_idx]

    model = CatBoostRegressor(objective='RMSE',eval_metric='RMSE')

    # The training set is passed as the first eval set so its metric is logged next
    # to the validation metric.
    # NOTE(review): the recorded runs report bestIteration = 999, i.e. the iteration
    # cap was reached before early stopping triggered; more iterations may help.
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=50, verbose=1000)

    # Each fold contributes an equal share to the averaged test prediction.
    model_preds += model.predict(X_test_frame) / splits
    oof_preds[valid_idx] = model.predict(X_valid)

    print(f"\nFold-{fold+1} | MSE: {mean_squared_error(y_valid, oof_preds[valid_idx])}\n")

# Store out-of-fold and averaged test predictions next to the data.
train[target+'_cb'] = oof_preds
test[target+'_cb'] = model_preds

# Overall out-of-fold error across all training rows.
model_mse = mean_squared_error(train[target], oof_preds)
print(f"All Folds | MSE: {model_mse}\n")

Learning rate set to 0.100503
0:	learn: 344.8183346	test: 344.8183346	test1: 344.2710058	best: 344.2710058 (0)	total: 65ms	remaining: 1m 4s
999:	learn: 73.9932665	test: 73.9932665	test1: 78.6618181	best: 78.6618181 (999)	total: 29.9s	remaining: 0us

bestTest = 78.66181808
bestIteration = 999


Fold-1 | MSE: 6187.681564913967

Learning rate set to 0.100503
0:	learn: 345.0313947	test: 345.0313947	test1: 343.3555824	best: 343.3555824 (0)	total: 16.4ms	remaining: 16.4s
999:	learn: 74.5300343	test: 74.5300343	test1: 77.6479768	best: 77.6479768 (999)	total: 15.7s	remaining: 0us

bestTest = 77.64797684
bestIteration = 999


Fold-2 | MSE: 6029.208368250944

Learning rate set to 0.100503
0:	learn: 344.6723922	test: 344.6723922	test1: 345.0885874	best: 345.0885874 (0)	total: 19.5ms	remaining: 19.5s
999:	learn: 74.4284845	test: 74.4284845	test1: 78.1768671	best: 78.1768671 (999)	total: 15.2s	remaining: 0us

bestTest = 78.1768671
bestIteration = 999


Fold-3 | MSE: 6111.62252551809

Learning rate 

In [16]:
# Preview the test frame, which now carries the averaged prediction column energy_cb.
test.head()

Unnamed: 0,row_id,datetime,year,month,day,quarter,hour,dayofweek,dayofyear,weekofyear,...,season_cos,Week_sin,Week_cos,dayofweek_sin,dayofweek_cos,day_sin,day_cos,hour_sin,hour_cos,energy_cb
0,94993,2019-01-01 00:00:00,2019,1,1,1,0,1,1,1,...,1.0,0.118273,0.992981,0.866025,0.5,0.201299,0.97953,0.0,1.0,1772.743169
1,94994,2019-01-01 01:00:00,2019,1,1,1,1,1,1,1,...,1.0,0.118273,0.992981,0.866025,0.5,0.201299,0.97953,0.269797,0.962917,1658.07721
2,94995,2019-01-01 02:00:00,2019,1,1,1,2,1,1,1,...,1.0,0.118273,0.992981,0.866025,0.5,0.201299,0.97953,0.519584,0.854419,1567.60367
3,94996,2019-01-01 03:00:00,2019,1,1,1,3,1,1,1,...,1.0,0.118273,0.992981,0.866025,0.5,0.201299,0.97953,0.730836,0.682553,1514.072951
4,94997,2019-01-01 04:00:00,2019,1,1,1,4,1,1,1,...,1.0,0.118273,0.992981,0.866025,0.5,0.201299,0.97953,0.887885,0.460065,1491.089111


In [18]:
# Submission frame: row identifier plus the averaged CatBoost prediction, exposed as 'energy'.
sub = test[['row_id', 'energy_cb']].rename(columns={'energy_cb': 'energy'})
sub.head()

Unnamed: 0,row_id,energy
0,94993,1772.743169
1,94994,1658.07721
2,94995,1567.60367
3,94996,1514.072951
4,94997,1491.089111


In [19]:
# Write the submission to Solution.csv in the working directory, without the index column.
sub.to_csv("Solution.csv",index=False)