In [2]:
import pandas as pd
import datetime
from catboost import Pool, CatBoostRegressor
import math
from sklearn.metrics import r2_score

In [3]:
data = pd.read_pickle('../data/df6.pkl')
data.head(5)


Unnamed: 0,ID,DELTA,YEAR_DAY,DATE,MEAN_CONSUMPTION,VARIANCE_CONSUMPTION,WEEKDAY,IS_WEEKEND,sin_WEEKDAY,cos_WEEKDAY,sin_year_day,cos_year_day,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN,IS_GOOD
0,0,243.0,0,2019-02-01,282.893151,11453.639651,4,0,-0.433884,-0.900969,0.0,1.0,0.0,16.5,17.6,18.7,7.1,1
1,0,236.0,1,2019-02-02,282.893151,11453.639651,5,1,-0.974928,-0.222521,0.017213,0.999852,0.0,9.8,13.4,17.1,6.1,1
2,0,335.0,2,2019-02-03,282.893151,11453.639651,6,1,-0.781831,0.62349,0.034422,0.999407,0.0,7.7,10.6,13.6,9.3,1
3,0,252.0,3,2019-02-04,282.893151,11453.639651,0,0,0.0,1.0,0.05162,0.998667,0.0,4.1,10.6,17.2,9.3,1
4,0,220.0,4,2019-02-05,282.893151,11453.639651,1,0,0.781831,0.62349,0.068802,0.99763,0.0,7.5,14.6,21.6,9.2,1


In [4]:
data.isnull().sum()

ID                         0
DELTA                      0
YEAR_DAY                   0
DATE                       0
MEAN_CONSUMPTION           0
VARIANCE_CONSUMPTION       0
WEEKDAY                    0
IS_WEEKEND                 0
sin_WEEKDAY                0
cos_WEEKDAY                0
sin_year_day               0
cos_year_day               0
PRECIPITATIONS          1095
MIN_TEMP                   0
MEAN_TEMP                  0
MAX_TEMP                   0
SUN                      730
IS_GOOD                    0
dtype: int64

In [4]:
'''
given a start date in datetime format "start_date" and an "end_date" returns a list of strings with the dates from
"start_date" to "end_date".

Example:

start_date = datetime.date(2019, 9 , 30)
end_date = datetime.date(2019, 10, 7)
get_date_range(start_date, end_date)
'''

def get_date_range(start_date, end_date):
    number_of_days = (end_date-start_date).days
    return [(start_date + datetime.timedelta(days = day)).isoformat() for day in range(number_of_days+1)]

In [5]:
start_date = datetime.date(2020, 1 , 18)
end_date = datetime.date(2020, 1, 31)
test = data[data['DATE'].isin(get_date_range(start_date, end_date))]

start_date = datetime.date(2019, 2 , 1)
end_date = datetime.date(2020, 1, 17)
train = data[data['DATE'].isin(get_date_range(start_date, end_date))]

In [6]:
X_train = train.drop(['DELTA', 'DATE', 'YEAR_DAY', 'WEEKDAY', 'ID', 'IS_GOOD'], axis=1)
y_train = train['DELTA']
X_test = test.drop(['DELTA', 'DATE', 'YEAR_DAY', 'WEEKDAY', 'ID', 'IS_GOOD'], axis=1)
y_test = test['DELTA']

# Implementación de CatBoost en Python

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128115 entries, 0 to 133210
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   MEAN_CONSUMPTION      128115 non-null  float64
 1   VARIANCE_CONSUMPTION  128115 non-null  float64
 2   IS_WEEKEND            128115 non-null  int64  
 3   sin_WEEKDAY           128115 non-null  float64
 4   cos_WEEKDAY           128115 non-null  float64
 5   sin_year_day          128115 non-null  float64
 6   cos_year_day          128115 non-null  float64
 7   PRECIPITATIONS        127385 non-null  float64
 8   MIN_TEMP              128115 non-null  float64
 9   MEAN_TEMP             128115 non-null  float64
 10  MAX_TEMP              128115 non-null  float64
 11  SUN                   127385 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 12.7 MB


In [8]:
X_train.nunique()

MEAN_CONSUMPTION        359
VARIANCE_CONSUMPTION    359
IS_WEEKEND                2
sin_WEEKDAY               7
cos_WEEKDAY               7
sin_year_day            351
cos_year_day            334
PRECIPITATIONS           37
MIN_TEMP                173
MEAN_TEMP               151
MAX_TEMP                160
SUN                      94
dtype: int64

If the categorical features have a lot of unique values, we won't use one hot encoding, but depending on the dataset it may be a good idea to adjust one_hot_max_size.

Vbles categóricas con pocos valores únicos ¿qué hacemos?

In [9]:
X_train.head(1)

Unnamed: 0,MEAN_CONSUMPTION,VARIANCE_CONSUMPTION,IS_WEEKEND,sin_WEEKDAY,cos_WEEKDAY,sin_year_day,cos_year_day,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN
0,282.893151,11453.639651,0,-0.433884,-0.900969,0.0,1.0,0.0,16.5,17.6,18.7,7.1


In [10]:
train_1 = Pool(X_train, 
                  y_train, 
                  cat_features=[2])

In [11]:
test_1 = Pool(X_test, 
                  cat_features=[2])

In [12]:
model = CatBoostRegressor(iterations=500, 
                          depth=2, 
                          learning_rate=0.1
                          )

In [13]:
model.fit(train_1)

0:	learn: 6439.4399712	total: 160ms	remaining: 1m 19s
1:	learn: 6251.7178503	total: 178ms	remaining: 44.3s
2:	learn: 6092.1907246	total: 196ms	remaining: 32.5s
3:	learn: 5959.6582816	total: 210ms	remaining: 26s
4:	learn: 5846.3461937	total: 224ms	remaining: 22.2s
5:	learn: 5749.1043086	total: 238ms	remaining: 19.6s
6:	learn: 5672.2412514	total: 251ms	remaining: 17.7s
7:	learn: 5603.0496683	total: 264ms	remaining: 16.2s
8:	learn: 5540.6355455	total: 278ms	remaining: 15.1s
9:	learn: 5490.2589879	total: 291ms	remaining: 14.2s
10:	learn: 5448.0105031	total: 310ms	remaining: 13.8s
11:	learn: 5413.6587576	total: 326ms	remaining: 13.3s
12:	learn: 5383.2312458	total: 340ms	remaining: 12.7s
13:	learn: 5355.2931029	total: 356ms	remaining: 12.4s
14:	learn: 5325.0534996	total: 369ms	remaining: 11.9s
15:	learn: 5301.8143006	total: 384ms	remaining: 11.6s
16:	learn: 5282.8927579	total: 396ms	remaining: 11.2s
17:	learn: 5267.7122292	total: 407ms	remaining: 10.9s
18:	learn: 5248.4017526	total: 420ms	re

<catboost.core.CatBoostRegressor at 0x1589d9a51b0>

In [14]:
y_pred = model.predict(test_1)
print(y_pred)
print(y_test)

[237.56395485 301.60489147 307.36565261 ...  89.67082505  85.37943918
  70.84545108]
351       421.0
352       273.0
353       306.0
354       292.0
355       460.0
          ...  
133220     97.0
133221    139.0
133222     65.0
133223     49.0
133224    100.0
Name: DELTA, Length: 5110, dtype: float64


In [15]:
#R2 Score
r2_test = r2_score(y_test, y_pred)
r2_test

-0.09517099626851788