In [1]:
import pandas as pd

data = pd.read_csv('../data/enriquecidos-004.pkl')
data.drop('Unnamed: 0', axis=1, inplace=True)
data.head()

Unnamed: 0,ID,DATE,DELTA,READING,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN,WEEK_DAY,IS_WEEKEND,sin_WEEK_DAY,cos_WEEK_DAY,IS_PUBLIC_HOLIDAY
0,0,2019-02-01,243.0,331953.0,0.0,16.5,17.6,18.7,7.1,4,0,-0.433884,-0.900969,0
1,0,2019-02-02,236.0,332189.0,0.0,9.8,13.4,17.1,6.1,5,1,-0.974928,-0.222521,0
2,0,2019-02-03,335.0,332524.0,0.0,7.7,10.6,13.6,9.3,6,1,-0.781831,0.62349,0
3,0,2019-02-04,252.0,332776.0,0.0,4.1,10.6,17.2,9.3,0,0,0.0,1.0,0
4,0,2019-02-05,220.0,332996.0,0.0,7.5,14.6,21.6,9.2,1,0,0.781831,0.62349,0


In [2]:
#Devuelve numero de dias que existen registro para un contador i

def num_days(df, i):
    data_one_id = df[df['ID']==i]
    return len(data_one_id['DATE'].unique())

In [5]:
#Cálculo de contadores de los que se tiene toda la información

contadores_completos = []
for i in data['ID'].unique():
    if num_days(data, i) == 365:
        contadores_completos.append(i)

len(contadores_completos) 

1975

In [6]:
#Nos quedamos con los contadores completos 100%

data = data[data['ID'].isin(contadores_completos)]

In [7]:
#Cuántas veces el siguiente reading es decreciente

def num_decreasing_reading(data, contador):
    rdif0 = data[data['ID']==contador]['READING'].diff().fillna(0).round(4)
    errores = 0
    for (i, rdif) in enumerate(rdif0):
        if rdif < 0:
            errores += 1
    return errores

#Diferencia entre dos readings debe ser delta. Chekeo de que se cumple

def num_readingdiff_not_eq_delta(df, contador):
    rdif0 = df[df['ID']==contador]['READING'].diff().fillna(0).round(4)
    delta0 = df[df['ID']==contador]['DELTA']
    errores = 0
    for (i, (rdif, delta)) in enumerate(zip(rdif0, delta0)):
        if rdif!=delta:
            errores += 1
    return errores

### Contadores con READING decreasing
Como son pocos y en muy poca proporción voy a ignorar este error

In [8]:
from tqdm import tqdm

bondad_contadores = {}
for contador in tqdm(data['ID'].unique()):
    rdif0 = data[data['ID']==contador]['READING'].diff().fillna(0).round(4)
    errores = 0
    for (i, rdif) in enumerate(rdif0):
        if rdif < 0:
            errores += 1
    bondad_contadores[contador] = errores/len(rdif0)

100%|██████████| 1975/1975 [00:03<00:00, 546.49it/s]


In [9]:
sum([1 if i!=0 else 0 for i in bondad_contadores.values()])

58

### Meter nueva variable consumo total del contador al cabo del año

In [10]:
def consumo_total(df, i):
    df0=df[df['ID']==i]
    initial_reading = df0[df0['DATE']=="2019-02-01"].READING.values[0]
    final_reading = df0[df0['DATE']=="2020-01-31"].READING.values[0]
    return final_reading-initial_reading

In [11]:
def consumo_total2(i):
    return consumo_total(data, i)

In [12]:
from tqdm import tqdm

tqdm.pandas()

data['TOTAL_CONSUMPTION'] = data['ID'].progress_apply(consumo_total2)

100%|██████████| 720875/720875 [25:04<00:00, 479.30it/s]


### Dividir en train y test
Tomo para test las dos últimas semanas de Enero

In [14]:
import datetime
from tqdm import tqdm

'''
given a start date in datetime format "start_date" and an "end_date" returns a list of strings with the dates from
"start_date" to "end_date".

Example:

start_date = datetime.date(2019, 9 , 30)
end_date = datetime.date(2019, 10, 7)
get_date_range(start_date, end_date)
'''

def get_date_range(start_date, end_date):
    number_of_days = (end_date-start_date).days
    return [(start_date + datetime.timedelta(days = day)).isoformat() for day in range(number_of_days+1)]

In [15]:
start_date = datetime.date(2020, 1 , 18)
end_date = datetime.date(2020, 1, 31)
test = data[data['DATE'].isin(get_date_range(start_date, end_date))]

start_date = datetime.date(2019, 2 , 1)
end_date = datetime.date(2020, 1, 17)
train = data[data['DATE'].isin(get_date_range(start_date, end_date))]

In [17]:
X_train = train.drop(['READING','DELTA','DATE','WEEK_DAY'], axis=1)
y_train = train['DELTA']
X_test = test.drop(['READING','DELTA','DATE','WEEK_DAY'], axis=1)
y_test = test['DELTA']

# Implementación de CatBoost en Python

In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 693225 entries, 0 to 846732
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ID                 693225 non-null  int64  
 1   PRECIPITATIONS     693225 non-null  float64
 2   MIN_TEMP           693225 non-null  float64
 3   MEAN_TEMP          693225 non-null  float64
 4   MAX_TEMP           693225 non-null  float64
 5   SUN                693225 non-null  float64
 6   IS_WEEKEND         693225 non-null  int64  
 7   sin_WEEK_DAY       693225 non-null  float64
 8   cos_WEEK_DAY       693225 non-null  float64
 9   IS_PUBLIC_HOLIDAY  693225 non-null  int64  
 10  TOTAL_CONSUMPTION  693225 non-null  float64
dtypes: float64(8), int64(3)
memory usage: 63.5 MB


In [21]:
X_train.nunique()

ID                   1975
PRECIPITATIONS         37
MIN_TEMP              173
MEAN_TEMP             151
MAX_TEMP              160
SUN                    94
IS_WEEKEND              2
sin_WEEK_DAY            7
cos_WEEK_DAY            7
IS_PUBLIC_HOLIDAY       2
TOTAL_CONSUMPTION    1925
dtype: int64

If the categorical features have a lot of unique values, we won't use one hot encoding, but depending on the dataset it may be a good idea to adjust one_hot_max_size.

Vbles categóricas con pocos valores únicos ¿qué hacemos?

In [24]:
X_train.head(5)

Unnamed: 0,ID,PRECIPITATIONS,MIN_TEMP,MEAN_TEMP,MAX_TEMP,SUN,IS_WEEKEND,sin_WEEK_DAY,cos_WEEK_DAY,IS_PUBLIC_HOLIDAY,TOTAL_CONSUMPTION
0,0,0.0,16.5,17.6,18.7,7.1,0,-0.433884,-0.900969,0,103013.0
1,0,0.0,9.8,13.4,17.1,6.1,1,-0.974928,-0.222521,0,103013.0
2,0,0.0,7.7,10.6,13.6,9.3,1,-0.781831,0.62349,0,103013.0
3,0,0.0,4.1,10.6,17.2,9.3,0,0.0,1.0,0,103013.0
4,0,0.0,7.5,14.6,21.6,9.2,0,0.781831,0.62349,0,103013.0


In [29]:
#Se guarda en una lista las columnas con variables categóricas

cat_features = [0, 6, 9]
print(cat_features)

[0, 6, 9]


In [31]:
from catboost import CatBoostClassifier

In [43]:
catboost1 = CatBoostClassifier(
    iterations=10
)

In [46]:
catboost1.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    verbose=False
)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

CatBoostError: C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/target/target_converter.cpp:226: Unknown class label: "408.9"