In [1]:
import sys
sys.path.insert(1, '../../Src')
from utils.preprocessing import *
import training


from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from math import ceil
import warnings
import glob
import os
import sys




SEED = 42
seed_everything(SEED)
PATH_DATASET = '../../dataset/'
PATH_RESULTS = '../../results/Demanda/'
FILL = False
TARGET = 'Demanda'
N_FOLDS = 3
N_FEATURE_IMPORTANCE = 15



plt.rcParams['axes.facecolor']='white'
plt.rcParams['savefig.facecolor']='white'
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
%load_ext autoreload
%autoreload 2

# Lectura de data referencial para sacar metricas de analisis

In [2]:
submission_top_reference = 'Submission_73.csv'
print('reading .. ',os.path.join('../../results/',submission_top_reference))
result = pd.read_csv(os.path.join('../../results/',submission_top_reference))
descriptive_columns = ['Z_MODELO','Z_PUNTO_VENTA','Z_GAMA','Z_WEEK']
reverse_mapping_file = '../../utils/reverse_dict_mapping_list.txt'

result[['Z_MODELO','Z_PUNTO_VENTA','Z_GAMA','Z_WEEK']] = result['ID'].str.split('|',expand=True)

with open(reverse_mapping_file, 'rb') as f:
    reverse_mapping = pickle.load( f)
    
descriptive_columns = ['Z_MARCA', 'Z_GAMA', 'Z_MODELO',
                       'Z_DEPARTAMENTO', 'Z_PUNTO_VENTA']
i=0
for column in descriptive_columns:
    if column in ['Z_MODELO','Z_PUNTO_VENTA','Z_GAMA','Z_WEEK']:
        inv_reverse_mapping = {v: k for k, v in reverse_mapping[i].items()}

        result[column] = result[column].map(inv_reverse_mapping)
    i+=1
    
result = result.rename(columns = {TARGET: TARGET+'_real'})
result.head(1)

reading ..  ../../results/Submission_73.csv


Unnamed: 0,ID,Demanda_real,Z_MODELO,Z_PUNTO_VENTA,Z_GAMA,Z_WEEK
0,009e0874cc07b2180b2b2b1039321041a1f42e66693c49...,0.0,MOD_170,PVENT_259,GAM_2,SEMANA_51


# Lectura de la data con las nuevas variables creadas

In [3]:
print('Reading dataset')
SHIFT=1
df_train  = pd.read_pickle(os.path.join(PATH_RESULTS,'dataset','df_train_fe_FILL_'+str(FILL)+'_SHIFT_'+str(SHIFT)+'.pkl'))
df_test   = pd.read_pickle(os.path.join(PATH_RESULTS,'dataset','df_test_fe_FILL_'+str(FILL)+'_SHIFT_'+str(SHIFT)+'.pkl'))

gc.collect()
print('df_train size :',df_train.shape)
print('df_test size  :',df_test.shape) 
features_names = set(df_train.columns)-set([TARGET,'date_block_num',TARGET+'_clipped','release','item_id',
                                            'Z_WEEK_DATE', 'Z_WEEK','date_block_num','year',
                                            'month', 'is_month_end', 'tm_m_end','is_month_start','tm_wm', 'tm_w_end','is_quarter_start','is_year_start',
                                             'day_of_week',
                                            'day_of_year', 'day',
                                           ]) 
print(len(features_names))
features_names = list(features_names)
df_train['week_of_month'] = df_train['week_of_month'].astype('category')
df_test['week_of_month']  = df_test['week_of_month'].astype('category')
total_features_names = features_names.copy()

Reading dataset
df_train size : (2358650, 1111)
df_test size  : (471730, 1111)
1093


# Filtro de datos por ventana de tiempo temporal [1 a 3 meses antes] y [7 a 9 meses antes]
### nos aseguramos que la tendencia de los meses no solo dependa del anterior inmediato

In [4]:
def get_feature_names_shift(shift,total_features_names):
    features_names = []

    for column in total_features_names:
        if 'shift' in column:
            number = column.split('shift_')[1]
            number = int(number.split('_')[0])
            if (number>=shift and number<=(shift+2))or(number>=(shift+6) and number<=(shift+8)):
                features_names.append(column)
        else:
            features_names.append(column)
    return features_names

# Definir la ventana de testing para poder evaluar que el modelo predice bien el futuro
### esta ventana de testing es nuestra base de analisis en el training

In [5]:
TRAIN_START = '2021-08-01'
TEST_START = '2022-03-01'
TEST_END   = '2022-05-01'
print(df_train.shape)
df_train = df_train[(df_train['Z_WEEK_DATE'] >= TRAIN_START)]
print(df_train.shape)


(2358650, 1111)
(1839747, 1111)


# Entrenamiento con cross validation e importancia de datos

In [8]:
PATH_RESULTS = '../../results/Demanda_ventana_custom/'
training.PATH_RESULTS = PATH_RESULTS
training.submission_analysis.TARGET  = TARGET

metrics_dict = {}
metrics_dict_list = []
for SHIFT in range(1,5):
    print('*'*20)
    print('*'*20)
    print('WEEK = ',SHIFT)
    print('*'*20)
    print('*'*20)

    features_names = get_feature_names_shift(SHIFT,total_features_names)
    print('len base features = ',len(features_names))
    print('example base features =',features_names[:5])
    
    ################### CV MODEL RANDOM SPLIT ###############

    gc.collect()
    
    for idx,model_type in enumerate(['xgboost']):

    
        X_train      = df_train[((df_train['Z_WEEK_DATE'] >= TRAIN_START) &(df_train['Z_WEEK_DATE'] < TEST_START))&
                               (df_train['Z_WEEK_DATE'] != '2022-04-11')].copy() 
        X_test       = df_train[(df_train['Z_WEEK_DATE'] >= TEST_START) & (df_train['Z_WEEK_DATE'] < TEST_END)].copy() 

        print('*'*20)
        print('MODEL = ',model_type)
        print('CV MODEL RANDOM SPLIT model_type = ',model_type,'c_model_v2')
        print('*'*20)

        y_train = X_train[TARGET]
        y_test =  X_test[TARGET]
        y_submission = df_test[TARGET]
        X_train = X_train[features_names]
        X_test  = X_test[features_names]
        X_submission = df_test[features_names]        
        


        print('X_train total cv',X_train.shape)
        print('X_test          ',X_test.shape)
        print('X_submission    ',X_submission.shape)

        model_version = 'c_model_v2'
        training.TARGET = TARGET
        training.SHIFT = SHIFT

        df_submission,df_feature_importance,metrics = training.training_model_cv(model_type,model_version,X_train,y_train,
                                                            X_test,y_test,X_submission,df_test,result,N_FOLDS)

    
        gc.collect()
        
        #'''
        if idx ==0 :
            features_names = list(df_feature_importance['feature'][:30].values)
            
        ################### CV MODEL RANDOM SPLIT + FEATURE IMPORTANCE ###############
        important_features = list(df_feature_importance[:N_FEATURE_IMPORTANCE]['feature'])
        
        X_train      = df_train[(df_train['Z_WEEK_DATE'] >= TRAIN_START)&(df_train['Z_WEEK_DATE'] < TEST_START)].copy() 
        X_test       = df_train[(df_train['Z_WEEK_DATE'] >= TEST_START) & (df_train['Z_WEEK_DATE'] < TEST_END)].copy() 

        y_train = X_train[TARGET]
        y_test =  X_test[TARGET]
        y_submission = df_test[TARGET]

        X_train = X_train[important_features]
        X_test  = X_test[important_features]
        X_submission = df_test[important_features]

        print('X_train total cv',X_train.shape)
        print('X_test          ',X_test.shape)
        print('X_submission    ',X_submission.shape)

        print('*'*20)
        print('MODEL = ',model_type)
        print('CV MODEL RANDOM SPLIT + FEATURE IMPORTANCE model_type = ',model_type,'c_model_v3')
        print('*'*20)
        
        model_version = 'c_model_v3'
        training.TARGET = TARGET
        training.SHIFT = SHIFT


        df_submission,df_feature_importance,metrics = training.training_model_cv(model_type,model_version,X_train,y_train,
                                                               X_test,y_test,X_submission,df_test,result,N_FOLDS)
        #'''
        metrics_dict[SHIFT] = metrics
        metrics_dict_list.append(metrics.append('SEMANA_'+str(SHIFT)))
        gc.collect()


********************
********************
WEEK =  1
********************
********************
len base features =  444
example base features = ['shift_3_roll_4_mean_Z_PUNTO_VENTA_Z_DEPARTAMENTO', 'shift_7_roll_4_mean_item_id', 'shift_2_roll_2_std_Z_PUNTO_VENTA_Z_GAMA', 'shift_2_roll_4_std_Z_MARCA_Z_DEPARTAMENTO', 'shift_7_roll_2_std_Z_PUNTO_VENTA_Z_DEPARTAMENTO']
********************
MODEL =  xgboost
CV MODEL RANDOM SPLIT model_type =  xgboost c_model_v2
********************
X_train total cv (1462363, 444)
X_test           (377384, 444)
X_submission     (471730, 444)
KFold(n_splits=3, random_state=100, shuffle=True)
********************  fold n°0
Training ...
directory_model =  ../../results/Demanda_ventana_custom/xgboost/c_model_v2/fold_1/shift_1
create folder ../../results/Demanda_ventana_custom/xgboost/c_model_v2/fold_1/shift_1
Parameters: { "scale_pos_weight", "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language

KeyboardInterrupt: 

In [None]:
import pandas as pd

df = pd.DataFrame(metrics_dict_list,columns =['train rmse','val rmse','train mape','val mape','train mase','val mase'])

In [None]:
df.head(10)