In [18]:
### Data Science Libraries ###
import pandas as pd
import numpy as np
import seaborn as sns

### Machine Learning Libraries ###
 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
import catboost as cb
import lightgbm as lgbm
import xgboost as xgb

### OS and other Libraries ###
import inspect
import os
import datetime
import warnings
warnings.filterwarnings('ignore')
import random
import pickle

In [1]:
# Show the kernel's working directory: the relative '..' paths used when
# loading data further down are resolved against this location.
os.getcwd()

'C:\\Users\\renatocastro\\Kaggle\\Competencias\\notebooks'

In [20]:
# Load the converted train/test tables.
# The paths are assembled with os.path.join instead of hard-coded backslashes,
# so the notebook resolves them on any operating system, not only on Windows.
train = pd.read_csv(os.path.join('..', 'dataset', 'train', 'train_converted.csv'))
test = pd.read_csv(os.path.join('..', 'dataset', 'test', 'test_converted.csv'))

In [21]:
# Columns that are later cast to the pandas 'category' dtype and passed
# through the reverse mapping before the submission is built.
descriptive_columns = [
    'Z_MARCA',
    'Z_GAMA',
    'Z_MODELO',
    'Z_DEPARTAMENTO',
    'Z_PUNTO_VENTA',
]

In [22]:
# Model inputs: the five descriptive columns plus the day/month/year columns.
# Declared once so the train and test frames cannot drift apart.
feature_columns = ['Z_MARCA', 'Z_GAMA', 'Z_MODELO',
                   'Z_DEPARTAMENTO', 'Z_PUNTO_VENTA',
                   'Z_DAY', 'Z_MONTH', 'Z_YEAR']

# .copy() makes X_train / X_test independent frames. They are assigned into
# column-by-column later in the notebook; without the explicit copy pandas
# treats those writes as writes to a slice of train/test and raises
# SettingWithCopyWarning.
X_train = train[feature_columns].copy()
y = train['Demanda']  # regression target

X_test = test[feature_columns].copy()

In [11]:
# Cast every descriptive column to the pandas 'category' dtype in both the
# training and the test feature frames.
for column_name in descriptive_columns:
    X_train[column_name] = X_train[column_name].astype("category")
    X_test[column_name] = X_test[column_name].astype("category")

In [13]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): the split and the model further down pass their own literal
# seeds (90 and 42) rather than this value — confirm that is intended.
random_seed = 2019
np.random.seed(seed=random_seed)

In [14]:
# Hold out 20% of the rows as a validation set; the fixed random_state keeps
# the split identical between runs.
X_training, X_val, y_training, y_val = train_test_split(
    X_train,
    y,
    test_size=0.2,
    random_state=90,
)

In [15]:
# Positional indices of the X_train columns whose dtype is 'category'.
is_categorical_column = X_train.dtypes == 'category'
categorical_features_indices = np.where(is_categorical_column)[0]
categorical_features_indices

array([0, 1, 2, 3, 4], dtype=int64)

In [16]:
# Wrap the training and validation splits in CatBoost pools, telling CatBoost
# which column positions hold categorical features.
train_pool = cb.Pool(
    data=X_training,
    label=y_training,
    cat_features=categorical_features_indices,
)
validation_pool = cb.Pool(
    data=X_val,
    label=y_val,
    cat_features=categorical_features_indices,
)

In [17]:
# CatBoost regressor capped at 5000 iterations, with early stopping configured
# at 50 rounds and a fixed model seed.
model = cb.CatBoostRegressor(
    iterations=5000,
    random_seed=42,
    early_stopping_rounds=50,
)

# Train on the training pool and evaluate on the validation pool.
# verbose=100 controls how often progress is logged; plot=True requests the
# interactive metric widget.
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=100,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.062347
0:	learn: 5.8758087	test: 5.7331899	best: 5.7331899 (0)	total: 687ms	remaining: 57m 14s



KeyboardInterrupt



In [None]:
# Predictions for the held-out validation rows.
y_pred = model.predict(data=X_val)

In [None]:
# Root-mean-squared error on the validation split.
# mean_squared_error is already imported in the notebook's import cell, and
# np.sqrt stands in for math.sqrt, so this cell no longer carries imports of
# its own.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print('RMSE: %f' % rmse)

In [None]:
# Work on an independent copy so the loaded test table itself is not modified
# while the submission is assembled.
submit = test.copy(deep=True)

In [None]:
# Attach the model's predictions for the test features as the 'Demanda' column.
submit = submit.assign(Demanda=model.predict(X_test))

In [None]:
# Negative predictions are raised to zero; all other values are left as is.
submit['Demanda'] = submit['Demanda'].clip(lower=0)

In [None]:
# Location of the pickled reverse mappings, built with os.path.join so the
# path resolves on any operating system (the hard-coded backslash form only
# worked on Windows).
reverse_mapping_file = os.path.join('..', 'utils', 'reverse_dict_mapping_list.txt')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a mapping file that was produced by a trusted step of this project.
with open(reverse_mapping_file, 'rb') as f:
    reverse_mapping = pickle.load(f)

In [None]:
# Pass each descriptive column through its reverse mapping. The mapping for a
# column is looked up by that column's position in descriptive_columns.
# NOTE(review): presumably this restores the original labels — confirm against
# the step that produced the mapping file.
for position, column in enumerate(descriptive_columns):
    submit[column] = submit[column].map(reverse_mapping[position])

In [None]:
# Build the submission key as MODELO|PUNTO_VENTA|GAMA|WEEK by appending each
# remaining part, left to right, with a '|' separator.
id_series = submit['Z_MODELO']
for part in ('Z_PUNTO_VENTA', 'Z_GAMA', 'Z_WEEK'):
    id_series = id_series + '|' + submit[part]
submit['ID'] = id_series

In [None]:
# Total predicted demand per ID, returned with ID as a regular column.
submission = (
    submit[['ID', 'Demanda']]
    .groupby(by='ID')
    .sum()
    .reset_index()
)

In [None]:
# Write the two submission columns to a comma-separated file without the index.
submission.to_csv(
    'Submission_01.csv',
    columns=['ID', 'Demanda'],
    index=False,
    sep=',',
)