### Rossman Kaggle - Barborini Germán

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Defino las métricas porque voy a usar una custom en vez de las que vienen de base en el paquete XgB
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def build_features(features, data):
    # Saco los NAN y los relleno con 0, luego los eliminaré mas adelante
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1

    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek',
                     'Promo2SinceYear'])

    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)

    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'month', 'day', 'year'])
    data['year'] = data.Date.dt.year
    data['month'] = data.Date.dt.month
    data['day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek

In [2]:
# Enmascaro los datos para saber como leerlos
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("dataset/rossmann//train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("dataset/rossmann//test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("dataset/rossmann//store.csv")
# Relleno los casilleros vacíos
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)
# Elimino las tiendas que estuviesen cerradas
train = train[train["Open"] != 0]
#  Y las tiendas que no tuviesen ventas
train = train[train["Sales"] > 0]
# Armo los DF para entrenar
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

In [3]:
features = []
#Acondiciono los features
build_features(features, train)
build_features([], test)

print(features)

['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year']


In [4]:

# Parametros del XGBOOST 
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.09,
          "max_depth": 11, 
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 500 
# Hago el train test split 
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)

# Acá usa una conversión que viene en el paquete XGB que transforma los datos en una DMatrix que es
# un formato optimizado que mejora el uso de memoria y es más rápido a la hora de entrenar
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
#acá entrena con los parámetros antes calculados
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

print("Val")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Luego de entrenar guarda todo en un archivo
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_Barborini_submission3.csv", index=False)


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:7.52590	train-rmspe:0.99958	eval-rmse:7.52504	eval-rmspe:0.99958
[1]	train-rmse:6.84998	train-rmspe:0.99901	eval-rmse:6.84951	eval-rmspe:0.99901
[2]	train-rmse:6.23497	train-rmspe:0.99804	eval-rmse:6.23496	eval-rmspe:0.99804
[3]	train-rmse:5.67540	train-rmspe:0.99645	eval-rmse:5.67563	eval-rmspe:0.99645
[4]	train-rmse:5.16627	train-rmspe:0.99400	eval-rmse:5.16667	eval-rmspe:0.99400
[5]	train-rmse:4.70314	train-rmspe:0.99039	eval-rmse:4.70380	eval-rmspe:0.99040
[6]	train-rmse:4.28183	train-rmspe:0.98530	eval-rmse:4.28256	eval-rmspe:0.98531
[7]	train-rmse:3.89858	train-rmspe:0.97840	eval-rmse:3.89956	eval-rmspe:0.97842
[8]	train-rmse:3.54994	train-rmspe:0.96939	eval-rmse:3.55108	eval-rmspe:0.9694