In [35]:
import xgboost as xgb
import pandas as pd
import numpy as np
import datetime

In [28]:
def mape(predictions, dtrain):
    """Custom XGBoost evaluation metric: mean absolute error scaled by the mean label.

    Despite its name this is not the per-sample mean absolute percentage
    error. It computes ``mean(|y - y_hat|) / mean(y)``, i.e. the average
    absolute error expressed as a fraction of the average target.

    Parameters
    ----------
    predictions : array-like
        Model predictions, one per label of ``dtrain``.
    dtrain : object exposing ``get_label()``
        The evaluated dataset; ``get_label()`` supplies the ground truth.

    Returns
    -------
    tuple of (str, float)
        The metric name ``'mape'`` and its value. The value is NaN when the
        metric is undefined (no samples, or labels averaging to zero).
    """
    # Reduce in float64 with NumPy: the builtin sum() iterates element by
    # element in Python and accumulates in the input dtype, which is slow and
    # loses precision on large single-precision arrays.
    ground_truth = np.asarray(dtrain.get_label(), dtype=np.float64)
    predictions = np.asarray(predictions, dtype=np.float64)

    if ground_truth.size == 0 or predictions.size == 0:
        return 'mape', float('nan')

    at_mean = ground_truth.mean()
    if at_mean == 0:
        return 'mape', float('nan')

    mean_abs_error = np.abs(ground_truth - predictions).mean()
    return 'mape', float(mean_abs_error / at_mean)

In [30]:
# Read the training table and round the three socio-economic indicator
# columns to four decimal places.
train_set = pd.read_csv(
    'dataset/output/dataset_polimi_extended_with_province.csv',
    sep=',',
).round({'gini_index': 4, 'forza_lavoro': 4, 'occupazione': 4})

In [31]:
# Detach the target column 'vendite' from the feature frame. `pop` removes the
# column from train_set in place, so running this cell a second time without
# reloading train_set raises KeyError.
y_train = train_set.pop('vendite')

In [32]:
# Wrap the feature frame and the target in a DMatrix; NaN marks missing values.
xgdmat = xgb.DMatrix(data=train_set, label=y_train, missing=np.nan)

In [33]:
# Booster hyper-parameters.
our_params = {
    'eta': 0.1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'max_depth': 10,
    'min_child_weight': 10,
}

# Filled in by xgb.train with the metric value of every watched dataset at
# each boosting round.
dic = {}

# Datasets scored with the `feval` function after every round; only the
# training matrix is watched here.
watchlist = [(xgdmat, 'train')]

final_gb = xgb.train(
    params=our_params,
    dtrain=xgdmat,
    num_boost_round=155,
    evals=watchlist,
    feval=mape,
    evals_result=dic,
)

[0]	train-mape:0.821966
[1]	train-mape:0.754842
[2]	train-mape:0.695696
[3]	train-mape:0.647657
[4]	train-mape:0.606508
[5]	train-mape:0.573701
[6]	train-mape:0.538243
[7]	train-mape:0.510482
[8]	train-mape:0.487335
[9]	train-mape:0.472448
[10]	train-mape:0.455213
[11]	train-mape:0.437689
[12]	train-mape:0.425392
[13]	train-mape:0.412644
[14]	train-mape:0.401547
[15]	train-mape:0.392188
[16]	train-mape:0.38449
[17]	train-mape:0.376555
[18]	train-mape:0.367495
[19]	train-mape:0.362934
[20]	train-mape:0.356514
[21]	train-mape:0.351251
[22]	train-mape:0.346511
[23]	train-mape:0.343692
[24]	train-mape:0.339944
[25]	train-mape:0.335234
[26]	train-mape:0.331555
[27]	train-mape:0.327185
[28]	train-mape:0.324085
[29]	train-mape:0.32135
[30]	train-mape:0.319345
[31]	train-mape:0.317159
[32]	train-mape:0.314453
[33]	train-mape:0.310311
[34]	train-mape:0.308131
[35]	train-mape:0.306489
[36]	train-mape:0.305547
[37]	train-mape:0.302469
[38]	train-mape:0.300739
[39]	train-mape:0.298483
[40]	train-m

In [34]:
# Lowest value of the custom 'mape' metric recorded for the 'train' watchlist
# entry across all boosting rounds. This is a training-set score, not a
# validation score.
min(dic['train']['mape'])

0.240769

In [38]:
# Score the test feature table with the trained booster.
test_set = pd.read_csv('dataset/output/test_with_features_with_provincie.csv', sep=',')
dtest = xgb.DMatrix(data=test_set, missing=np.nan)
y_pred = final_gb.predict(dtest)

# Round every prediction to the nearest integer.
round_pred = [int(round(value)) for value in y_pred]

# Clamp negative predictions to zero.
positive_pred = [value if value > 0 else 0 for value in round_pred]

# Build the submission table: the key columns plus the predicted 'vendite',
# with 'timestamp' exported under the name 'data'.
test_set['vendite'] = positive_pred
df = test_set[['zona','area','sottoarea','categoria','timestamp','vendite']].rename(
    columns={'timestamp': 'data'}
)

# Turn the epoch timestamps into 'YYYY-MM-DD' strings. fromtimestamp converts
# using the machine's local time zone.
df['data'] = [
    datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d')
    for stamp in df['data']
]

# The DataFrame index is written as the first, unnamed CSV column.
df.to_csv("dataset/predictions/xgboost_rm.csv", sep=',')