In [28]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, 
    r2_score, mean_absolute_percentage_error
)
import plotly.express as px
import plotly.graph_objects as go

In [29]:
# Load the processed per-project table (project dates, duration, x* features
# and the target column `y`) and preview the first rows.
main_feats_path = '../data/processed/main_feats.csv'
df = pd.read_csv(filepath_or_buffer=main_feats_path)
df.head()

Unnamed: 0,start_date,end_date,project_duration,x1,x2,x3,x5,x51,y
0,1396-01-01,1400-01-01,48,3150.0,920.0,1,1703.672462,16,9979.795043
1,1395-01-01,1400-10-01,69,7600.0,1140.0,1,1446.077707,23,16362.045147
2,1398-01-01,1401-10-01,45,4800.0,840.0,1,2376.845399,15,14120.09328
3,1397-04-01,1398-04-01,12,685.0,202.0,1,1207.741141,4,2560.672748
4,1387-01-01,1390-04-01,39,3000.0,800.0,1,410.0,13,700.0


In [30]:
# Load the processed economic-indicator table (one row per `Date`, b*/B-*
# indicator columns) and preview the first rows.
economics_path = '../data/processed/economics.csv'
df_economics = pd.read_csv(filepath_or_buffer=economics_path)
df_economics.head()

Unnamed: 0,Date,b1,b2,b3,b4,b5,b7,b8,b9,b10,b14,b17,b18,B-23,B-24
0,1360-01-01,2806.0,,918.0,4380.0,25.1,17.0,19.0,77.7,270.0,1078.5,,,,
1,1360-04-01,2437.0,,834.0,4552.0,20.8,18.0,19.0,81.3,270.0,1191.1,,,,
2,1360-07-01,2600.0,,720.0,4653.8,23.3,18.0,20.0,79.6,270.0,1246.2,,,,
3,1360-10-01,2355.0,,750.0,5236.1,18.4,20.0,19.0,80.8,270.0,1408.1,,,,
4,1361-01-01,2332.0,,850.0,5250.2,44.8,20.0,22.0,82.4,350.0,1324.9,0.5,0.5,0.233333,


In [31]:
# Attach to every project the economic indicators recorded on its start date.
#
# Only economics rows whose Date equals some project's start_date are kept for
# now; the extra rows are dropped.
# TODO: add seasonal and lagged economic features later.
df_economics = df_economics[df_economics['Date'].isin(df['start_date'])]

# Left merge keeps every project row. `validate='many_to_one'` makes pandas
# raise a MergeError if a Date appears more than once on the economics side,
# instead of silently duplicating project rows.
df = pd.merge(
    df,
    df_economics,
    how='left',
    left_on='start_date',
    right_on='Date',
    validate='many_to_one',
)
del df_economics

# The join key and the project date columns are not used as model features.
# Dropping without `inplace=True` avoids mutating a frame that an earlier cell
# already displayed.
df = df.drop(columns=['Date', 'start_date', 'end_date'])

# Reorder the columns so that the target `y` comes last.
df = df[[c for c in df if c != 'y'] + ['y']]
print(df.shape)
df.head()

(400, 21)


Unnamed: 0,project_duration,x1,x2,x3,x5,x51,b1,b2,b3,b4,...,b7,b8,b9,b10,b14,b17,b18,B-23,B-24,y
0,48,3150.0,920.0,1,1703.672462,16,2261.0,104.4,2684.5,13149100.0,...,10424.0,10705.0,32441.8,37564.2,340100.0,104.9,104.4,107.333333,159717.794562,9979.795043
1,69,7600.0,1140.0,1,1446.077707,23,1786.0,96.7,3804.4,10595000.0,...,9418.0,10008.0,30358.3,34732.5,311600.0,96.8,97.2,96.333333,138459.84255,16362.045147
2,45,4800.0,840.0,1,2376.845399,15,2204.0,162.9,2780.5,19799100.0,...,17419.0,27814.0,42000.0,138784.0,502200.0,143.0,137.8,187.833333,303149.844961,14120.09328
3,12,685.0,202.0,1,1207.741141,4,2882.0,124.0,3897.1,16723700.0,...,12611.0,15328.0,42645.537634,116949.557127,445000.0,122.1,119.0,134.166667,262670.093748,2560.672748
4,39,3000.0,800.0,1,410.0,13,6790.0,24.2,6728.0,1622664.0,...,2871.0,3663.0,9140.8,9260.0,68903.3,35.9,35.0,24.866667,44709.541069,700.0


In [32]:
# Gradient-boosted regressor; training stops early once the validation metric
# has not improved for `early_stopping_rounds` consecutive rounds.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42,
    early_stopping_rounds=50,
)
model = xgb.XGBRegressor(**xgb_params)

# Features are every column except the target `y`.
x = df.drop(columns=['y'])
y = df['y']

# Positional 70% / 20% / 10% split into train / validation / test, taken in
# row order (no shuffling), so the row boundaries are computed only once.
n_rows = len(x)
train_end = int(n_rows * 0.7)
valid_end = int(n_rows * 0.9)

x_train, y_train = x.iloc[:train_end], y.iloc[:train_end]
x_valid, y_valid = x.iloc[train_end:valid_end], y.iloc[train_end:valid_end]
x_test, y_test = x.iloc[valid_end:], y.iloc[valid_end:]

# The validation split is the only eval set, so it drives early stopping;
# progress is logged every 100 rounds.
model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=100)

[0]	validation_0-rmse:3640.05914


[100]	validation_0-rmse:756.69629
[115]	validation_0-rmse:748.36369


In [34]:
# Plot the validation RMSE recorded at each boosting round.
#
# `model.evals_result()` only holds histories for the datasets passed through
# `eval_set` when fitting. The fit call registers a single dataset (the
# validation split), which XGBoost stores under the key 'validation_0'. There
# is therefore no training curve to draw: the series is plotted once, labelled
# as validation, rather than twice under both "Train" and "Valid".
results = model.evals_result()
valid_rmse = results['validation_0']['rmse']
epochs = len(valid_rmse)
x_axis = list(range(epochs))

fig = go.Figure()
fig.add_trace(go.Scatter(x=x_axis, y=valid_rmse, mode='lines', name='Valid'))
fig.update_layout(title='Validation RMSE', xaxis_title='Epochs', yaxis_title='RMSE')
fig.show()

In [35]:
# evaluate the model
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MSE: %.3f, MAE: %.3f, R2: %.3f, MAPE: %.3f' % (mse, mae, r2, mape))

MSE: 42885650.602, MAE: 2281.781, R2: 0.637, MAPE: 1.411
