In [27]:
from modules.manipulation_functions import *
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [28]:
# Relative location of the regression input data that the next cell loads.
path = '../../data/regression-data/'

In [29]:
# Load the data found at `path` into a single dict. The echoed output further
# down shows it keyed by ticker (e.g. 'ABEO') with one Date-indexed table per key.
# NOTE(review): import_all_files_as_dict is not defined in this notebook --
# presumably it arrives via the star import of modules.manipulation_functions; confirm.
all_data_as_dict = import_all_files_as_dict(path)

In [30]:
# Echo the loaded dict for inspection.
# NOTE(review): this renders the entire dict; a preview (its keys, or .head() of
# one entry) would keep the saved output small.
all_data_as_dict

{'ABEO':                Close    Volume  Daily Returns  Monthly Moving Average  \
 Date                                                                    
 2017-09-13  1.428675 -0.239493       0.235319                1.528276   
 2017-09-14  1.428675 -0.337815       0.015221                1.575056   
 2017-09-15  1.976642  1.012566       3.486486                1.602960   
 2017-09-18  1.856774  0.209272      -0.602846                1.640712   
 2017-09-19  1.899584 -0.171723       0.245324                1.671898   
 ...              ...       ...            ...                     ...   
 2022-08-12 -0.938340  2.547555      -0.734342               -0.946011   
 2022-08-15 -0.936833  0.666504       0.769419               -0.945881   
 2022-08-16 -0.936011  1.012529       0.407138               -0.945702   
 2022-08-17 -0.937175  0.154829      -0.526027               -0.945502   
 2022-08-18 -0.938819  0.446557      -0.777132               -0.945326   
 
             Quarterly Movin

In [31]:
# Train one XGBoost regressor per company and keep, for each company, the fitted
# model, its dev-set predictions, and the dev/test splits used by later cells.
models = {}
for company, frame in all_data_as_dict.items():
    # Every column except 'Target' is a feature; 'Target' is the label.
    features = frame.drop(['Target'], axis=1)
    target = frame['Target']
    X_train, y_train, X_dev, y_dev, X_test, y_test = split_train_dev_test(features, target)

    regressor = XGBRegressor(n_estimators=100)
    regressor.fit(X_train, y_train)
    dev_predictions = regressor.predict(X_dev)

    # The training split is not stored; only what evaluation needs is kept.
    entry = {'model': regressor, 'y_hat': dev_predictions}
    entry['X_dev'] = X_dev
    entry['y_dev'] = y_dev
    entry['X_test'] = X_test
    entry['y_test'] = y_test
    models[company] = entry

In [32]:
# Echo the per-company results dict for inspection.
# NOTE(review): this renders every model repr and every stored array/split;
# a smaller preview (e.g. one company's entry) would keep the saved output readable.
models

{'ABEO': {'model': XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
               num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
               reg_lambda=1, ...),
  'y_hat': array([-0.56845486, -0.5711251 , -0.5597076 , -0.6013435 , -0.6222879 ,
         -0.62424076, -0.6112089 , -0.6071093 , -0.6101592 , -0.57612735,
         -0.57512003, -0.5746849 , -0.5676396 , -0.5727308 , -0.6120562 ,
         -0.6144559 , -0.61887   , -0.6092

In [33]:
# Dev-set check: compare each company's stored dev targets with the dev
# predictions saved at training time and print the resulting MSE.
for company in all_data_as_dict:
    entry = models[company]
    mse = mean_squared_error(entry['y_dev'], entry['y_hat'])
    message = f'Mean Squared Error for {company}: {mse}'
    print(message)

Mean Squared Error for ABEO: 0.0006288177563014382
Mean Squared Error for ABIO: 0.0010722821890249469
Mean Squared Error for ABUS: 0.009225541757815486
Mean Squared Error for ACAD: 0.02480620377574144
Mean Squared Error for ACER: 0.00031489635726643123
Mean Squared Error for ACHN: 1.7966617674187717
Mean Squared Error for ACHV: 1.4974993851529068e-05
Mean Squared Error for ACIU: 0.014643856577283733
Mean Squared Error for ACOR: 4.996758827189996e-05
Mean Squared Error for ACRS: 0.013517701383115864
Mean Squared Error for ACST: 0.004385272431273901
Mean Squared Error for ADAP: 0.005000711888725262
Mean Squared Error for ADIL: 0.053568943287683086
Mean Squared Error for ADMA: 0.32191344016673007
Mean Squared Error for ADVM: 0.027017655654370046
Mean Squared Error for ADXS: 5.001896644129754e-06
Mean Squared Error for AEZS: 0.005958984529007257
Mean Squared Error for AFMD: 0.7262625321888235
Mean Squared Error for AGE: 0.0065568176056920915
Mean Squared Error for AGEN: 0.09497603054943393

In [34]:
# Run every company's fitted model on its own held-out test features,
# giving a mapping of company -> array of test-set predictions.
predictions = {
    company: models[company]['model'].predict(models[company]['X_test'])
    for company in all_data_as_dict
}

In [35]:
# Test-set evaluation: print each company's MSE and accumulate the total so the
# average across companies can be reported by the next cell.
sum_mse = 0
for company in all_data_as_dict.keys():
    # Score once and reuse the value for both the running total and the report
    # (previously the metric was recomputed inside the f-string).
    mse = mean_squared_error(models[company]['y_test'], predictions[company])
    sum_mse += mse
    print(f"Mean Squared Error for {company}:\n{mse}")
# After this division sum_mse holds the mean over companies, not the sum.
sum_mse /= len(models)

Mean Squared Error for ABEO:
0.025334676865166512
Mean Squared Error for ABIO:
0.006459868377921183
Mean Squared Error for ABUS:
0.007189543108014711
Mean Squared Error for ACAD:
0.008098472806462604
Mean Squared Error for ACER:
0.0016648565191259483
Mean Squared Error for ACHN:
2.824614271433537
Mean Squared Error for ACHV:
4.6290377233012506e-05
Mean Squared Error for ACIU:
0.07024834373030894
Mean Squared Error for ACOR:
0.0008721008697252583
Mean Squared Error for ACRS:
0.01718518518606215
Mean Squared Error for ACST:
0.020892941029766087
Mean Squared Error for ADAP:
0.005920466891415376
Mean Squared Error for ADIL:
0.07019621416037468
Mean Squared Error for ADMA:
0.2021231486125013
Mean Squared Error for ADVM:
0.13244784314894734
Mean Squared Error for ADXS:
0.00012114310537258536
Mean Squared Error for AEZS:
0.005260585469685289
Mean Squared Error for AFMD:
0.013124461812184641
Mean Squared Error for AGE:
0.0066115567988937925
Mean Squared Error for AGEN:
0.024801371804664206
Mea

In [36]:
# sum_mse was divided by the number of models in the previous cell, so despite
# its name it is the average test-set MSE at this point.
print(f'Average Mean Squared Error is {sum_mse}')

Average Mean Squared Error is 0.20881729603267993


In [37]:
# Pair each company's test-set predictions with the matching true targets, the
# shape handed to the export step.
# The result is built in a fresh dict and entries that are already paired are
# unwrapped first, so re-running this cell does not nest a paired dict inside
# 'Predictions' (the in-place rewrite did exactly that on a second run).
paired_predictions = {}
for company, y_pred in predictions.items():
    if isinstance(y_pred, dict):
        # Already paired by an earlier run of this cell: recover the raw predictions.
        y_pred = y_pred['Predictions']
    paired_predictions[company] = {
        'True Values': models[company]['y_test'],
        'Predictions': y_pred
    }
predictions = paired_predictions

In [38]:
# Hand the per-company {'True Values', 'Predictions'} pairs to the export helper
# together with the XGBoost-specific output location.
# NOTE(review): export_predictions is not defined in this notebook -- presumably it
# writes under pred_path; confirm the on-disk format against the helper module.
pred_path = '../../data/predictions/xgboostregressor'
export_predictions(predictions, pred_path)