In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from dynamic_portfolio.utils import load_csv
from dynamic_portfolio.utils import features_creation, clean_data
from dynamic_portfolio.preprocess import scaler

# Loading data and creating clean dataframe

In [None]:
df = features_creation('META')

In [None]:
pd.set_option('display.max_columns', None)

# Cross vals

We use sklearn's time series split to break up the data in different folds
We use a sklearn example to run the first tests.

Metrics used : 
 - rmse
 - mae
 - R2

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
ts_cv = TimeSeriesSplit(
    n_splits=20,
    gap=0,
    max_train_size=252,
    test_size=45,
)

In [None]:
all_splits = list(ts_cv.split(X, y))

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
model = HistGradientBoostingRegressor()

In [None]:
rmse = []
for train_index, test_index in ts_cv.split(X):
    cv_train, cv_test = X.iloc[train_index], X.iloc[test_index]
    model = model.fit(X,y)
    predictions = model.predict(cv_test)
    true_values = cv_test['return']
    rmse.append(np.sqrt(mean_squared_error(true_values, predictions)))

In [None]:
np.mean(rmse)*100

In [None]:
def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.6f} +/- {mae.std():.6f}\n"
        f"Root Mean Squared Error: {rmse.mean():.6f} +/- {rmse.std():.6f}"
    )


In [None]:
import seaborn as sns

In [None]:
evaluate(model, X, y, ts_cv)

# PCA Analysis

In [25]:
from sklearn.decomposition import PCA

In [130]:
pca = PCA(n_components=0.9)
ibm = prep.ready_to_train_df('IBM')
pca.fit(ibm)

In [131]:
n_pcs= pca.n_components_ # get number of component
# get the index of the most important feature on EACH component
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
initial_feature_names = ibm.columns
# get the most important feature names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

In [132]:
most_important_names

['volume_momentum_20days',
 'momentum_10days/eps',
 '2Y_yield',
 'distance_10days',
 'distance_10days',
 'inf_exp',
 '10_2_spread',
 '2Y_return',
 'oil_return',
 'non_farm_payroll_return',
 'spread_return',
 'non_farm_payroll_return',
 'cpi_return',
 'gdp_return',
 'oil_return',
 'volume',
 'non_farm_payroll_return']

## Using PCA analysis on all stocks

In [23]:
from sklearn.pipeline import make_pipeline

In [29]:
pipe.get_params()

{'memory': None,
 'steps': [('pca', PCA()),
  ('gradientboostingregressor', GradientBoostingRegressor())],
 'verbose': False,
 'pca': PCA(),
 'gradientboostingregressor': GradientBoostingRegressor(),
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': None,
 'pca__n_oversamples': 10,
 'pca__power_iteration_normalizer': 'auto',
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'gradientboostingregressor__alpha': 0.9,
 'gradientboostingregressor__ccp_alpha': 0.0,
 'gradientboostingregressor__criterion': 'friedman_mse',
 'gradientboostingregressor__init': None,
 'gradientboostingregressor__learning_rate': 0.1,
 'gradientboostingregressor__loss': 'squared_error',
 'gradientboostingregressor__max_depth': 3,
 'gradientboostingregressor__max_features': None,
 'gradientboostingregressor__max_leaf_nodes': None,
 'gradientboostingregressor__min_impurity_decrease': 0.0,
 'gradientboostingregressor__min_samples_leaf': 1,
 'gradi

In [30]:
pipe = make_pipeline(PCA(), GradientBoostingRegressor())

params = {'pca__n_components':0.9,
                'gradientboostingregressor__max_depth':3,
                'gradientboostingregressor__criterion':'friedman_mse',
                'gradientboostingregressor__n_estimators':100,
                'gradientboostingregressor__learning_rate':0.08}
pipe.set_params(**params)

In [31]:
pca_dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
        dict_pca_score[ticker] = cross_validate_ml(prep.ready_to_train_df(ticker), pipe)
        print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

done for ticker AAPL index # 0
done for ticker MSFT index # 1
done for ticker GOOG index # 2
done for ticker AMZN index # 3
done for ticker TSLA index # 4
done for ticker UNH index # 5
done for ticker XOM index # 6
done for ticker JNJ index # 7
done for ticker WMT index # 8
done for ticker NVDA index # 9
done for ticker JPM index # 10
done for ticker V index # 11
done for ticker CVX index # 12
done for ticker PG index # 13
done for ticker LLY index # 14
done for ticker MA index # 15
done for ticker HD index # 16
done for ticker META index # 17
done for ticker BAC index # 18
done for ticker ABBV index # 19
done for ticker PFE index # 20
done for ticker KO index # 21
done for ticker MRK index # 22
done for ticker PEP index # 23
done for ticker COST index # 24
done for ticker ORCL index # 25
done for ticker AVGO index # 26
done for ticker TMO index # 27
done for ticker MCD index # 28
done for ticker CSCO index # 29
done for ticker ACN index # 30
done for ticker DHR index # 31
done for tic

done for ticker BKR index # 259
done for ticker GLW index # 260
done for ticker LYB index # 261
done for ticker ES index # 262
done for ticker BAX index # 263
done for ticker STT index # 264
done for ticker VRSK index # 265
done for ticker TROW index # 266
done for ticker WBD index # 267
done for ticker AWK index # 268
done for ticker IT index # 269
done for ticker GPN index # 270
done for ticker HRL index # 271
done for ticker FANG index # 272
done for ticker WTW index # 273
done for ticker RJF index # 274
done for ticker GPC index # 275
done for ticker IFF index # 276
done for ticker CDW index # 277
done for ticker TSCO index # 278
done for ticker FITB index # 279
done for ticker ARE index # 280
done for ticker URI index # 281
done for ticker ZBH index # 282
done for ticker K index # 283
done for ticker LEN index # 284
done for ticker EBAY index # 285
done for ticker EIX index # 286
done for ticker CBRE index # 287
done for ticker EFX index # 288
done for ticker VMC index # 289
done 

In [33]:
pca_dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
    pca = PCA(n_components=0.9)
    ticker_df = prep.ready_to_train_df(ticker)
    pca.fit(ticker_df)
    n_pcs= pca.n_components_
    most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
    initial_feature_names = ticker_df.columns
    most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
    pca_df = ticker_df[list(np.unique(most_important_names))] 
    returns = ticker_df[['return']]
    final_pca = pd.merge(pca_df, returns, how='outer', left_index=True, right_index=True)
    pca_dict_score[ticker] = cross_validate_ml(final_pca, GradientBoostingRegressor(max_depth = 3, 
                                                                                    criterion='friedman_mse',
                                                                                    learning_rate=0.08,
                                                                                    n_estimators = 100))
    print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

done for ticker AAPL index # 0
done for ticker MSFT index # 1
done for ticker GOOG index # 2
done for ticker AMZN index # 3
done for ticker TSLA index # 4
done for ticker UNH index # 5
done for ticker XOM index # 6
done for ticker JNJ index # 7
done for ticker WMT index # 8
done for ticker NVDA index # 9
done for ticker JPM index # 10
done for ticker V index # 11
done for ticker CVX index # 12
done for ticker PG index # 13
done for ticker LLY index # 14
done for ticker MA index # 15
done for ticker HD index # 16
done for ticker META index # 17
done for ticker BAC index # 18
done for ticker ABBV index # 19
done for ticker PFE index # 20
done for ticker KO index # 21
done for ticker MRK index # 22
done for ticker PEP index # 23
done for ticker COST index # 24
done for ticker ORCL index # 25
done for ticker AVGO index # 26
done for ticker TMO index # 27
done for ticker MCD index # 28
done for ticker CSCO index # 29
done for ticker ACN index # 30
done for ticker DHR index # 31
done for tic

done for ticker BKR index # 259
done for ticker GLW index # 260
done for ticker LYB index # 261
done for ticker ES index # 262
done for ticker BAX index # 263
done for ticker STT index # 264
done for ticker VRSK index # 265
done for ticker TROW index # 266
done for ticker WBD index # 267
done for ticker AWK index # 268
done for ticker IT index # 269
done for ticker GPN index # 270
done for ticker HRL index # 271
done for ticker FANG index # 272
done for ticker WTW index # 273
done for ticker RJF index # 274
done for ticker GPC index # 275
done for ticker IFF index # 276
done for ticker CDW index # 277
done for ticker TSCO index # 278
done for ticker FITB index # 279
done for ticker ARE index # 280
done for ticker URI index # 281
done for ticker ZBH index # 282
done for ticker K index # 283
done for ticker LEN index # 284
done for ticker EBAY index # 285
done for ticker EIX index # 286
done for ticker CBRE index # 287
done for ticker EFX index # 288
done for ticker VMC index # 289
done 

In [35]:
rmse_pca= []
baseline = []
for key in pca_dict_score.keys():
    rmse_pca.append(pca_dict_score[key][0])
    baseline.append(pca_dict_score[key][1])
print(np.mean(rmse_pca), np.mean(baseline) )

0.0033498792960944923 0.02096529727516623


In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.ylim(ymin=0)
plt.title('cumulated share of explained variance')
plt.xlabel('# of principal component used');

In [1]:
import pandas as pd
import numpy as np
import dynamic_portfolio.preprocess as prep
import dynamic_portfolio.utils as utils
import dynamic_portfolio.cross_validate as cv
import dynamic_portfolio.models as model
from sklearn.ensemble import GradientBoostingRegressor
import warnings
%load_ext autoreload
%autoreload 2
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

### Cross validation functions

In [138]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
cross_val = {
    'fold_length' : 252, # Working days for 1 year
    'fold_stride' : 60, # Step between folds, here one quarter
    'train_test_ratio' : 0.7, # Split in fold
    'input_length' : 0, # Number of days to move back from last train_index, here 0
    'horizon' : 1, # Number of days ahead to make prediction, here 1
    'output_length' : 1, # Number of targets wanted
}
#Split the dataset by FOLDS
def get_folds(
    df: pd.DataFrame,
    fold_length = cross_val['fold_length'],
    fold_stride = cross_val['fold_stride']):
    '''
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold
    Returns a list of folds, each as a DataFrame
    '''
    folds = []
    for idx in range(0, len(df), fold_stride):
        # Exits the loop as soon as the last fold index would exceed the last index
        if (idx + fold_length) > len(df):
            break
        fold = df.iloc[idx:idx + fold_length, :]
        folds.append(fold)
    return folds
#Split FOLDS by Train et Test
#### FOR ONE FOLD !!!!!
def train_test_split(fold: pd.DataFrame,
                     train_test_ratio = cross_val['train_test_ratio'],
                     input_length = cross_val['input_length']):
    '''
    Returns a train dataframe and a test dataframe (fold_train, fold_test)
    from which one can sample (X,y) sequences.
    df_train should contain all the timesteps until round(train_test_ratio * len(fold))
    '''
    # TRAIN SET
    # ======================
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]
    # TEST SET
    # ======================
    first_test_idx = last_train_idx - input_length 
    fold_test = fold.iloc[first_test_idx:, :]
    return (fold_train, fold_test)
def cross_validate_ml(df, model) :
    '''
    get_folds() create many FOLDS, train_test_split() create a split on ONE FOLDS.
    The goal of this function is to make splits and sequences on each FOLDS.
    Then, apply a model.
    '''
    folds = get_folds(df, fold_length = cross_val['fold_length'], fold_stride = cross_val['fold_stride']) # 1 - Creating FOLDS
    scores =[]
    baseline = []
    for fold in folds:
        # 2 - CHRONOLOGICAL TRAIN TEST SPLIT of the current FOLD
        (fold_train, fold_test) = train_test_split(fold = fold,
                                                train_test_ratio = cross_val['train_test_ratio'],
                                                input_length = cross_val['input_length'] ,
                                                )
        # 3 - Scanninng fold_train and fold_test for SEQUENCES
        X_train, y_train = fold_train, fold_train[['return']].shift(1).replace(np.nan,0)
        X_test, y_test = fold_test, fold_test[['return']].shift(1).replace(np.nan,0)
        model.fit(X_train, y_train)
        rmse_model = (mean_squared_error(y_test, model.predict(X_test)))**0.5
        scores.append(rmse_model)
        rmse_baseline = mean_squared_error(y_test.iloc[[0]], y_train.iloc[[-1]])**0.5
        baseline.append(rmse_baseline)
    return np.mean(scores), np.mean(baseline)

### Script to run for model on all stocks

### Models used

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [139]:
dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
        dict_score[ticker] = cross_validate_ml(prep.ready_to_train_df(ticker), XGBRegressor())
        print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

done for ticker AAPL index # 0
done for ticker MSFT index # 1
done for ticker GOOG index # 2
done for ticker AMZN index # 3
done for ticker TSLA index # 4
done for ticker UNH index # 5
done for ticker XOM index # 6
done for ticker JNJ index # 7
done for ticker WMT index # 8
done for ticker NVDA index # 9
done for ticker JPM index # 10
done for ticker V index # 11
done for ticker CVX index # 12
done for ticker PG index # 13
done for ticker LLY index # 14
done for ticker MA index # 15
done for ticker HD index # 16
done for ticker META index # 17
done for ticker BAC index # 18
done for ticker ABBV index # 19
done for ticker PFE index # 20
done for ticker KO index # 21
done for ticker MRK index # 22
done for ticker PEP index # 23
done for ticker COST index # 24
done for ticker ORCL index # 25
done for ticker AVGO index # 26
done for ticker TMO index # 27
done for ticker MCD index # 28
done for ticker CSCO index # 29
done for ticker ACN index # 30
done for ticker DHR index # 31
done for tic

KeyboardInterrupt: 

### Cross val scoring

In [20]:
rmse= []
baseline = []
for key in dict_score.keys():
    rmse.append(dict_score[key][0])
    baseline.append(dict_score[key][1])
print(np.mean(rmse), np.mean(baseline) )

0.004796076311898096 0.02096529727516623


## Custom grid search

In [None]:
max_depth = [2, 5, 8]
criterion = ['friedman_mse', 'squared_error', 'mse']
learning_rate=[0.01, 0.1, 0.2] 
n_estimators=[100, 200, 500]

In [11]:
def custom_gridsearch(df, model, max_depth=[2,3,4], criterion = ['friedman_mse', 'squared_error', 'mse'], n_estimator=[50, 75, 100], learning_rate=[0.08, 0.1, 0.12], loss=['squared_error', 'absolute_error', 'huber']):
    counter = 0
    rmse = []
    baseline = []
    params = []
    for max_depth_i in max_depth:
        for criterion_i in criterion:
            for n_estimator_i in n_estimator:
                for learning_rate_i in learning_rate:
                    for loss_i in loss:
                        test = cross_validate_ml(df = df, model = model(max_depth=max_depth_i,
                                                                   criterion = criterion_i,
                                                                   n_estimators = n_estimator_i,
                                                                   learning_rate = learning_rate_i,
                                                                   loss = loss_i))
                        rmse.append(test[0])
                        baseline.append(test[1])
                        params.append((max_depth_i, criterion_i, n_estimator_i, learning_rate_i))
                        counter += 1
                        print(f'model {counter} done with parameters: max_depth = {max_depth_i}, criterion = {criterion_i}, estimators = {n_estimator_i}, learning rate = {learning_rate_i}, loss = {loss_i}, rmse = {test[0]}')
    idx_min = np.argmin(rmse)
    best_params = params[idx_min]
    
    return best_params, rmse, params

### Model results on one stock on a grid search

In [12]:
gs_IP = custom_gridsearch(prep.ready_to_train_df('IP'), model = GradientBoostingRegressor)

model 1 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.08, loss = squared_error, rmse = 0.002958185553002535
model 2 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.08, loss = absolute_error, rmse = 0.006407018407676127
model 3 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.08, loss = huber, rmse = 0.004428068812946768
model 4 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.1, loss = squared_error, rmse = 0.002776729479836416
model 5 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.1, loss = absolute_error, rmse = 0.005372760873905239
model 6 done with parameters: max_depth = 2, criterion = friedman_mse, estimators = 50, learning rate = 0.1, loss = huber, rmse = 0.003432291380610513
model 7 done with parameters: max_depth = 2, criterion = 

model 53 done with parameters: max_depth = 2, criterion = squared_error, estimators = 100, learning rate = 0.12, loss = absolute_error, rmse = 0.0038499484792597865
model 54 done with parameters: max_depth = 2, criterion = squared_error, estimators = 100, learning rate = 0.12, loss = huber, rmse = 0.002795539577918935
model 55 done with parameters: max_depth = 2, criterion = mse, estimators = 50, learning rate = 0.08, loss = squared_error, rmse = 0.0029099191595345383
model 56 done with parameters: max_depth = 2, criterion = mse, estimators = 50, learning rate = 0.08, loss = absolute_error, rmse = 0.006407991680854807
model 57 done with parameters: max_depth = 2, criterion = mse, estimators = 50, learning rate = 0.08, loss = huber, rmse = 0.004428402988000523
model 58 done with parameters: max_depth = 2, criterion = mse, estimators = 50, learning rate = 0.1, loss = squared_error, rmse = 0.0028373111145722184
model 59 done with parameters: max_depth = 2, criterion = mse, estimators = 50

model 106 done with parameters: max_depth = 3, criterion = friedman_mse, estimators = 100, learning rate = 0.12, loss = squared_error, rmse = 0.0025730424155033494
model 107 done with parameters: max_depth = 3, criterion = friedman_mse, estimators = 100, learning rate = 0.12, loss = absolute_error, rmse = 0.0036681061284255722
model 108 done with parameters: max_depth = 3, criterion = friedman_mse, estimators = 100, learning rate = 0.12, loss = huber, rmse = 0.0027251777091245326
model 109 done with parameters: max_depth = 3, criterion = squared_error, estimators = 50, learning rate = 0.08, loss = squared_error, rmse = 0.002702793151349547
model 110 done with parameters: max_depth = 3, criterion = squared_error, estimators = 50, learning rate = 0.08, loss = absolute_error, rmse = 0.005058232367804183
model 111 done with parameters: max_depth = 3, criterion = squared_error, estimators = 50, learning rate = 0.08, loss = huber, rmse = 0.003941806328485692
model 112 done with parameters: m

model 159 done with parameters: max_depth = 3, criterion = mse, estimators = 100, learning rate = 0.1, loss = huber, rmse = 0.002699105329518561
model 160 done with parameters: max_depth = 3, criterion = mse, estimators = 100, learning rate = 0.12, loss = squared_error, rmse = 0.002601949405101594
model 161 done with parameters: max_depth = 3, criterion = mse, estimators = 100, learning rate = 0.12, loss = absolute_error, rmse = 0.0036977284985453603
model 162 done with parameters: max_depth = 3, criterion = mse, estimators = 100, learning rate = 0.12, loss = huber, rmse = 0.0027447041132625722
model 163 done with parameters: max_depth = 4, criterion = friedman_mse, estimators = 50, learning rate = 0.08, loss = squared_error, rmse = 0.002758204626328357
model 164 done with parameters: max_depth = 4, criterion = friedman_mse, estimators = 50, learning rate = 0.08, loss = absolute_error, rmse = 0.004962315891742368
model 165 done with parameters: max_depth = 4, criterion = friedman_mse, 

model 211 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.1, loss = squared_error, rmse = 0.002822949349514258
model 212 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.1, loss = absolute_error, rmse = 0.003671004260923169
model 213 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.1, loss = huber, rmse = 0.0027898015141627564
model 214 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.12, loss = squared_error, rmse = 0.002789152927054379
model 215 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.12, loss = absolute_error, rmse = 0.003691289937380604
model 216 done with parameters: max_depth = 4, criterion = squared_error, estimators = 100, learning rate = 0.12, loss = huber, rmse = 0.0028131115634149615
model 217 done with parameters:

In [21]:
gs_IP

((3, 'friedman_mse', 100, 0.08),
 [0.002958185553002535,
  0.006407018407676127,
  0.004428068812946768,
  0.002776729479836416,
  0.005372760873905239,
  0.003432291380610513,
  0.0026067464673304175,
  0.0048061652599137505,
  0.0030851908081413497,
  0.0026039764209378246,
  0.00477931059886155,
  0.003132758828900319,
  0.0027137581776954262,
  0.0042761812198638905,
  0.002879525183277261,
  0.002675563304797706,
  0.004136672056708204,
  0.0028028874014932675,
  0.00263901736188292,
  0.004243777202286242,
  0.0028285475219040047,
  0.0026600684459853496,
  0.003929768715246046,
  0.002738377789027572,
  0.0026049910482777846,
  0.0038342459402143712,
  0.0027842983111944423,
  0.002946016742644336,
  0.006400529273369599,
  0.004428402988000523,
  0.00274092287895698,
  0.005369459505880697,
  0.0034536502342627138,
  0.0026201916948474895,
  0.00481556253048331,
  0.0030731467770259437,
  0.0026640675291778127,
  0.004781983704751462,
  0.0031020685649265803,
  0.00264106682594

In [17]:
np.mean(baseline)

0.018969723709451568

### Saving models

In [36]:
pipe = make_pipeline(PCA(), GradientBoostingRegressor())

params = {'pca__n_components':0.9,
                'gradientboostingregressor__max_depth':,
                'gradientboostingregressor__criterion':'friedman_mse',
                'gradientboostingregressor__n_estimators':100,
                'gradientboostingregressor__learning_rate':0.1}
pipe.set_params(**params)

In [140]:
xgb_pipe = make_pipeline(PCA(), XGBRegressor())

params = {'pca__n_components':0.9,
                'xgbregressor__max_depth': 5,
                'xgbregressor__max_leaves':0,
                'xgbregressor__n_estimators':100,
                'xgbregressor__learning_rate':0.1}


In [149]:
model = XGBRegressor()

In [150]:
model.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [141]:
xgb_pipe.set_params(**params)

In [50]:
import joblib

In [51]:
tickers = utils.return_tickers()
for ticker in tickers:
    model = pipe
    df = prep.ready_to_train_df(ticker)
    model.fit(df, df['return'])
    joblib.dump(model, f"../raw_data/models/{ticker}_GradientBoostingRegressor_PCA.joblib")
    print(f"Model {ticker} index # {tickers.index(ticker)} saved")

Model AAPL index # 0 saved
Model MSFT index # 1 saved
Model GOOG index # 2 saved
Model AMZN index # 3 saved
Model TSLA index # 4 saved
Model UNH index # 5 saved
Model XOM index # 6 saved
Model JNJ index # 7 saved
Model WMT index # 8 saved
Model NVDA index # 9 saved
Model JPM index # 10 saved
Model V index # 11 saved
Model CVX index # 12 saved
Model PG index # 13 saved
Model LLY index # 14 saved
Model MA index # 15 saved
Model HD index # 16 saved
Model META index # 17 saved
Model BAC index # 18 saved
Model ABBV index # 19 saved
Model PFE index # 20 saved
Model KO index # 21 saved
Model MRK index # 22 saved
Model PEP index # 23 saved
Model COST index # 24 saved
Model ORCL index # 25 saved
Model AVGO index # 26 saved
Model TMO index # 27 saved
Model MCD index # 28 saved
Model CSCO index # 29 saved
Model ACN index # 30 saved
Model DHR index # 31 saved
Model TMUS index # 32 saved
Model ABT index # 33 saved
Model WFC index # 34 saved
Model DIS index # 35 saved
Model LIN index # 36 saved
Mode

KeyboardInterrupt: 

In [166]:
tickers = utils.return_tickers()
scores = []
for ticker in tickers:
    model = GradientBoostingRegressor()
    X_train = prep.ready_to_train_df(ticker)
    y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
    X_test = prep.ready_to_test(ticker)
    y_test = prep.ready_to_test(ticker)['return'].shift(1).replace(np.nan,0)
    model.fit(X_train, y_train)
    error = (mean_squared_error(y_test, model.predict(X_test)))**0.5
    scores.append(error)
    print(f"error for {ticker}: {error}, index # {tickers.index(ticker)}")

error for AAPL: 0.0010755018274289084, index # 0
error for MSFT: 0.0014009360157847585, index # 1
error for GOOG: 0.0019409268784759998, index # 2
error for AMZN: 0.0011770144206575716, index # 3
error for TSLA: 0.0038774516256775086, index # 4
error for UNH: 0.0015803208162903399, index # 5
error for XOM: 0.002412013922014089, index # 6
error for JNJ: 0.0015723586058631182, index # 7
error for WMT: 0.0009193086913061908, index # 8
error for NVDA: 0.0015347427183583305, index # 9
error for JPM: 0.0011790385822612536, index # 10
error for V: 0.0010331803956665515, index # 11
error for CVX: 0.005028838707639965, index # 12
error for PG: 0.0014305292660018222, index # 13
error for LLY: 0.0026292035991625595, index # 14
error for MA: 0.0013321126960574535, index # 15
error for HD: 0.002734689182004797, index # 16
error for META: 0.005626137611098566, index # 17
error for BAC: 0.0016069918195044475, index # 18
error for ABBV: 0.0009326388735273637, index # 19
error for PFE: 0.00106406026239

error for CDNS: 0.0009306109222494366, index # 168
error for FDX: 0.003141837195197346, index # 169
error for NXPI: 0.002421501522406658, index # 170
error for AIG: 0.0023818895623304217, index # 171
error for KMB: 0.0011870997213360133, index # 172
error for AFL: 0.0015386538538001131, index # 173
error for HES: 0.0051275280423488615, index # 174
error for MSI: 0.0010126066529906961, index # 175
error for PAYX: 0.003564260010414954, index # 176
error for DVN: 0.007553380266759235, index # 177
error for TRV: 0.00233973862145062, index # 178
error for BIIB: 0.012066896300401236, index # 179
error for DXCM: 0.002018155665266986, index # 180
error for SYY: 0.006622501566191941, index # 181
error for LHX: 0.0009571668124520307, index # 182
error for RSG: 0.0014645262585922052, index # 183
error for ENPH: 0.0040303735406395395, index # 184
error for ECL: 0.0035312480377572746, index # 185
error for ADSK: 0.0013404593042538994, index # 186
error for MCHP: 0.0036346027005981266, index # 187
e

error for INVH: 0.0033814237591178395, index # 333
error for CHD: 0.001960683232209993, index # 334
error for AES: 0.0014176316315629206, index # 335
error for MOH: 0.0015946392145316486, index # 336
error for JBHT: 0.0014121464326489123, index # 337
error for MAA: 0.001390090270316626, index # 338
error for BBY: 0.0013694127052625762, index # 339
error for CLX: 0.0028657789507882305, index # 340
error for HOLX: 0.0011757011953517677, index # 341
error for WAB: 0.002787997094111809, index # 342
error for DRI: 0.006918423690012474, index # 343
error for EXPD: 0.0007835511775661293, index # 344
error for STE: 0.0010398237947621808, index # 345
error for AMCR: 0.0032137765491276715, index # 346
error for VTR: 0.005184283366371841, index # 347
error for IEX: 0.0008662654468679736, index # 348
error for CAG: 0.0038375226668428473, index # 349
error for CMS: 0.0011662443809519355, index # 350
error for KEY: 0.004181719590782902, index # 351
error for MPWR: 0.002399626460472851, index # 352
e

In [167]:
np.mean(scores)

0.0026363905812504087

In [168]:
np.std(scores)

0.0020598109161219865

## Baseline calculation

In [114]:
tickers = utils.return_tickers()
baseline = []
for ticker in tickers:
    df = prep.ready_to_test(ticker)
    for i in range(1, len(df), 1):
        error = (mean_squared_error(df.loc[i,['return']], df.loc[i-1,['return']]))**0.5
        baseline.append(error)
    print(f"baseline done for {ticker} index # {tickers.index(ticker)}")

baseline done for AAPL index # 0
baseline done for MSFT index # 1
baseline done for GOOG index # 2
baseline done for AMZN index # 3
baseline done for TSLA index # 4
baseline done for UNH index # 5
baseline done for XOM index # 6
baseline done for JNJ index # 7
baseline done for WMT index # 8
baseline done for NVDA index # 9
baseline done for JPM index # 10
baseline done for V index # 11
baseline done for CVX index # 12
baseline done for PG index # 13
baseline done for LLY index # 14
baseline done for MA index # 15
baseline done for HD index # 16
baseline done for META index # 17
baseline done for BAC index # 18
baseline done for ABBV index # 19
baseline done for PFE index # 20
baseline done for KO index # 21
baseline done for MRK index # 22
baseline done for PEP index # 23
baseline done for COST index # 24
baseline done for ORCL index # 25
baseline done for AVGO index # 26
baseline done for TMO index # 27
baseline done for MCD index # 28
baseline done for CSCO index # 29
baseline done 

baseline done for KEYS index # 244
baseline done for PPG index # 245
baseline done for WEC index # 246
baseline done for CTSH index # 247
baseline done for ROK index # 248
baseline done for GWW index # 249
baseline done for PCG index # 250
baseline done for HPQ index # 251
baseline done for FAST index # 252
baseline done for DFS index # 253
baseline done for MTB index # 254
baseline done for PEG index # 255
baseline done for OKE index # 256
baseline done for DHI index # 257
baseline done for APTV index # 258
baseline done for BKR index # 259
baseline done for GLW index # 260
baseline done for LYB index # 261
baseline done for ES index # 262
baseline done for BAX index # 263
baseline done for STT index # 264
baseline done for VRSK index # 265
baseline done for TROW index # 266
baseline done for WBD index # 267
baseline done for AWK index # 268
baseline done for IT index # 269
baseline done for GPN index # 270
baseline done for HRL index # 271
baseline done for FANG index # 272
baseline 

baseline done for NCLH index # 484
baseline done for DXC index # 485
baseline done for GNRC index # 486
baseline done for AIZ index # 487
baseline done for XRAY index # 488
baseline done for LNC index # 489
baseline done for DVA index # 490
baseline done for MHK index # 491
baseline done for LUMN index # 492
baseline done for ALK index # 493
baseline done for NWL index # 494
baseline done for VNO index # 495
baseline done for TAP index # 496


In [115]:
np.mean(baseline)



0.02256406773595633

In [116]:
np.std(baseline)

0.026165766170841074

In [67]:
model = joblib.load(f"../raw_data/models/QCOM_GradientBoostingRegressor_PCA.joblib")

In [None]:
tickers = utils.return_tickers()
y_pred = []
for ticker in tickers:
    model = joblib.load(model, f"model/{ticker}_GradientBoostingRegressor_PCA.joblib")
    y_pred.append(model.predict(df))

In [169]:
ticker = 'AAPL'
X_train = prep.ready_to_train_df(ticker)
y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
X_test = prep.ready_to_test(ticker)
y_test = prep.ready_to_test(ticker)['return'].shift(1).replace(np.nan,0)

In [171]:
model.fit(X_train, y_train)

In [176]:
pd.DataFrame(model.predict(X_test)).rename(columns={0:f"{ticker}"})

Unnamed: 0,AAPL
0,0.013787
1,0.013746
2,-0.001507
3,-0.012839
4,0.016576
...,...
1081,0.011724
1082,-0.007757
1083,0.013579
1084,0.003698


In [178]:
pd.DataFrame()

In [174]:
y_test

0       0.000000
1       0.013885
2      -0.001207
3      -0.012976
4       0.016766
          ...   
1081    0.011869
1082   -0.008331
1083    0.012971
1084    0.003782
1085   -0.021680
Name: return, Length: 1086, dtype: float64

In [None]:
tickers = utils.return_tickers()
for ticker in tickers:
    model = XGBRegressor(n_jobs=-1)
    X_train = prep.ready_to_train_df(ticker)
    y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
    X_test = prep.ready_to_test(ticker)
    y_test = prep.ready_to_test(ticker)['return'].shift(1).replace(np.nan,0)
    model.fit(X_train, y_train)
    pred_ticker = pd.DataFrame(model.predict(X_test))
    