In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from dynamic_portfolio.utils import load_csv
from dynamic_portfolio.utils import features_creation, clean_data
from dynamic_portfolio.preprocess import scaler

# Loading data and creating clean dataframe

In [None]:
df = features_creation('META')

In [None]:
pd.set_option('display.max_columns', None)

# Cross vals

We use sklearn's time series split to break up the data in different folds
We use a sklearn example to run the first tests.

Metrics used : 
 - rmse
 - mae
 - R2

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
ts_cv = TimeSeriesSplit(
    n_splits=20,
    gap=0,
    max_train_size=252,
    test_size=45,
)

In [None]:
all_splits = list(ts_cv.split(X, y))

In [11]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
model = HistGradientBoostingRegressor()

In [None]:
rmse = []
for train_index, test_index in ts_cv.split(X):
    cv_train, cv_test = X.iloc[train_index], X.iloc[test_index]
    model = model.fit(X,y)
    predictions = model.predict(cv_test)
    true_values = cv_test['return']
    rmse.append(np.sqrt(mean_squared_error(true_values, predictions)))

In [None]:
np.mean(rmse)*100

In [None]:
def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.6f} +/- {mae.std():.6f}\n"
        f"Root Mean Squared Error: {rmse.mean():.6f} +/- {rmse.std():.6f}"
    )


In [None]:
import seaborn as sns

In [None]:
evaluate(model, X, y, ts_cv)

# PCA Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=0.9)
ibm = prep.ready_to_train_df('IBM')
pca.fit(ibm)

In [None]:
n_pcs= pca.n_components_ # get number of component
# get the index of the most important feature on EACH component
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
initial_feature_names = ibm.columns
# get the most important feature names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

In [None]:
most_important_names

## Using PCA analysis on all stocks

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
pipe.get_params()

In [None]:
pipe = make_pipeline(PCA(), GradientBoostingRegressor())

params = {'pca__n_components':0.9,
                'gradientboostingregressor__max_depth':3,
                'gradientboostingregressor__criterion':'friedman_mse',
                'gradientboostingregressor__n_estimators':100,
                'gradientboostingregressor__learning_rate':0.08}
pipe.set_params(**params)

In [None]:
pca_dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
        dict_pca_score[ticker] = cross_validate_ml(prep.ready_to_train_df(ticker), pipe)
        print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

In [None]:
pca_dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
    pca = PCA(n_components=0.9)
    ticker_df = prep.ready_to_train_df(ticker)
    pca.fit(ticker_df)
    n_pcs= pca.n_components_
    most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
    initial_feature_names = ticker_df.columns
    most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
    pca_df = ticker_df[list(np.unique(most_important_names))] 
    returns = ticker_df[['return']]
    final_pca = pd.merge(pca_df, returns, how='outer', left_index=True, right_index=True)
    pca_dict_score[ticker] = cross_validate_ml(final_pca, GradientBoostingRegressor(max_depth = 3, 
                                                                                    criterion='friedman_mse',
                                                                                    learning_rate=0.08,
                                                                                    n_estimators = 100))
    print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

In [None]:
rmse_pca= []
baseline = []
for key in pca_dict_score.keys():
    rmse_pca.append(pca_dict_score[key][0])
    baseline.append(pca_dict_score[key][1])
print(np.mean(rmse_pca), np.mean(baseline) )

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.ylim(ymin=0)
plt.title('cumulated share of explained variance')
plt.xlabel('# of principal component used');

In [1]:
import pandas as pd
import numpy as np
import dynamic_portfolio.preprocess as prep
import dynamic_portfolio.utils as utils
import dynamic_portfolio.cross_validate as cv
from sklearn.ensemble import GradientBoostingRegressor
import warnings
%load_ext autoreload
%autoreload 2
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

### Cross validation functions

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
cross_val = {
    'fold_length' : 252, # Working days for 1 year
    'fold_stride' : 60, # Step between folds, here one quarter
    'train_test_ratio' : 0.7, # Split in fold
    'input_length' : 0, # Number of days to move back from last train_index, here 0
    'horizon' : 1, # Number of days ahead to make prediction, here 1
    'output_length' : 1, # Number of targets wanted
}
#Split the dataset by FOLDS
def get_folds(
    df: pd.DataFrame,
    fold_length = cross_val['fold_length'],
    fold_stride = cross_val['fold_stride']):
    '''
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `fold_length`
    - using `fold_stride` between each fold
    Returns a list of folds, each as a DataFrame
    '''
    folds = []
    for idx in range(0, len(df), fold_stride):
        # Exits the loop as soon as the last fold index would exceed the last index
        if (idx + fold_length) > len(df):
            break
        fold = df.iloc[idx:idx + fold_length, :]
        folds.append(fold)
    return folds
#Split FOLDS by Train et Test
#### FOR ONE FOLD !!!!!
def train_test_split(fold: pd.DataFrame,
                     train_test_ratio = cross_val['train_test_ratio'],
                     input_length = cross_val['input_length']):
    '''
    Returns a train dataframe and a test dataframe (fold_train, fold_test)
    from which one can sample (X,y) sequences.
    df_train should contain all the timesteps until round(train_test_ratio * len(fold))
    '''
    # TRAIN SET
    # ======================
    last_train_idx = round(train_test_ratio * len(fold))
    fold_train = fold.iloc[0:last_train_idx, :]
    # TEST SET
    # ======================
    first_test_idx = last_train_idx - input_length 
    fold_test = fold.iloc[first_test_idx:, :]
    return (fold_train, fold_test)
def cross_validate_ml(df, model) :
    '''
    get_folds() create many FOLDS, train_test_split() create a split on ONE FOLDS.
    The goal of this function is to make splits and sequences on each FOLDS.
    Then, apply a model.
    '''
    folds = get_folds(df, fold_length = cross_val['fold_length'], fold_stride = cross_val['fold_stride']) # 1 - Creating FOLDS
    scores =[]
    baseline = []
    for fold in folds:
        # 2 - CHRONOLOGICAL TRAIN TEST SPLIT of the current FOLD
        (fold_train, fold_test) = train_test_split(fold = fold,
                                                train_test_ratio = cross_val['train_test_ratio'],
                                                input_length = cross_val['input_length'] ,
                                                )
        # 3 - Scanninng fold_train and fold_test for SEQUENCES
        X_train, y_train = fold_train, fold_train[['return']].shift(1).replace(np.nan,0)
        X_test, y_test = fold_test, fold_test[['return']].shift(1).replace(np.nan,0)
        model.fit(X_train, y_train)
        rmse_model = (mean_squared_error(y_test, model.predict(X_test)))**0.5
        scores.append(rmse_model)
        rmse_baseline = mean_squared_error(y_test.iloc[[0]], y_train.iloc[[-1]])**0.5
        baseline.append(rmse_baseline)
    return np.mean(scores), np.mean(baseline)

### Script to run for model on all stocks

### Models used

In [2]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
dict_score = {}
tickers = utils.return_tickers()
for ticker in tickers:
        dict_score[ticker] = cross_validate_ml(prep.ready_to_train_df(ticker), XGBRegressor())
        print(f"done for ticker {ticker} index # {tickers.index(ticker)}")

### Cross val scoring

In [None]:
rmse= []
baseline = []
for key in dict_score.keys():
    rmse.append(dict_score[key][0])
    baseline.append(dict_score[key][1])
print(np.mean(rmse), np.mean(baseline) )

## Custom grid search

In [None]:
max_depth = [2, 5, 8]
criterion = ['friedman_mse', 'squared_error', 'mse']
learning_rate=[0.01, 0.1, 0.2] 
n_estimators=[100, 200, 500]

In [None]:
def custom_gridsearch(df, model, max_depth=[2,3,4], criterion = ['friedman_mse', 'squared_error', 'mse'], n_estimator=[50, 75, 100], learning_rate=[0.08, 0.1, 0.12], loss=['squared_error', 'absolute_error', 'huber']):
    counter = 0
    rmse = []
    baseline = []
    params = []
    for max_depth_i in max_depth:
        for criterion_i in criterion:
            for n_estimator_i in n_estimator:
                for learning_rate_i in learning_rate:
                    for loss_i in loss:
                        test = cross_validate_ml(df = df, model = model(max_depth=max_depth_i,
                                                                   criterion = criterion_i,
                                                                   n_estimators = n_estimator_i,
                                                                   learning_rate = learning_rate_i,
                                                                   loss = loss_i))
                        rmse.append(test[0])
                        baseline.append(test[1])
                        params.append((max_depth_i, criterion_i, n_estimator_i, learning_rate_i))
                        counter += 1
                        print(f'model {counter} done with parameters: max_depth = {max_depth_i}, criterion = {criterion_i}, estimators = {n_estimator_i}, learning rate = {learning_rate_i}, loss = {loss_i}, rmse = {test[0]}')
    idx_min = np.argmin(rmse)
    best_params = params[idx_min]
    
    return best_params, rmse, params

### Model results on one stock on a grid search

In [None]:
gs_IP = custom_gridsearch(prep.ready_to_train_df('IP'), model = GradientBoostingRegressor)

In [None]:
gs_IP

In [None]:
np.mean(baseline)

### Saving models

In [None]:
pipe = make_pipeline(PCA(), GradientBoostingRegressor())

params = {'pca__n_components':0.9,
                'gradientboostingregressor__max_depth':,
                'gradientboostingregressor__criterion':'friedman_mse',
                'gradientboostingregressor__n_estimators':100,
                'gradientboostingregressor__learning_rate':0.1}
pipe.set_params(**params)

In [None]:
xgb_pipe = make_pipeline(PCA(), XGBRegressor())

params = {'pca__n_components':0.9,
                'xgbregressor__max_depth': 5,
                'xgbregressor__max_leaves':0,
                'xgbregressor__n_estimators':100,
                'xgbregressor__learning_rate':0.1}


In [None]:
model = XGBRegressor()

In [None]:
model.get_params()

In [None]:
xgb_pipe.set_params(**params)

In [59]:
import joblib

In [None]:
tickers = utils.return_tickers()
for ticker in tickers:
    model = pipe
    df = prep.ready_to_train_df(ticker)
    model.fit(df, df['return'])
    joblib.dump(model, f"../raw_data/models/{ticker}_XGBoostDefault.joblib")
    print(f"Model {ticker} index # {tickers.index(ticker)} saved")

In [None]:
tickers = utils.return_tickers()
scores = []
for ticker in tickers:
    model = GradientBoostingRegressor()
    X_train = prep.ready_to_train_df(ticker)
    y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
    X_test = prep.ready_to_test(ticker)
    y_test = prep.ready_to_test(ticker)['return'].shift(1).replace(np.nan,0)
    model.fit(X_train, y_train)
    error = (mean_squared_error(y_test, model.predict(X_test)))**0.5
    scores.append(error)
    print(f"error for {ticker}: {error}, index # {tickers.index(ticker)}")

In [None]:
np.mean(scores)

In [None]:
np.std(scores)

## Baseline calculation

In [None]:
tickers = utils.return_tickers()
baseline = []
for ticker in tickers:
    df = prep.ready_to_test(ticker)
    for i in range(1, len(df), 1):
        error = (mean_squared_error(df.loc[i,['return']], df.loc[i-1,['return']]))**0.5
        baseline.append(error)
    print(f"baseline done for {ticker} index # {tickers.index(ticker)}")

In [None]:
np.mean(baseline)



In [None]:
np.std(baseline)

In [None]:
model = joblib.load(f"../raw_data/models/QCOM_GradientBoostingRegressor_PCA.joblib")

In [None]:
tickers = utils.return_tickers()
y_pred = []
for ticker in tickers:
    model = joblib.load(model, f"model/{ticker}_GradientBoostingRegressor_PCA.joblib")
    y_pred.append(model.predict(df))

In [35]:
ticker = 'WTW'
X_train = prep.ready_to_train_df(ticker)
y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
X_test = prep.ready_to_test_df(ticker)
y_test = prep.ready_to_test_df(ticker)['return'].shift(1).replace(np.nan,0)

In [36]:
model = GradientBoostingRegressor()

In [37]:
model.fit(X_train, y_train)

In [38]:
pred_wtw = pd.DataFrame(model.predict(X_test), columns=[f"{ticker}"], index = X_test.index)

In [47]:
df_1 = pd.concat([df, pred_wtw], axis = 1)

In [48]:
df_1

Unnamed: 0_level_0,WTW
date,Unnamed: 1_level_1
2021-07-16,-0.004304
2021-07-19,0.002006
2021-07-20,-0.027596
2021-07-21,0.019297
2021-07-22,0.022737
...,...
2022-11-16,0.008243
2022-11-17,0.006935
2022-11-18,-0.000674
2022-11-21,0.007739


In [52]:
temp = [pred_wtw, pred_zts, pred]

In [55]:
df_3 = pd.concat(temp, axis=1)

In [56]:
df_3

Unnamed: 0_level_0,WTW,ZTS,AAPL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-07-09,,,0.013787
2018-07-10,,,0.013746
2018-07-11,,,-0.001507
2018-07-12,,,-0.012839
2018-07-13,,,0.016576
...,...,...,...
2022-11-16,0.008243,0.024363,0.011724
2022-11-17,0.006935,-0.010172,-0.007757
2022-11-18,-0.000674,-0.031445,0.013579
2022-11-21,0.007739,0.015109,0.003698


In [39]:
pred_wtw

Unnamed: 0_level_0,WTW
date,Unnamed: 1_level_1
2021-07-16,-0.004304
2021-07-19,0.002006
2021-07-20,-0.027596
2021-07-21,0.019297
2021-07-22,0.022737
...,...
2022-11-16,0.008243
2022-11-17,0.006935
2022-11-18,-0.000674
2022-11-21,0.007739


In [34]:
pred_zts

Unnamed: 0_level_0,ZTS
date,Unnamed: 1_level_1
2020-12-24,-0.011570
2020-12-28,0.005519
2020-12-29,0.009949
2020-12-30,0.004380
2020-12-31,0.006858
...,...
2022-11-16,0.024363
2022-11-17,-0.010172
2022-11-18,-0.031445
2022-11-21,0.015109


In [26]:
y_test.index

DatetimeIndex(['2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12',
               '2018-07-13', '2018-07-16', '2018-07-17', '2018-07-18',
               '2018-07-19', '2018-07-20',
               ...
               '2022-11-08', '2022-11-09', '2022-11-10', '2022-11-14',
               '2022-11-15', '2022-11-16', '2022-11-17', '2022-11-18',
               '2022-11-21', '2022-11-22'],
              dtype='datetime64[ns]', name='date', length=1086, freq=None)

In [29]:
pred_zts = pd.DataFrame(model.predict(X_test), columns=[f"{ticker}"], index = X_test.index)

In [30]:
pred

Unnamed: 0_level_0,AAPL
date,Unnamed: 1_level_1
2018-07-09,0.013787
2018-07-10,0.013746
2018-07-11,-0.001507
2018-07-12,-0.012839
2018-07-13,0.016576
...,...
2022-11-16,0.011724
2022-11-17,-0.007757
2022-11-18,0.013579
2022-11-21,0.003698


In [60]:
tickers = utils.return_tickers()
preds = []
for ticker in tickers:
    model = XGBRegressor(n_jobs=-1)
    X_train = prep.ready_to_train_df(ticker)
    y_train = prep.ready_to_train_df(ticker)['return'].shift(1).replace(np.nan,0)
    X_test = prep.ready_to_test_df(ticker)
    y_test = prep.ready_to_test_df(ticker)['return'].shift(1).replace(np.nan,0)
    model.fit(X_train, y_train)
    joblib.dump(model, f"../raw_data/models/{ticker}_XGBoostDefault.joblib")
    pred_ticker = pd.DataFrame(model.predict(X_test),columns=[f"{ticker}"], index = X_test.index)
    preds.append(pred_ticker)
    print(f"ticker {ticker} done index # {tickers.index(ticker)}")

final_df = pd.concat(preds, axis=1)

    
    
    
    

ticker AAPL done index # 0
ticker MSFT done index # 1
ticker GOOG done index # 2
ticker AMZN done index # 3
ticker TSLA done index # 4
ticker UNH done index # 5
ticker XOM done index # 6
ticker JNJ done index # 7
ticker WMT done index # 8
ticker NVDA done index # 9
ticker JPM done index # 10
ticker V done index # 11
ticker CVX done index # 12
ticker PG done index # 13
ticker LLY done index # 14
ticker MA done index # 15
ticker HD done index # 16
ticker META done index # 17
ticker BAC done index # 18
ticker ABBV done index # 19
ticker PFE done index # 20
ticker KO done index # 21
ticker MRK done index # 22
ticker PEP done index # 23
ticker COST done index # 24
ticker ORCL done index # 25
ticker AVGO done index # 26
ticker TMO done index # 27
ticker MCD done index # 28
ticker CSCO done index # 29
ticker ACN done index # 30
ticker DHR done index # 31
ticker TMUS done index # 32
ticker ABT done index # 33
ticker WFC done index # 34
ticker DIS done index # 35
ticker LIN done index # 36
tick

ticker MKC done index # 296
ticker ETR done index # 297
ticker LUV done index # 298
ticker ULTA done index # 299
ticker AEE done index # 300
ticker MLM done index # 301
ticker FE done index # 302
ticker PFG done index # 303
ticker FRC done index # 304
ticker DTE done index # 305
ticker DAL done index # 306
ticker HBAN done index # 307
ticker IR done index # 308
ticker CTRA done index # 309
ticker ANSS done index # 310
ticker ACGL done index # 311
ticker PPL done index # 312
ticker RF done index # 313
ticker VRSN done index # 314
ticker LH done index # 315
ticker EXR done index # 316
ticker PWR done index # 317
ticker CF done index # 318
ticker CAH done index # 319
ticker CFG done index # 320
ticker XYL done index # 321
ticker HPE done index # 322
ticker EPAM done index # 323
ticker DOV done index # 324
ticker WAT done index # 325
ticker WRB done index # 326
ticker TDY done index # 327
ticker PAYC done index # 328
ticker ROL done index # 329
ticker NTRS done index # 330
ticker MRO done 

In [62]:
final_df.to_csv('first_preds.csv')

In [None]:
mask = final_df['date'] >= '2020-12-31'