In [85]:
import os
import time
import datetime
from tqdm import tqdm
sep = "-*-*-"

os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPU

import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import pandas_datareader.data as web
import numpy as np

# visuals.
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# processing / validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# keras/tf
# %tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
print(tf.__version__)

# models
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.ensemble import VotingRegressor

# metrics
from sklearn.metrics import mean_squared_error, r2_score # reg metrics
from sklearn.metrics import classification_report, roc_auc_score, roc_auc_score,\
accuracy_score, precision_score, average_precision_score, balanced_accuracy_score# clf metrics

# constant seed for reproducibility
# The same integer seeds Python hashing (via the environment), NumPy's global
# RNG and TensorFlow's global RNG; split_data also uses it as `random_state`.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here may only affect child processes -- confirm.
SEED = 111 
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# cpu workers
# Only referenced by the commented-out Keras `fit(...)` call further down.
WORKERS = 6 

2.0.0


In [23]:
def fetch_data(tickers, days, years):
    """Download adjusted close prices for several tickers, aligned on shared dates.

    Parameters
    ----------
    tickers : iterable of str
        Symbols passed one by one to ``web.get_data_yahoo``.
    days : int
        Number of days counted per year of history (e.g. 365).
    years : int
        Number of years of history. The download window starts
        ``days * years`` days before today and ends today.

    Returns
    -------
    pandas.DataFrame
        One "Adj Close" column per ticker that could be fetched, restricted
        to the rows in which every column has a value.

    Notes
    -----
    Each ticker is tried at most six times (one try per pass). Tickers that
    still fail after the last pass are reported and left out of the result.
    """
    today = datetime.date.today()
    # Use the function arguments: the previous version read the notebook-level
    # DAYS / YEARS globals here and silently ignored `days` and `years`.
    start = today - datetime.timedelta(days * years)

    df_raw = pd.DataFrame()
    pending = list(tickers)
    max_passes = 6  # same retry budget as the original `attempt <= 5` loop

    for _ in range(max_passes):
        if not pending:
            break
        failed = []
        for ticker in pending:
            try:
                temp = web.get_data_yahoo(ticker, start, today)
                temp = temp.dropna()
                df_raw[ticker] = temp["Adj Close"]
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C still interrupts
                # the download; the reason is shown to make failures debuggable.
                print(ticker, " :failed to fetch data...retrying", f"({exc!r})")
                failed.append(ticker)
        pending = failed

    if pending:
        print("Giving up on:", pending)

    # missing values
    # BTC market is open all the time whereas Stock and Index markets are closed on weekends.
    # drop missing values caused by this behaviour
    print("Missing Values:")
    print(df_raw.isnull().sum())
    df = df_raw.dropna()

    return df

###################################################################################################################

def process_clf_data(target, seq_len, period, features_type, df):
    """Build a classification dataset: percentage-change features plus an up/down label.

    Parameters
    ----------
    target : str
        Column of ``df`` whose future direction is the label.
    seq_len : int
        Number of look-back steps used to build features (1..seq_len).
    period : int
        Forecast horizon: the label compares the price ``period`` rows ahead
        with the current price.
    features_type : {"since", "shifted"}
        "since"   - for every column, the change relative to 1..seq_len rows back.
        "shifted" - one-step changes of every non-target column, plus those
                    changes shifted back by 1..seq_len rows.
    df : pandas.DataFrame
        Price table, one column per series.

    Returns
    -------
    pandas.DataFrame
        Feature columns, ``"{target}_price_[t]"`` (current price) and
        ``"{target}_Future"`` (1.0 if the future price is >= the current
        price, 0.0 otherwise). Rows that cannot be labelled are dropped.

    Raises
    ------
    ValueError
        If ``features_type`` is unknown, or if more rows are unlabelled than
        the forecast horizon can explain.
    """
    df_pct = pd.DataFrame()  # blank dataframe

    if features_type == "since":  # changes since previous days
        for col in df.columns:
            for i in range(1, seq_len + 1):
                df_pct[f"{col}_snc_[t-{i}]"] = df[col].pct_change(i)
        df_pct.dropna(inplace=True)

    elif features_type == "shifted":  # shifted changes of previous days
        for col in df.columns:
            if col != target:  # without target's pct_change
                df_pct[col] = df[col].pct_change(1)
        df_pct.dropna(inplace=True)

        # shifted previous (iterate over a snapshot: columns are added in the loop)
        for col in list(df_pct.columns):
            for i in range(1, seq_len + 1):
                df_pct[f"{col}_sht_[t-{i}]"] = df_pct[col].shift(i)
        df_pct.dropna(inplace=True)
    else:
        raise ValueError("features_type can be either 'since' or 'shifted'.")

    price_col = f"{target}_price_[t]"
    label_col = f"{target}_Future"

    df_pct[price_col] = df[target]  # price [t]

    # labeling
    # future price [t + period]; the assignment aligns on df_pct's index
    df_pct[label_col] = df[target].shift(-period)

    price_now = df_pct[price_col]
    price_future = df_pct[label_col]
    # 1 = price did not fall, 0 = price fell, NaN = one of the prices is missing
    labels = np.where(
        price_future >= price_now,
        1.0,
        np.where(price_future < price_now, 0.0, np.nan),
    )
    df_pct[label_col] = labels  # classify

    # Shifting by -period always leaves the last `period` rows without a
    # future price, so that many unlabelled rows are expected. The previous
    # hard-coded limit of 1 made every period >= 2 call raise. The old
    # tolerance of one row is kept as a floor for backward compatibility.
    unlabeled = int(np.isnan(labels).sum())
    allowed = max(period, 1)
    if unlabeled > allowed:
        raise ValueError(f"More than {allowed} NaN in classifying.")

    df_pct.dropna(inplace=True)
    if df_pct.isnull().any().any():
        raise ValueError("null values exist")

    return df_pct

###################################################################################################################

def process_reg_data(target, seq_len, period, features_type, df):
    """Build a regression dataset: percentage-change features plus the future price.

    ``features_type`` selects how features are derived from ``df``:

    * ``"since"``   - for every column, the change relative to each of the
      previous 1..``seq_len`` rows.
    * ``"shifted"`` - the one-step change of every column except ``target``,
      together with those changes shifted back by 1..``seq_len`` rows.

    The result also carries ``"{target}_price_[t]"`` (current price) and
    ``"{target}_Future"`` (price ``period`` rows ahead). Rows containing any
    missing value are dropped. Any other ``features_type`` raises ValueError.
    """
    if features_type != "since" and features_type != "shifted":
        raise ValueError("features_type can be either 'since' or 'shifted'.")

    price_name = f"{target}_price_[t]"
    future_name = f"{target}_Future"
    frame = pd.DataFrame()  # starts empty, columns are appended below

    if features_type == "since":
        # change of each series versus 1..seq_len rows earlier
        for name in df.columns:
            for lag in range(1, seq_len + 1):
                frame[f"{name}_snc_[t-{lag}]"] = df[name].pct_change(lag)
        frame.dropna(inplace=True)
    else:
        # one-step change of every series except the target itself
        for name in df.columns:
            if name == target:
                continue
            frame[name] = df[name].pct_change(1)
        frame.dropna(inplace=True)

        # lagged copies of those one-step changes
        base_names = list(frame.columns)
        for name in base_names:
            for lag in range(1, seq_len + 1):
                frame[f"{name}_sht_[t-{lag}]"] = frame[name].shift(lag)
        frame.dropna(inplace=True)

    # current price of the target
    frame[price_name] = df[target]

    # label: the target's price `period` rows into the future
    frame[future_name] = df[target].shift(-period)

    frame.dropna(inplace=True)
    has_nulls = frame.isnull().any().any()
    if has_nulls:
        raise ValueError("null values exist")

    return frame

###################################################################################################################

def _label_column(proc_data, target=None):
    """Return the name of the label column of a processed frame."""
    if target is not None:
        return f"{target}_Future"
    candidates = [c for c in proc_data.columns if str(c).endswith("_Future")]
    if len(candidates) != 1:
        raise ValueError(
            f"expected exactly one '*_Future' label column, found {candidates}; "
            "pass target= explicitly."
        )
    return candidates[0]


def split_data(forward_test, scaling, split_size, proc_data, target=None):
    """Split a processed frame into train/test features and labels, optionally scaled.

    Parameters
    ----------
    forward_test : bool
        True  - the first ``split_size`` share of rows is the training set
                (shuffled), the remaining tail is the test set (recommended).
        False - rows are shuffled and the test set is sampled at random.
    scaling : {"minmax", "standard", "none"}
        Scaler fitted on the training features and applied to both sets.
    split_size : float
        Share of rows used for training.
    proc_data : pandas.DataFrame
        Output of ``process_reg_data`` / ``process_clf_data``.
    target : str, optional
        Name used to build the label column ``"{target}_Future"``. When
        omitted, the single column ending in ``"_Future"`` is used, so the
        function no longer depends on a notebook-level ``TARGET`` global.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, test_df)``; ``test_df`` holds the
        test rows (with scaled features when a scaler is used).

    Raises
    ------
    ValueError
        On an unknown ``scaling``, a non-boolean ``forward_test`` or an
        ambiguous / missing label column.

    Notes
    -----
    Shuffling uses the notebook-level ``SEED`` as ``random_state``.
    """
    # fail fast, before any splitting work is done
    if scaling not in ("minmax", "standard", "none"):
        raise ValueError("scaling can be either 'minmax', 'standard' or 'none'.")

    label_col = _label_column(proc_data, target)

    # train/test split
    if forward_test == True:
        # forward test (recommended): train on the head, test on the tail
        nth_prcntile = int(len(proc_data) * split_size)
        test_df = proc_data.iloc[nth_prcntile:, :]
        train_df = proc_data.iloc[:nth_prcntile, :]
        train_df = train_df.sample(frac=1, random_state=SEED)  # shuffle train dataset

    elif forward_test == False:
        shuffled = proc_data.sample(frac=(1), random_state=SEED)  # shuffle all data
        test_df = shuffled.sample(frac=(1 - split_size), random_state=SEED)  # sample test dataset
        train_df = shuffled.drop(test_df.index)

    else:
        raise ValueError("forward_test must be boolean.")

    # feature/label split
    X_train = train_df.drop(label_col, axis=1).values
    X_test = test_df.drop(label_col, axis=1).values
    y_train = train_df[label_col].values
    y_test = test_df[label_col].values

    # scaling: fit on the training features only, then rebuild test_df so it
    # carries the scaled feature values
    if scaling != "none":
        scaler = MinMaxScaler() if scaling == "minmax" else StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        feature_cols = proc_data.drop(label_col, axis=1).columns
        x_test_df = pd.DataFrame(X_test, columns=feature_cols, index=test_df.index)
        y_test_df = pd.DataFrame(y_test, columns=[label_col], index=test_df.index)
        test_df = pd.concat([x_test_df, y_test_df], axis=1)

    return X_train, X_test, y_train, y_test, test_df

###################################################################################################################

def predict_sample(features_serie, model):
    """Predict a single sample.

    ``features_serie`` is copied into an array and reshaped into a one-row
    batch, handed to ``model.predict``, and the first element of the
    returned predictions is given back.
    """
    single_row_batch = np.array(features_serie).reshape(1, -1)
    predictions = model.predict(single_row_batch)
    return predictions[0]

In [95]:
# Wider universe kept for reference; only the short TICKERS list below is fetched.
# tickers = ["BTC-USD", "^DJI", "^GSPC", "MSFT", "AAPL", "AMZN", "FB", "GOOGL", "JPM", "JNJ", "V", "MA", "INTC"]
TICKERS = ["MSFT", "AAPL", "AMZN", "FB"]
# History window: DAYS * YEARS days back from today.
DAYS = 365
YEARS = 10
# Downloads the prices (network access); rows where any ticker has no value are dropped.
DF = fetch_data(tickers=TICKERS, days=DAYS, years=YEARS)
DF

Missing Values:
MSFT      0
AAPL      0
AMZN      0
FB      528
dtype: int64


Unnamed: 0_level_0,MSFT,AAPL,AMZN,FB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-05-18,24.394806,65.770210,213.850006,38.230000
2012-05-21,24.794863,69.601997,218.110001,34.029999
2012-05-22,24.803186,69.067520,215.330002,31.000000
2012-05-23,24.261457,70.752777,217.279999,32.000000
2012-05-24,24.228115,70.102989,215.240005,33.029999
...,...,...,...,...
2020-04-03,153.830002,241.410004,1906.589966,154.179993
2020-04-06,165.270004,262.470001,1997.589966,165.550003
2020-04-07,163.490005,259.429993,2011.599976,168.830002
2020-04-08,165.130005,266.070007,2043.000000,174.279999


In [101]:
# Grid search: for every look-back length (SEQ_LEN, 1..99) and forecast horizon
# (PERIOD, 1..4) fit a linear SVR on the first half of the data, predict the
# future price on the second half and score the implied up/down signal.

# Settings that do not change between runs (TARGET stays a notebook-level name).
TARGET = "AAPL"
FEATURES_TYPE = "since"
FORWARD_TEST = True
SCALING = "none"  # none / minmax / standard
SPLIT_SIZE = 0.50  # training size

_metrics = {"AUC": roc_auc_score, "ACC": accuracy_score, "PRECISION": precision_score}

results = []
for seq in tqdm(range(1, 100)):
    for per in range(1, 5):
        SEQ_LEN = seq  # previous data
        PERIOD = per  # future data

        processed_data = process_reg_data(target=TARGET, seq_len=SEQ_LEN, period=PERIOD, features_type=FEATURES_TYPE, df=DF)
        X_train, X_test, y_train, y_test, test_df = split_data(forward_test=FORWARD_TEST, scaling=SCALING, split_size=SPLIT_SIZE, proc_data=processed_data)

        # model
        svmReg = svm.SVR(kernel='linear').fit(X_train, y_train)

        # One batched predict call instead of one call per test row:
        # X_test holds the feature rows of test_df in the same order.
        # Prices are taken from processed_data so they are never scaled.
        eval_df = pd.DataFrame(
            {
                "future_pred": svmReg.predict(X_test),
                "todays_price": processed_data.loc[test_df.index, f"{TARGET}_price_[t]"].values,
                "future_actual": processed_data.loc[test_df.index, f"{TARGET}_Future"].values,
            },
            index=test_df.index,
        )

        # 1 = price at [t + PERIOD] is not below today's price, 0 = it is lower
        eval_df["actual_signal"] = (eval_df["future_actual"] >= eval_df["todays_price"]).astype(int)
        eval_df["pred_signal"] = (eval_df["future_pred"] >= eval_df["todays_price"]).astype(int)

        preds = eval_df["pred_signal"]
        acts = eval_df["actual_signal"]

        res = {"SEQ": seq, "PERIOD": per}
        for k, v in _metrics.items():
            res[k] = v(acts, preds)
        # class balance of the predicted signal (a constant signal makes the scores meaningless)
        res["value_counts"] = preds.value_counts()

        results.append(res)

results = pd.DataFrame(results)
results

100%|██████████| 99/99 [09:23<00:00,  5.69s/it]


Unnamed: 0,SEQ,PERIOD,AUC,ACC,PRECISION,value_counts
0,1,1,0.499656,0.457661,0.500000,"0 988 1 4 Name: pred_signal, dtype: int64"
1,1,2,0.500000,0.585685,0.585685,"1 992 Name: pred_signal, dtype: int64"
2,1,3,0.494522,0.431887,0.579882,"0 822 1 169 Name: pred_signal, dtype: int64"
3,1,4,0.504474,0.417760,0.629630,"0 910 1 81 Name: pred_signal, dtype: int64"
4,2,1,0.503615,0.462702,0.647059,"0 975 1 17 Name: pred_signal, dtype: int64"
...,...,...,...,...,...,...
391,98,4,0.502849,0.519108,0.609576,"1 543 0 399 Name: pred_signal, dtype: int64"
392,99,1,0.499491,0.489926,0.540166,"0 582 1 361 Name: pred_signal, dtype: int64"
393,99,2,0.511882,0.528102,0.595027,"1 563 0 380 Name: pred_signal, dtype: int64"
394,99,3,0.535826,0.537155,0.628099,"1 484 0 458 Name: pred_signal, dtype: int64"


In [107]:
# Rank the grid-search runs: highest AUC first, ties broken by highest ACC.
results.sort_values(["AUC", "ACC"], ascending=False)

Unnamed: 0,SEQ,PERIOD,AUC,ACC,PRECISION,value_counts
233,59,2,0.543406,0.517134,0.645161,"0 622 1 341 Name: pred_signal, dtype: int64"
390,98,3,0.540217,0.541888,0.632444,"1 487 0 456 Name: pred_signal, dtype: int64"
213,54,2,0.538505,0.523316,0.629353,"0 563 1 402 Name: pred_signal, dtype: int64"
269,68,2,0.536889,0.509395,0.638806,"0 623 1 335 Name: pred_signal, dtype: int64"
313,79,2,0.536482,0.525708,0.627358,"0 529 1 424 Name: pred_signal, dtype: int64"
...,...,...,...,...,...,...
65,17,2,0.476912,0.500000,0.569579,"1 618 0 366 Name: pred_signal, dtype: int64"
57,15,2,0.475464,0.510660,0.569767,"1 688 0 297 Name: pred_signal, dtype: int64"
53,14,2,0.475209,0.495431,0.567164,"1 603 0 382 Name: pred_signal, dtype: int64"
49,13,2,0.468426,0.506085,0.564723,"1 703 0 283 Name: pred_signal, dtype: int64"


In [102]:
# # model
# svmReg = svm.SVR(kernel='linear').fit(X_train, y_train)
# svmPreds = svmReg.predict(X_test)
# print(mean_squared_error(y_test, svmPreds))
# print(r2_score(y_test, svmPreds))

# eval_df = pd.DataFrame()
# eval_df[f"future_{TARGET}_prediction"] = test_df.drop(f"{TARGET}_Future",axis=1).apply(predict_sample, args=(svmReg,), axis=1)
# eval_df = pd.concat([eval_df, processed_data[[f"{TARGET}_price_[t]", f"{TARGET}_Future"]]], axis=1)
# eval_df.columns = ['future_pred', "todays_price", "future_actual"]
# eval_df.dropna(inplace=True)
# eval_df["actual_signal"] = eval_df.apply(lambda x: 1 if x["future_actual"] >= x["todays_price"] else 0, axis=1)
# eval_df["pred_signal"] = eval_df.apply(lambda x: 1  if x["future_pred"] >= x["todays_price"] else 0, axis=1)
# print(eval_df["pred_signal"].value_counts())
# print(classification_report(eval_df["actual_signal"], eval_df["pred_signal"]))
# eval_df

In [68]:
# svmClf = svm.SVC(kernel='rbf').fit(X_train, y_train)
# svmClfPreds = svmClf.predict(X_test)
# print(classification_report(y_test, svmClfPreds))

In [None]:
# def fit_ann():
#     dnnReg = Sequential()

#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))

#     dnnReg.add(Dense(1))

#     dnnReg.compile(optimizer="adam", loss="mse")

#     early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=PATIENCE)

#     dnnReg.fit(x=X_train, y=y_train, epochs=1000,validation_data=(X_test,y_test),use_multiprocessing=True, workers=WORKERS, callbacks=[early_stop])
#     return dnnReg

# if False:
#     dnnReg = fit_ann()
#     pd.DataFrame(dnnReg.history.history).plot()
#     dnnPreds = dnnReg.predict(X_test)
#     print(sep*5)
#     print(mean_squared_error(y_test, dnnPreds))
#     print(r2_score(y_test, dnnPreds))


In [None]:
# # plot timeseries, SMA, signals
# fig = go.Figure()
# fig.add_trace(go.Scatter(
#     x=test_df.index,
#     y=y_test,
#     name=f"Actual",
# #     line_color='#c761ff',
#     line=dict(width=2, dash="solid"),
#     opacity=0.6
#     )
# )
# fig.add_trace(go.Scatter(
#     x=test_df.index,
#     y=svmPreds,
#     name=f"Predicted",
# #     line_color='#c761ff',
#     line=dict(width=2, dash="dot"),
#     opacity=1
#     )
# )




# # fig.update_layout(
# #     title=f'{STOCK} Daily Chart',
# #     xaxis_title='Date',
# #     yaxis_title='Price ($)',
# #     template="plotly_dark",
# # )



# fig.show()

In [None]:
# # models:

# gboostReg = GradientBoostingRegressor(n_estimators=10000, learning_rate=0.01, max_depth=1,  loss='ls', random_state=SEED).fit(X_train, y_train)
# gboostPredds = gboostReg.predict(X_test)
# print(mean_squared_error(y_test, gboostPredds))
# print(r2_score(y_test, gboostPredds))


# dtReg = tree.DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)
# dtPredds = dtReg.predict(X_test)
# print(mean_squared_error(y_test, dtPredds))
# print(r2_score(y_test, dtPredds))

# ensReg = VotingRegressor(estimators=
#                          [
#                              ('svmReg', svmReg),
#                              ('gboost', gboostReg),
#                              ('dtReg', dtReg)
#                          ]
#                         ).fit(X_train, y_train)
# print(mean_squared_error(y_test, ensReg.predict(X_test)))
# print(r2_score(y_test, ensReg.predict(X_test)))

###################################################################################################################
# # visualisations:

# # correlation heatmap
# # plt.figure(figsize=(15,7))
# corr = df.corr()
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax1 = sns.heatmap(corr, mask=mask, square=False,cmap="coolwarm").set_title("Percent Change Correlation Heatmap")       
# ###


# # distributions
# fig = ff.create_distplot([df.pct_change().dropna()[c] for c in df.pct_change().columns], df.pct_change().columns, show_rug=False, show_hist=False)
# fig.update_layout(
#     title=f'Daily Return Distribution')
# fig.show()
# ###


# # prices subplots
# fig = make_subplots(rows=df.shape[1], cols=1, start_cell="bottom-left",     subplot_titles=df.columns,shared_xaxes=True)

# i, j = 1, 1
# for col in df.columns:
#     fig.add_trace(go.Scatter(x=df.index, y=df[col],name=col), row=i, col=1)
#     i += 1
# #     if j != 4: i += 1
# #     else:
# #         i += 1
# #         j = 1

# fig.update_layout(
#     title=f' Daily Chart',
#     xaxis_title='Date',
#     yaxis_title='Price ($)',
# #     xaxis=dict(position=1)
# #     template="plotly_dark",
# )
# fig.update_layout(
#     autosize=True,
# #      width=1500,
#     height=2000,
#     margin=dict(
#         l=50,
#         r=50,
#         b=100,
#         t=100,
#         pad=4
#     ),
# #     paper_bgcolor="LightSteelBlue",
# )

# fig.show()