In [8]:
import os
import time
import datetime
sep = "-*-*-"

os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPU

import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import pandas_datareader.data as web
import numpy as np

# visuals.
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# processing / validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# keras/tf
# %tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
print(tf.__version__)

# models
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.ensemble import VotingRegressor

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# constant seed for reproducibility
# SEED is reused further down the notebook as the `random_state` for DataFrame shuffling.
SEED = 111 
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes -- confirm if hash determinism matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)  # seed NumPy's global random state
tf.random.set_seed(SEED)  # seed TensorFlow's global random state

# cpu workers
# In the visible notebook this is only referenced by the commented-out Keras `fit` call.
WORKERS = 6 

2.0.0


In [45]:
def fetch_data(tickers, days, years):
    """Download adjusted closing prices for several tickers into one DataFrame.

    Every ticker is requested through ``web.get_data_yahoo`` for the window
    ``[today - days * years, today]``.  Tickers whose download fails are
    retried; at most six passes are made over the outstanding tickers.

    Parameters
    ----------
    tickers : list of str
        Symbols to download.  The list itself is not modified.
    days : int
        Number of days counted per year (e.g. 365).
    years : int
        Number of years of history; the look-back window is ``days * years`` days.

    Returns
    -------
    pandas.DataFrame
        One column per successfully fetched ticker holding its "Adj Close"
        series, restricted to the rows where every column has a value.
    """
    max_attempts = 6  # the original loop ran for attempt = 0..5

    end = datetime.date.today()
    # Use the function arguments: the previous version read the notebook
    # globals DAYS / YEARS here and silently ignored `days` / `years`.
    start = end - datetime.timedelta(days * years)

    df_raw = pd.DataFrame()
    pending = list(tickers)
    for _ in range(max_attempts):
        if not pending:
            break
        failed = []
        for ticker in pending:
            try:
                prices = web.get_data_yahoo(ticker, start, end).dropna()
                df_raw[ticker] = prices["Adj Close"]
            except Exception as exc:
                # Best-effort retry is kept, but only for ordinary errors:
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                print(ticker, " :failed to fetch data...retrying", f"({exc!r})")
                failed.append(ticker)
        pending = failed

    if pending:
        # Make it visible that the result is missing some requested columns.
        print("giving up on:", pending)

    # missing values
    # BTC market is open all the time whereas Stock and Index markets are closed on weekends.
    # drop missing values caused by this behaviour
    print("Missing Values:")
    print(df_raw.isnull().sum())
    df = df_raw.dropna()

    return df

###################################################################################################################

def process_clf_data(target, seq_len, period, features_type, df):
    """Build a classification dataset (features + up/down label) from a price frame.

    Parameters
    ----------
    target : str
        Column of ``df`` whose future direction is the label.
    seq_len : int
        Number of look-back lags generated per feature column.
    period : int
        How many rows ahead the future price is taken from (``shift(-period)``).
    features_type : {"since", "shifted"}
        "since":   for every column, the percentage change over 1..seq_len rows.
        "shifted": one-row percentage change of every column except ``target``,
                   plus that change shifted back by 1..seq_len rows.
    df : pandas.DataFrame
        Price frame, one column per ticker.

    Returns
    -------
    pandas.DataFrame
        The feature columns, ``"{target}_price_[t]"`` (current price) and
        ``"{target}_Future"``: 1.0 when the price ``period`` rows ahead is
        greater than or equal to the current price, 0.0 otherwise.  Rows that
        cannot be labelled are dropped.

    Raises
    ------
    ValueError
        If ``features_type`` is unknown, or if more rows than expected cannot
        be labelled (which indicates missing prices in ``df``).
    """
    df_pct = pd.DataFrame()  # feature frame, filled column by column

    if features_type == "since":  # changes since previous days
        for col in df.columns:
            for i in range(1, seq_len + 1):
                df_pct[f"{col}_snc_[t-{i}]"] = df[col].pct_change(i)
        df_pct.dropna(inplace=True)

    elif features_type == "shifted":  # shifted changes of previous days
        for col in df.columns:
            if col != target:  # without target's pct_change
                df_pct[col] = df[col].pct_change(1)
        df_pct.dropna(inplace=True)

        # shifted previous; iterate over a snapshot because columns are added
        for col in list(df_pct.columns):
            for i in range(1, seq_len + 1):
                df_pct[f"{col}_sht_[t-{i}]"] = df_pct[col].shift(i)
        df_pct.dropna(inplace=True)
    else:
        raise ValueError("features_type can be either 'since' or 'shifted'.")

    price_col = f"{target}_price_[t]"
    label_col = f"{target}_Future"

    df_pct[price_col] = df[target]  # price [t]
    df_pct[label_col] = df[target].shift(-period)  # future price [t + period]

    price = df_pct[price_col]
    future = df_pct[label_col]
    comparable = price.notna() & future.notna()

    # Shifting by `period` leaves up to |period| rows without a future price.
    # The previous guard allowed only one such row, so every period > 1 raised.
    # max(1, ...) keeps the old tolerance of a single NaN for period == 0.
    allowed = max(1, abs(period))
    unlabeled = int((~comparable).sum())
    if unlabeled > allowed:
        raise ValueError(f"More than {allowed} NaN in classifying.")

    # labeling: 1.0 if the future price is >= today's price, else 0.0;
    # rows that cannot be compared stay NaN and are dropped just below.
    df_pct[label_col] = (future >= price).astype(float).where(comparable)

    df_pct.dropna(inplace=True)

    return df_pct

###################################################################################################################

def process_reg_data(target, seq_len, period, features_type, df):
    """Build a regression dataset (features + future price) from a price frame.

    ``features_type`` selects how the features are derived from ``df``:

    * "since":   for every column, the percentage change over 1..seq_len rows.
    * "shifted": one-row percentage change of every column except ``target``,
                 plus that change shifted back by 1..seq_len rows.

    The result additionally holds ``"{target}_price_[t]"`` (current price) and
    ``"{target}_Future"`` (price ``period`` rows ahead).  Rows containing any
    missing value are dropped.  Raises ValueError for an unknown
    ``features_type``.
    """
    if features_type not in ("since", "shifted"):
        raise ValueError("features_type can be either 'since' or 'shifted'.")

    lags = range(1, seq_len + 1)
    features = pd.DataFrame()  # starts empty; the first assigned column sets the index

    if features_type == "since":
        # percentage change of each column relative to `lag` rows earlier
        for name in df.columns:
            series = df[name]
            for lag in lags:
                features[f"{name}_snc_[t-{lag}]"] = series.pct_change(lag)
    else:
        # one-row percentage change of every column but the target ...
        for name in df.columns:
            if name == target:
                continue
            features[name] = df[name].pct_change(1)
        features.dropna(inplace=True)

        # ... followed by lagged copies of those changes
        for name in tuple(features.columns):
            base = features[name]
            for lag in lags:
                features[f"{name}_sht_[t-{lag}]"] = base.shift(lag)
    features.dropna(inplace=True)

    target_prices = df[target]
    features[f"{target}_price_[t]"] = target_prices  # target's price
    features[f"{target}_Future"] = target_prices.shift(-period)  # label

    features.dropna(inplace=True)
    if features.isnull().any().any():
        raise ValueError("null values exist")

    return features

###################################################################################################################

def split_data(forward_test, scaling, split_size, proc_data, target=None):
    """Split a processed frame into train/test features and labels, optionally scaled.

    Parameters
    ----------
    forward_test : bool
        True  -> the last ``1 - split_size`` share of rows (by position) is the
                 test set and the preceding rows, shuffled, are the training set.
        False -> all rows are shuffled and the test set is sampled at random.
    scaling : {"minmax", "standard", "none"}
        Scaler fitted on the training features and applied to both splits.
    split_size : float
        Fraction of rows used for training.
    proc_data : pandas.DataFrame
        Frame holding the feature columns and one ``"<target>_Future"`` label column.
    target : str, optional
        Name used to build the label column ``f"{target}_Future"``.  When
        omitted, the single column whose name ends in ``"_Future"`` is used,
        so the function no longer depends on a ``TARGET`` global.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, test_df)``; the first four are
        NumPy arrays, ``test_df`` is the test split as a DataFrame (with scaled
        feature values when a scaler was requested).

    Raises
    ------
    ValueError
        For a non-boolean ``forward_test``, an unknown ``scaling``, or when the
        label column cannot be determined unambiguously.

    Notes
    -----
    Shuffling uses the module-level ``SEED`` as ``random_state``.
    """
    # Validate the arguments up front so that no work is done on bad input.
    if forward_test not in (True, False):
        raise ValueError("forward_test must be boolean.")

    if scaling == "minmax":
        scaler = MinMaxScaler()
    elif scaling == "standard":
        scaler = StandardScaler()
    elif scaling == "none":
        scaler = None
    else:
        raise ValueError("scaling can be either 'minmax', 'standard' or 'none'.")

    if target is not None:
        label_col = f"{target}_Future"
    else:
        candidates = [c for c in proc_data.columns if str(c).endswith("_Future")]
        if len(candidates) != 1:
            raise ValueError(
                "could not infer the label column: expected exactly one column "
                f"ending in '_Future', found {candidates}; pass `target` explicitly."
            )
        label_col = candidates[0]

    # train/test split
    if forward_test:
        # forward test (recommended): hold out the most recent rows
        cut = int(len(proc_data) * split_size)
        test_df = proc_data.iloc[cut:, :]
        # positional slice, so duplicated index labels cannot remove extra rows
        train_df = proc_data.iloc[:cut, :].sample(frac=1, random_state=SEED)  # shuffle train dataset
    else:
        shuffled = proc_data.sample(frac=1, random_state=SEED)  # shuffle all data
        test_df = shuffled.sample(frac=(1 - split_size), random_state=SEED)  # sample test dataset
        train_df = shuffled.drop(test_df.index)

    # feature/label split
    train_features = train_df.drop(label_col, axis=1)
    test_features = test_df.drop(label_col, axis=1)
    X_train = train_features.values
    X_test = test_features.values
    y_train = train_df[label_col].values
    y_test = test_df[label_col].values

    # scaling: fit on the training features only, then apply to both splits
    if scaler is not None:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # rebuild test_df so that it carries the scaled feature values
        scaled_test = pd.DataFrame(X_test, columns=test_features.columns, index=test_df.index)
        scaled_test[label_col] = y_test
        test_df = scaled_test

    return X_train, X_test, y_train, y_test, test_df

###################################################################################################################

def predict_sample(features_serie, model):
    """Return ``model``'s prediction for one sample.

    ``features_serie`` is copied into an array and reshaped to a single row
    (shape ``(1, n)``) before being passed to ``model.predict``; the first
    element of the returned predictions is handed back.
    """
    single_row = np.reshape(np.array(features_serie), (1, -1))
    predictions = model.predict(single_row)
    return predictions[0]

In [46]:
# tickers = ["BTC-USD", "^DJI", "^GSPC", "MSFT", "AAPL", "AMZN", "FB", "GOOGL", "JPM", "JNJ", "V", "MA", "INTC"]
# Symbols to download; the wider list above is kept commented out for reference.
TICKERS = ["MSFT", "AAPL", "AMZN", "FB"]
# DAYS and YEARS are handed to fetch_data as its `days` / `years` arguments.
DAYS = 365
YEARS = 5
# Download the price history and keep it as the notebook-wide raw frame.
DF = fetch_data(tickers=TICKERS, days=DAYS, years=YEARS)
# Bare expression: display the fetched frame as the cell output.
DF

Missing Values:
MSFT    0
AAPL    0
AMZN    0
FB      0
dtype: int64


Unnamed: 0_level_0,MSFT,AAPL,AMZN,FB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-04-14,37.555172,116.228790,385.109985,83.519997
2015-04-15,38.105190,116.670502,383.450012,82.709999
2015-04-16,38.015030,116.109131,386.040009,82.309998
2015-04-17,37.528114,114.802368,375.559998,80.779999
2015-04-20,38.691284,117.425117,389.510010,83.089996
...,...,...,...,...
2020-04-03,153.830002,241.410004,1906.589966,154.179993
2020-04-06,165.270004,262.470001,1997.589966,165.550003
2020-04-07,163.490005,259.429993,2011.599976,168.830002
2020-04-08,165.130005,266.070007,2043.000000,174.279999


In [94]:
# Experiment configuration for the regression run in the next cell.
TARGET="AAPL"  # column of DF whose future price is the label
SEQ_LEN=10 # previous data: number of look-back lags generated per column
PERIOD=1 # future data: how many rows ahead the label is taken from
FEATURES_TYPE="since"  # "since" or "shifted" (see process_reg_data)
FORWARD_TEST = True  # True: hold out the last rows as the test set
SCALING = "none" # none / minmax / standard
SPLIT_SIZE = 0.70 # training size (fraction of rows)
PATIENCE = 128  # in the visible notebook only the commented-out EarlyStopping uses this

# Build the feature/label frame, then split it into train and test arrays.
processed_data = process_reg_data(target=TARGET, seq_len=SEQ_LEN, period=PERIOD, features_type=FEATURES_TYPE, df=DF)
X_train, X_test, y_train, y_test, test_df = split_data(forward_test=FORWARD_TEST, scaling=SCALING, split_size=SPLIT_SIZE, proc_data=processed_data)
# Sanity output: shapes of every split and the value range of the training features.
print(f"processed shape: {processed_data.shape}", end=f"\n{sep}\n")
print(f"features: train shape: {X_train.shape} | test Shape: {X_test.shape}", end=f"\n{sep}\n")
print(f"labels: train shape: {y_train.shape} | test Shape: {y_test.shape}", end=f"\n{sep}\n")
print(f"X_train Max/Min: {X_train.max()} / {X_train.min()}")
# Bare expression: show the last rows of the test split as the cell output.
test_df.tail()

processed shape: (1247, 42)
-*-*-
features: train shape: (872, 41) | test Shape: (375, 41)
-*-*-
labels: train shape: (872,) | test Shape: (375,)
-*-*-
X_train Max/Min: 227.3003387451172 / -0.24125281207473048


Unnamed: 0_level_0,MSFT_snc_[t-1],MSFT_snc_[t-2],MSFT_snc_[t-3],MSFT_snc_[t-4],MSFT_snc_[t-5],MSFT_snc_[t-6],MSFT_snc_[t-7],MSFT_snc_[t-8],MSFT_snc_[t-9],MSFT_snc_[t-10],AAPL_snc_[t-1],AAPL_snc_[t-2],AAPL_snc_[t-3],AAPL_snc_[t-4],AAPL_snc_[t-5],AAPL_snc_[t-6],AAPL_snc_[t-7],AAPL_snc_[t-8],AAPL_snc_[t-9],AAPL_snc_[t-10],AMZN_snc_[t-1],AMZN_snc_[t-2],AMZN_snc_[t-3],AMZN_snc_[t-4],AMZN_snc_[t-5],AMZN_snc_[t-6],AMZN_snc_[t-7],AMZN_snc_[t-8],AMZN_snc_[t-9],AMZN_snc_[t-10],FB_snc_[t-1],FB_snc_[t-2],FB_snc_[t-3],FB_snc_[t-4],FB_snc_[t-5],FB_snc_[t-6],FB_snc_[t-7],FB_snc_[t-8],FB_snc_[t-9],FB_snc_[t-10],AAPL_price_[t],AAPL_Future
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
2020-04-02,0.020709,-0.015535,-0.031018,0.037141,-0.005445,0.056766,0.04665,0.141786,0.130397,0.08794,0.016687,-0.036808,-0.038774,-0.011343,-0.052275,-0.002403,-0.007899,0.091634,0.068443,0.000613,0.005834,-0.015843,-0.022974,0.009857,-0.018747,0.017494,-0.010963,0.008409,0.039402,0.02015,-0.008835,-0.051619,-0.046761,0.008929,-0.031529,0.012675,-0.017331,0.06813,0.056502,0.033044,244.929993,241.410004
2020-04-03,-0.00921,0.011308,-0.024602,-0.039943,0.027589,-0.014605,0.047032,0.03701,0.131269,0.119985,-0.014371,0.002075,-0.050651,-0.052588,-0.025551,-0.065895,-0.01674,-0.022157,0.075946,0.053088,-0.006379,-0.000582,-0.022121,-0.029206,0.003416,-0.025007,0.011003,-0.017272,0.001976,0.032772,-0.025349,-0.03396,-0.07566,-0.070925,-0.016646,-0.056079,-0.012995,-0.042241,0.041053,0.02972,241.410004,262.470001
2020-04-06,0.074368,0.064473,0.086516,0.047936,0.031455,0.104008,0.058677,0.124898,0.11413,0.215399,0.087237,0.071612,0.089494,0.032168,0.030062,0.059457,0.015594,0.069037,0.063148,0.169809,0.047729,0.041046,0.04712,0.024552,0.017129,0.051308,0.021529,0.059257,0.029632,0.0498,0.073745,0.046526,0.037281,-0.007494,-0.00241,0.055871,0.01353,0.059791,0.028389,0.117826,262.470001,259.429993
2020-04-07,-0.01077,0.062797,0.053008,0.074814,0.03665,0.020346,0.092118,0.047274,0.112783,0.10213,-0.011582,0.074645,0.059201,0.076875,0.020213,0.018131,0.047187,0.003831,0.056655,0.050834,0.007013,0.055077,0.048347,0.054464,0.031738,0.024262,0.058681,0.028694,0.066686,0.036854,0.019813,0.095019,0.067261,0.057832,0.01217,0.017355,0.076791,0.033611,0.080789,0.048764,259.429993,266.070007
2020-04-08,0.010031,-0.000847,0.073458,0.063571,0.085596,0.047048,0.030581,0.103073,0.05778,0.123945,0.025595,0.013716,0.10215,0.08631,0.104437,0.046325,0.04419,0.073989,0.029523,0.0837,0.015609,0.022732,0.071547,0.064711,0.070923,0.047843,0.040251,0.075207,0.044751,0.083337,0.032281,0.052733,0.130367,0.101713,0.09198,0.044844,0.050196,0.111551,0.066977,0.115678,266.070007,267.98999


In [95]:
# model
# Fit a linear-kernel support vector regressor on the training split and
# report its mean squared error and R^2 on the test split.
svmReg = svm.SVR(kernel='linear').fit(X_train, y_train)
svmPreds = svmReg.predict(X_test)
print(mean_squared_error(y_test, svmPreds))
print(r2_score(y_test, svmPreds))

# Directional evaluation: does the predicted future price lie on the same side
# of today's price as the actual future price?
# svmPreds is row-aligned with test_df (X_test holds test_df's feature values),
# so the predictions are reused instead of calling the model once per row.
eval_df = pd.DataFrame({f"future_{TARGET}_prediction": svmPreds}, index=test_df.index)
# Today's price and the actual future price come from the unscaled processed frame.
eval_df = pd.concat([eval_df, processed_data[[f"{TARGET}_price_[t]", f"{TARGET}_Future"]]], axis=1)
eval_df.columns = ['future_pred', "todays_price", "future_actual"]
eval_df.dropna(inplace=True)  # keeps only the rows that have a prediction (the test rows)
# 1 = price goes up or stays flat, 0 = price goes down (vectorised comparisons)
eval_df["actual_signal"] = (eval_df["future_actual"] >= eval_df["todays_price"]).astype(int)
eval_df["pred_signal"] = (eval_df["future_pred"] >= eval_df["todays_price"]).astype(int)
print(eval_df["pred_signal"].value_counts())
print(classification_report(eval_df["actual_signal"], eval_df["pred_signal"]))
eval_df

33.21569038174827
0.9852855353613694
1    329
0     46
Name: pred_signal, dtype: int64
              precision    recall  f1-score   support

           0       0.59      0.16      0.25       173
           1       0.56      0.91      0.69       202

    accuracy                           0.56       375
   macro avg       0.57      0.53      0.47       375
weighted avg       0.57      0.56      0.49       375



Unnamed: 0_level_0,future_pred,todays_price,future_actual,actual_signal,pred_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-11,210.658538,210.042480,217.545044,1,1
2018-10-12,218.008168,217.545044,212.892654,0,1
2018-10-15,213.392742,212.892654,217.584213,1,1
2018-10-16,217.849353,217.584213,216.643967,0,1
2018-10-17,216.915687,216.643967,211.580200,0,1
...,...,...,...,...,...
2020-04-02,244.996602,244.929993,241.410004,0,1
2020-04-03,241.580078,241.410004,262.470001,1,1
2020-04-06,262.364739,262.470001,259.429993,0,0
2020-04-07,259.576327,259.429993,266.070007,1,1


In [68]:
# svmClf = svm.SVC(kernel='rbf').fit(X_train, y_train)
# svmClfPreds = svmClf.predict(X_test)
# print(classification_report(y_test, svmClfPreds))

In [None]:
# def fit_ann():
#     dnnReg = Sequential()

#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))
#     dnnReg.add(Dense(100, activation="relu"))

#     dnnReg.add(Dense(1))

#     dnnReg.compile(optimizer="adam", loss="mse")

#     early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=PATIENCE)

#     dnnReg.fit(x=X_train, y=y_train, epochs=1000,validation_data=(X_test,y_test),use_multiprocessing=True, workers=WORKERS, callbacks=[early_stop])
#     return dnnReg

# if False:
#     dnnReg = fit_ann()
#     pd.DataFrame(dnnReg.history.history).plot()
#     dnnPreds = dnnReg.predict(X_test)
#     print(sep*5)
#     print(mean_squared_error(y_test, dnnPreds))
#     print(r2_score(y_test, dnnPreds))


In [None]:
# # plot timeseries, SMA, signals
# fig = go.Figure()
# fig.add_trace(go.Scatter(
#     x=test_df.index,
#     y=y_test,
#     name=f"Actual",
# #     line_color='#c761ff',
#     line=dict(width=2, dash="solid"),
#     opacity=0.6
#     )
# )
# fig.add_trace(go.Scatter(
#     x=test_df.index,
#     y=svmPreds,
#     name=f"Predicted",
# #     line_color='#c761ff',
#     line=dict(width=2, dash="dot"),
#     opacity=1
#     )
# )




# # fig.update_layout(
# #     title=f'{STOCK} Daily Chart',
# #     xaxis_title='Date',
# #     yaxis_title='Price ($)',
# #     template="plotly_dark",
# # )



# fig.show()

In [None]:
# # models:

# gboostReg = GradientBoostingRegressor(n_estimators=10000, learning_rate=0.01, max_depth=1,  loss='ls', random_state=SEED).fit(X_train, y_train)
# gboostPredds = gboostReg.predict(X_test)
# print(mean_squared_error(y_test, gboostPredds))
# print(r2_score(y_test, gboostPredds))


# dtReg = tree.DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)
# dtPredds = dtReg.predict(X_test)
# print(mean_squared_error(y_test, dtPredds))
# print(r2_score(y_test, dtPredds))

# ensReg = VotingRegressor(estimators=
#                          [
#                              ('svmReg', svmReg),
#                              ('gboost', gboostReg),
#                              ('dtReg', dtReg)
#                          ]
#                         ).fit(X_train, y_train)
# print(mean_squared_error(y_test, ensReg.predict(X_test)))
# print(r2_score(y_test, ensReg.predict(X_test)))

###################################################################################################################
# # visualisations:

# # correlation heatmap
# # plt.figure(figsize=(15,7))
# corr = df.corr()
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax1 = sns.heatmap(corr, mask=mask, square=False,cmap="coolwarm").set_title("Percent Change Correlation Heatmap")       
# ###


# # distributions
# fig = ff.create_distplot([df.pct_change().dropna()[c] for c in df.pct_change().columns], df.pct_change().columns, show_rug=False, show_hist=False)
# fig.update_layout(
#     title=f'Daily Return Distribution')
# fig.show()
# ###


# # prices subplots
# fig = make_subplots(rows=df.shape[1], cols=1, start_cell="bottom-left",     subplot_titles=df.columns,shared_xaxes=True)

# i, j = 1, 1
# for col in df.columns:
#     fig.add_trace(go.Scatter(x=df.index, y=df[col],name=col), row=i, col=1)
#     i += 1
# #     if j != 4: i += 1
# #     else:
# #         i += 1
# #         j = 1

# fig.update_layout(
#     title=f' Daily Chart',
#     xaxis_title='Date',
#     yaxis_title='Price ($)',
# #     xaxis=dict(position=1)
# #     template="plotly_dark",
# )
# fig.update_layout(
#     autosize=True,
# #      width=1500,
#     height=2000,
#     margin=dict(
#         l=50,
#         r=50,
#         b=100,
#         t=100,
#         pad=4
#     ),
# #     paper_bgcolor="LightSteelBlue",
# )

# fig.show()