In [None]:
# import libraries for time-series analysis
import os
import numpy as np
import pandas as pd
import seaborn as sns

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

import matplotlib.pyplot as plt
from matplotlib import cm
from pandas import read_csv, set_option

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost
from xgboost import plot_importance, XGBClassifier, XGBRegressor
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import shap

from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from tqdm.autonotebook import tqdm
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import FastICA
from sklearn.manifold import Isomap
from sklearn.manifold import MDS
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import TSNE
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

import time
import warnings
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')
%matplotlib inline

In [None]:
import datetime

In [None]:
# custom converter: timestamp strings from the csv file -> native timezone-aware datetime objects
def datetime_parser(timestamp_str):
    """Parse one timestamp string into a ``datetime.datetime``.

    Parameters
    ----------
    timestamp_str : str
        Text in the form ``YYYY-MM-DD HH:MM:SS`` followed by a UTC offset
        (the ``%z`` directive), e.g. ``"2019-07-07 08:30:00+0000"``.

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime carrying the offset found in the string.

    Raises
    ------
    ValueError
        Raised by ``strptime`` when the text does not match the format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S%z"
    parsed = datetime.datetime.strptime(timestamp_str, timestamp_format)
    return parsed

# define path for train_df1 dataset
path = './data/train_df1.csv'

# for time series data, the datetime is always the index:
# column 0 is parsed as dates with datetime_parser and the 'Timestamp' column becomes the index
# NOTE(review): `date_parser` is reported as deprecated in pandas 2.0 in favour of `date_format`
# -- confirm against the installed pandas version before upgrading
train_df1 = pd.read_csv(path,
                        parse_dates=[0], # parse the first csv column as dates
                        date_parser = datetime_parser,
                        index_col = 'Timestamp')    

# summary of column dtypes and non-null counts for the loaded training frame
train_df1.info()

In [None]:
# import the test_df1 csv file 
path = './data/test_df1.csv'

# for time series data, the datetime is always the index:
# column 0 is parsed as dates with datetime_parser (defined in an earlier cell)
# and the 'Timestamp' column becomes the index, mirroring how train_df1 is loaded
# NOTE(review): `date_parser` is reported as deprecated in pandas 2.0 in favour of `date_format`
# -- confirm against the installed pandas version before upgrading
test_df1 = pd.read_csv(path,
                       parse_dates=[0], # parse the first csv column as dates
                       date_parser = datetime_parser,
                       index_col = 'Timestamp')    

# summary of column dtypes and non-null counts for the loaded test frame
test_df1.info()

### 3. Data Transformation

#### 3.1 Baseline Model Features
Plot a correlation matrix for the baseline dataframe to visualize how strongly each baseline feature is linearly correlated with the target variable (`Signal`).

In [None]:
# customized function for the correlation of features with the target variable
def corrMat(df, 
            target='', 
            figsize=(9,0.5), 
            ret_id=False):
    """Show (or return) the linear correlation of every feature with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``df.corr()``.
    target : str, optional
        Name of the column to correlate every other column against.
    figsize : tuple of (float, float), optional
        Width and height of the heatmap figure. The default is a wide, short
        strip suited to a single-row heatmap.
    ret_id : bool, optional
        When true, skip plotting and return the correlation row instead.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame (row label ``target``, one column per correlated
        feature, values rounded to 3 decimals) when ``ret_id`` is true;
        otherwise ``None`` after the heatmap has been drawn.
    """
    corr_mat = df.corr().round(3)

    # Pick the target row by matching on the correlation matrix's own labels,
    # so the mask always has the right length even if corr() left columns out.
    corr = corr_mat.loc[corr_mat.index == target, :].copy()

    if ret_id:
        return corr

    # define a default colormap (only needed when plotting)
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize = figsize)
    sns.heatmap(corr,
                vmin=-1,
                vmax=1, 
                center=0,
                cmap=cmap,
                square=False,
                lw=2, 
                annot=True,
                cbar=False,
                ax=ax)
    ax.set_title(f'Feature Correlation to {target}')
    

In [None]:
corrMat(train_df1,'Signal',figsize=(7,0.5)) # baseline dataframe feature correlation to Signal

All features have weak negative linear correlations to the target variable (buy/sell signals). 

This is suggestive of a number of things:
`Open`, `High`, `Low`, `Close`, `Volume_(BTC)`, `Volume_(Currency)`, `Weighted_Price`, `SMA` and `LMA` may (1) relate to the target in a highly non-linear way, (2) oscillate stably around a stationary value (a circular scatter), or (3) simply not be the best features for modelling the target variable `Signal`, in which case they can be improved upon and the attention shifts to feature engineering. 

#### 3.2 Feature Engineering - Conversion of Variables into Traditional Technical Indicators
Technical Indicators to measure performance of assets : 
- Moving Averages (windows of 10, 30 and 200 periods, stored as `MA21`, `MA63`, `MA252`)
- Exponential Moving Averages (10, 30, 200)
- Momentum
- Relative Strength Index (RSI)
- Stochastic Oscillators (Slow)
- Stochastic Oscillators (Fast) 

These engineered features will be introduced into the feature matrix. The results will show us which features have the most significant impact on the model's performance, if any. 

In [None]:
# work on copies so the baseline frames (train_df1 / test_df1) stay unchanged
# while the engineered features are added to train_df2 / test_df2
train_df2 = train_df1.copy()  # duplicate dataframes & add features to them
test_df2 = test_df1.copy()

In [None]:
# conversion formula for simple moving averages
# a moving average indicates the trend of the price movement by smoothing out short-term noise

def ma(df, n):
    """Simple moving average of the ``'Close'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Close'`` column.
    n : int
        Window length; rows without a full window of ``n`` values are NaN.

    Returns
    -------
    pandas.Series
        Rolling mean named ``'MA_<n>'``, indexed like ``df``.
    """
    window_mean = df['Close'].rolling(window=n, min_periods=n).mean()
    return pd.Series(window_mean, name=f'MA_{n}')


In [None]:
# conversion formula for exponentially weighted moving averages

def exp_ma(df, n):
    """Exponentially weighted moving average of the ``'Close'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Close'`` column.
    n : int
        Used both as the EWM ``span`` and as ``min_periods``, so the first
        ``n - 1`` rows are NaN.

    Returns
    -------
    pandas.Series
        Exponentially weighted mean named ``'EMA_<n>'``, indexed like ``df``.
    """
    weighted_mean = df['Close'].ewm(span=n, min_periods=n).mean()
    return pd.Series(weighted_mean, name=f'EMA_{n}')


In [None]:
# conversion formula for price momentum

def mom(df, n):
    """Momentum: the change of each value relative to the value ``n`` rows earlier.

    Parameters
    ----------
    df : pandas.Series
        Series of values (the notebook passes the ``'Close'`` column).
    n : int
        Number of rows to look back.

    Returns
    -------
    pandas.Series
        ``df.diff(n)`` named ``'Momentum_<n>'``; the first ``n`` rows are NaN.
    """
    change = df.diff(periods=n)
    return pd.Series(change, name=f'Momentum_{n}')


In [None]:
# conversion formula for relative strength index (RSI)
# a momentum indicator that measures the magnitude of recent price changes to evaluate overbought or oversold conditions in the price of a stock or other asset. Ranging from [0, 100].
# RSI near or above 70 -> asset is commonly deemed overbought
# RSI near or below 30 -> asset is commonly deemed oversold / undervalued

def rsi(df, period):
    """Relative Strength Index of a price series.

    Gains and losses are taken from consecutive differences. Both averages are
    seeded with the simple mean of the first ``period`` differences and then
    smoothed with an exponentially weighted mean (``com=period-1``,
    ``adjust=False``).

    Parameters
    ----------
    df : pandas.Series
        Price series (the notebook passes the ``'Close'`` column).
    period : int
        Look-back length used for the seed average and the smoothing.

    Returns
    -------
    pandas.Series
        RSI values in [0, 100]. The result starts at the ``period``-th row of
        ``df``: one row is lost to differencing and ``period - 1`` rows to
        seeding, so assigning it back to a frame leaves those rows NaN.

    Raises
    ------
    IndexError
        If ``df`` yields fewer than ``period`` differences.
    """
    delta = df.diff().dropna()
    gains = delta.clip(lower=0)       # upward moves, 0 elsewhere
    losses = (-delta).clip(lower=0)   # downward moves as positive numbers, 0 elsewhere

    seed = period - 1
    # the first smoothed value is the simple average of the first `period` gains / losses
    gains.iloc[seed] = gains.iloc[:period].mean()
    losses.iloc[seed] = losses.iloc[:period].mean()

    # Trim BOTH series to start at the seed. The original dropped only the
    # losses (the result of the drop on the gains was discarded), which left
    # the two averages misaligned and seeded differently.
    gains = gains.iloc[seed:]
    losses = losses.iloc[seed:]

    rs = gains.ewm(com=period-1, adjust=False).mean() / losses.ewm(com=period-1, adjust=False).mean()
    return 100 - 100 / (1 + rs)
    

In [None]:
# conversion formula for stochastic oscillators - raw %K or its 3-period smoothing (%D)
# a stochastic oscillator is a momentum indicator comparing a particular closing price of a security to a range of its prices over a period of time (%K/%D)

def sto(close, low, high, n,id): 
    """Stochastic oscillator over an ``n``-row window.

    Parameters
    ----------
    close, low, high : pandas.Series
        Close, low and high prices sharing the same index.
    n : int
        Window length for the rolling lowest low and highest high.
    id : int
        ``0`` returns the raw %K line; any other value returns %K smoothed
        with a 3-row rolling mean (%D).

    Returns
    -------
    pandas.Series
        Oscillator values scaled to 0-100. Rows without a full window are NaN;
        a window whose highest high equals its lowest low divides by zero and
        yields NaN/inf.
    """
    lowest_low = low.rolling(n).min()
    highest_high = high.rolling(n).max()
    stok = ((close - lowest_low) / (highest_high - lowest_low)) * 100

    # compare by value: `id is 0` tests object identity, which raises a
    # SyntaxWarning and is False for zero-valued numpy integers
    if id == 0:
        return stok
    return stok.rolling(3).mean()
    

In [None]:
from plotly.subplots import make_subplots

In [None]:
# visualize overall asset price history during training data period and the associated buy/sell signals
    
# plot n vertical subplots
def plot_vsubplots(ldf,
                   lst,
                   title='',
                   nplots=None,
                   lw_id=None,
                   size=[400,1000]):
    """Plot each column in ``lst`` as a line in its own vertically stacked subplot.

    Parameters
    ----------
    ldf : pandas.DataFrame
        Data to plot; its index is used as the shared x axis.
    lst : list of str
        Column names, one per subplot row (top to bottom).
    title : str, optional
        Figure title.
    nplots : int, optional
        Number of subplot rows. Defaults to ``len(lst)`` when omitted.
    lw_id : list of float, optional
        Line width for each column in ``lst``. Defaults to 2.0 for every
        trace, the same width ``plot_line`` uses.
    size : list of int, optional
        Figure size as ``[height, width]``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # derive sensible defaults instead of failing on the default arguments
    if nplots is None:
        nplots = len(lst)
    if lw_id is None:
        lw_id = [2.0] * len(lst)

    fig = make_subplots(rows=nplots,
                        shared_xaxes=True)

    for pos, column in enumerate(lst):
        fig.add_trace(go.Scatter(x=ldf.index,
                                 y=ldf[column], 
                                 mode='lines',
                                 name=column,
                                 line=dict(width=lw_id[pos])), 
                      row=pos+1,   # plotly subplot rows are 1-based
                      col=1) 

    fig.update_layout(height=size[0],
                      width=size[1],
                      template='plotly_white',
                      title=title,
                      margin=dict(l=50,
                                  r=80,
                                  t=50,
                                  b=40))
    fig.show()

In [None]:
import plotly.graph_objects as go

In [None]:
# customized function to plot trends across time with a line plot
def plot_line(ldf, 
              lst, 
              title='',
              sec_id=None,
              size=[350,1000]):
    
    """
    Function to plot trends across time with a line plot.
    
        Parameters:

        (1) ldf : dataframe 
                The DataFrame containing the data to plot; its index is the x axis.

        (2) lst : list of str
                A list of column names to plot, one trace per column.

        (3) title : str, optional 
                The title of the plot - default is an empty string.

        (4) sec_id : list of bool, optional 
                One flag per column in lst; True draws that column against the
                secondary y-axis, False against the primary one.
                Must be the same length as lst - default is None (single y-axis).

        (5) size : list of int, optional 
                The size of the plot as [height, width] - default is [350, 1000].

        Returns:

        None - the figure is displayed with fig.show().
            
    """
        
    # a figure with a secondary y-axis is only needed when sec_id is provided
    use_secondary = sec_id is not None
    if use_secondary:
        fig = make_subplots(specs=[[{"secondary_y": True}]])
    else:
        fig = go.Figure()

    # one trace per column; a single-column list takes the same path, so sec_id
    # is honoured for it too (it used to be ignored when lst had one element)
    for pos, column in enumerate(lst):
        trace = go.Scatter(x=ldf.index, 
                           y=ldf[column], 
                           mode='lines', 
                           name=column, 
                           line=dict(width=2.0))
        if use_secondary:
            fig.add_trace(trace, secondary_y=sec_id[pos])
        else:
            fig.add_trace(trace)

    # update the layout of the figure with the specific size and title
    fig.update_layout(height=size[0],
                      width=size[1],
                      template='plotly_white',
                      title=title,
                      margin=dict(l=50,
                                  r=80,
                                  t=50,
                                  b=40))
    
    # display the plot  
    fig.show()

In [None]:
# customised function to create all technical indicators using the conversion formulas defined in earlier cells
# with the option to plot the training data (technical indicators over time)

# window of the index shown in the indicator plots
plot_period = slice('2019-7-7 0:00','2019-7-7 8:00')  

def technical_indicators(ldf, tr_id = True):
    """Add technical-indicator feature columns to ``ldf`` in place.

    Columns added: ``MA21/MA63/MA252``, ``EMA10/EMA30/EMA200``,
    ``MOM10/MOM30``, ``RSI10/RSI30/RSI200``, ``%K10/%K30/%K200`` and
    ``%D10/%D30/%D200``. They are computed from the ``'Close'``, ``'Low'``
    and ``'High'`` columns with ``ma``, ``exp_ma``, ``mom``, ``rsi`` and ``sto``.

    Parameters
    ----------
    ldf : pandas.DataFrame
        Frame with ``'Close'``, ``'Low'`` and ``'High'`` columns. It is
        modified directly; nothing is returned.
    tr_id : bool, optional
        When true, each indicator group is also plotted with ``plot_line``
        over the module-level ``plot_period`` slice.

    Returns
    -------
    None
    """
    # --- simple moving averages ---
    # NOTE(review): the column suffixes (21/63/252) do not match the window
    # lengths actually used (10/30/200). Names are kept because later cells
    # and the saved csv files refer to them.
    ldf['MA21'] = ma(ldf,10)
    ldf['MA63'] = ma(ldf, 30)
    ldf['MA252'] = ma(ldf, 200)
    lst_MA = ['MA21','MA63','MA252']

    # --- exponentially weighted moving averages ---
    ldf['EMA10'] = exp_ma(ldf, 10)
    ldf['EMA30'] = exp_ma(ldf, 30)
    ldf['EMA200'] = exp_ma(ldf, 200)
    lst_EMA = ['EMA10','EMA30','EMA200']

    # --- momentum ---
    ldf['MOM10'] = mom(ldf['Close'], 10)
    ldf['MOM30'] = mom(ldf['Close'], 30)
    lst_MOM = ['MOM10','MOM30']

    # --- relative strength index / RSI ---
    ldf['RSI10'] = rsi(ldf['Close'], 10)
    ldf['RSI30'] = rsi(ldf['Close'], 30)
    ldf['RSI200'] = rsi(ldf['Close'], 200)
    lst_RSI = ['RSI10','RSI30','RSI200']

    # --- stochastic oscillator, raw %K line (id=0) ---
    # NOTE(review): windows 5/10/20 do not match the 10/30/200 suffixes used
    # in the column names (the %D columns below do use 10/30/200). Confirm
    # which was intended; values are left unchanged here.
    ldf['%K10'] = sto(ldf['Close'], ldf['Low'], ldf['High'],5,0)
    ldf['%K30'] = sto(ldf['Close'], ldf['Low'], ldf['High'],10,0)
    ldf['%K200'] = sto(ldf['Close'], ldf['Low'], ldf['High'], 20,0)
    lst_pK = ['%K10','%K30','%K200']

    # --- stochastic oscillator, 3-period smoothed %D line (id=1) ---
    ldf['%D10'] = sto(ldf['Close'], ldf['Low'], ldf['High'], 10,1)
    ldf['%D30'] = sto(ldf['Close'], ldf['Low'], ldf['High'], 30,1)
    ldf['%D200'] = sto(ldf['Close'], ldf['Low'], ldf['High'], 200,1)
    lst_pD = ['%D10','%D30','%D200']
    
    # plot dataset; titles state the windows that are actually computed above
    if(tr_id):
        plot_line(ldf.loc[plot_period,lst_MA],lst_MA,title='Moving Average (window=10,30,200)')
        plot_line(ldf.loc[plot_period,lst_EMA],lst_EMA,title='Exponential Moving Average (window=10,30,200)')
        plot_line(ldf.loc[plot_period,lst_MOM],lst_MOM,title='Momentum')
        plot_line(ldf.loc[plot_period,lst_RSI],lst_RSI,title='Relative Strength Index')
        plot_line(ldf.loc[plot_period,lst_pK],lst_pK,title='Stochastic Oscillators %K (fast, window=5,10,20)')
        plot_line(ldf.loc[plot_period,lst_pD],lst_pD,title='Stochastic Oscillators %D (slow, window=10,30,200)')

In [None]:
# technical_indicators assigns the new columns directly onto the frame it receives,
# so the frames are updated without any reassignment here
technical_indicators(train_df2) # add technical features to training set (tr_id defaults to True, so the trends are plotted)
technical_indicators(test_df2,tr_id=False) # add technical features to test set, no need to plot trends in test dataset

In [None]:
# ensure that technical indicators have been added to the training / test dataset as additional features
train_df2.columns

In [None]:
test_df2.columns

In [None]:
# original datasets without technical indicators added as additional features
train_df1.columns

In [None]:
test_df1.columns

#### 3.3 Updated Baseline Model Features
Plot an updated correlation matrix to visualize how strongly each feature, including the engineered technical indicators, is linearly correlated with the target variable (`Signal`).

In [None]:
# updated feature linear correlation matrix
corrMat(train_df2,'Signal',figsize=(20,0.5))

* the engineered features are more linearly correlated with the target variable (`Signal`) than the original baseline features
* the original baseline features may not be very useful for predicting the signals
* the linear correlation between the engineered features and the target variable is significant without being so high that it would raise the risk of overfitting

#### 3.4 Feature Selection
- Features with a high linear correlation to the target variable have high predictive power and are therefore included in the training of the predictive model
- Features with a low linear correlation to the target variable are removed, to avoid introducing noise and unnecessary dimensionality, which lead to poor generalization and lower model efficiency
<br>
(1) Inclusion of redundant features will cause model to capture noise rather than true patterns
<br>
(2) Curse of Dimensionality - increased dimensionality reduces model efficiency, more computational resources needed for high dimensional feature spaces

In [None]:
# remove original features judged to have low predictive power / feature importance
# (low linear correlation to the target, or redundant based on domain knowledge),
# since redundant features add noise and dimensionality
# original features: 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)', 'Volume_(Currency)', 'Weighted_Price', 'SMA', 'LMA'
# the drop list below leaves 'Close', 'Volume_(BTC)' and 'Weighted_Price' in place
train_df2 = train_df2.drop(columns=['Open','High', 'Low', 'Volume_(Currency)', 'SMA', 'LMA'])
train_df2.info()

In [None]:
# remove the same six original features from the test set so both frames keep identical columns
test_df2 = test_df2.drop(columns=['Open','High','Low','Volume_(Currency)', 'SMA', 'LMA'])
test_df2.info()

In [None]:
# customized function for plotting the percentage of missing values

def bar_plot(x,
             y, 
             palette_len,
             title = 'Missing Values (%)',
             xlim = None,
             ylim = None,
             xticklabels = None,
             yticklabels = None,
             xlabel = None,
             ylabel = None,
             figsize = (10,4),
             axis_grid ='y'):
    """Draw a seaborn bar plot with black outlines and a dashed grid.

    Parameters
    ----------
    x, y : array-like
        Values passed straight to ``sns.barplot`` as ``x`` and ``y``.
    palette_len : any
        Unused; kept so existing calls that pass it keep working.
    title : str, optional
        Figure title.
    xlim, ylim : tuple or None, optional
        Axis limits passed to ``set_xlim`` / ``set_ylim``.
    xticklabels, yticklabels : sequence or None, optional
        Replacement tick labels. When ``None`` the automatic tick labels are
        kept. Labels are applied to whatever tick positions the axis has, so
        they should match those ticks in number and meaning.
    xlabel, ylabel : str or None, optional
        Axis labels.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.
    axis_grid : {'x', 'y', 'both'}, optional
        Axis along which the dashed grid is drawn.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cmap = sns.color_palette("plasma")
    fig, ax = plt.subplots(figsize = figsize)
    ax.set_title(title, size=15, fontweight='bold')
    
    # black frame on three sides; the left spine is hidden
    for side in ['top', 'right', 'bottom', 'left']:
        ax.spines[side].set_color('black')
        ax.spines[side].set_visible(side != 'left')
    
    sns.barplot(x = x,
                y = y,
                palette = cmap, 
                ax = ax,
                edgecolor = 'black')
    
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # only override tick labels when the caller supplied some; passing the
    # default None on to set_*ticklabels fails because it is not iterable
    if xticklabels is not None:
        ax.set_xticklabels(xticklabels)
    if yticklabels is not None:
        ax.set_yticklabels(yticklabels)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    ax.grid(axis = axis_grid, ls = '--', alpha = 0.9)
    
    # `plt.show` without parentheses only referenced the function; call it
    plt.show()
    

In [None]:
# check updated training dataset for missing values
# percentage of missing values per column (nulls / row count * 100), sorted ascending
nan_values_tr2 = ((train_df2.isnull().sum() / len(train_df2)) * 100).sort_values(ascending=True)

# horizontal bars: x is the missing percentage, y is the column name
# NOTE(review): xticklabels supplies ten labels (0-9) while xlim is (0, 1); the labels
# may not correspond to the real tick values -- confirm the x axis reads correctly
# NOTE(review): palette_len is not used inside bar_plot
bar_plot(x = nan_values_tr2, 
         y = nan_values_tr2.index, 
         palette_len = nan_values_tr2.index, 
         xlim = (0,1),
         xticklabels = range(0,10),
         yticklabels = nan_values_tr2.index,
         figsize = (10,5), 
         axis_grid = 'x')

In [None]:
# check updated testing dataset for missing values
# percentage of missing values per column (nulls / row count * 100), sorted ascending
nan_values_te2 = ((test_df2.isnull().sum() / len(test_df2)) * 100).sort_values(ascending=True)

# horizontal bars: x is the missing percentage, y is the column name
# NOTE(review): xticklabels supplies ten labels (0-9) while xlim is (0, 1); the labels
# may not correspond to the real tick values -- confirm the x axis reads correctly
# NOTE(review): palette_len is not used inside bar_plot
bar_plot(x = nan_values_te2,
         y = nan_values_te2.index,
         palette_len = nan_values_te2.index, 
         xlim = (0,1),
         xticklabels = range(0,10),
         yticklabels = nan_values_te2.index,
         figsize = (10,5), 
         axis_grid = 'x')

There are missing values in both the training and testing datasets. The affected records are removed, since they make up only an insignificant percentage of the total count.

In [None]:
# remove every row of train_df2 that contains at least one missing value
# (dropna defaults: axis=0 drops rows, and a new frame is returned)
train_df2 = train_df2.dropna()
train_df2.info()

In [None]:
train_df2.isnull().sum()


In [None]:
# remove every row of test_df2 that contains at least one missing value
# (dropna defaults: axis=0 drops rows, and a new frame is returned)
test_df2 = test_df2.dropna()
test_df2.info()

In [None]:
test_df2.isnull().sum()


In [None]:
# save transformed train_df2 for model generation and evaluation
train_df2.to_csv('./data/train_df2.csv', index=True)

In [None]:
# save transformed test_df2 for model generation and evaluation
test_df2.to_csv('./data/test_df2.csv', index=True)