In [1]:
%load_ext autoreload
%autoreload 2

import json
import jsonmerge
import os
import sys
import numpy as np
import scipy as sp
import datetime
import itertools
import pandas as pd
import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.metrics
import time
import requests

#plotting import & settings
import plotly as py
import plotly.offline as pyo
from plotly.graph_objs import *
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import colorlover as cl
pyo.init_notebook_mode(connected=True)
%matplotlib inline

#import package scripts
module_path = os.path.abspath(os.path.join('..'))
sys.path.append('C:\\Code\\general')
from plotly_plot_tools import *
from misc import *

### Load coin data from Cryptocompare API
 (Mostly taken from Sohrab's code)

In [2]:
df_dict = {}

coins = ['BTC', 'ETH', 'XRP', 'LTC']

exchanges = {'CCCAGG'}  #CCCAGG = cryptocompare's aggregated data

ref = 'USD'

n_sample = 2000  # number of rows (hours) we want
t_now = int(np.round(time.time()))
t_start = t_now - 365*24*3600 + n_sample*3600   # time we want to start from


df = []
for cn in coins:
    for ex in exchanges:
        cnt = 0
        t = t_now
        while t>=t_start:

            url = 'https://min-api.cryptocompare.com/data/histohour?fsym={:s}&tsym={:s}&limit={:d}&aggregate=0&e={:s}&toTs={:d}'.format(\
                                            cn, ref, n_sample, ex, t)
            response = requests.post(url)
            s = response.json()

            if len(s['Data'])==0:
                print('s')
                break
             
            print('coin: ' + cn + ' cnt:' + str(cnt) + ' t:' + str(t)) # for debugging
            if cnt==0:
                df = pd.DataFrame.from_dict(s['Data'])
            else:
                df_tmp = pd.DataFrame.from_dict(s['Data'])  # convert to pandas 
                df = pd.concat([df, df_tmp], axis=0)        # add newly loaded values
                        
            t -= n_sample*3600
            cnt += 1
            
        #df.sort_values(by='time', axis=0, ascending=True, inplace=True)
        df=df.set_index('time') 
        df.sort_index(inplace=True)
        df_dict[cn] = df
        

coin: BTC cnt:0 t:1515505839
coin: BTC cnt:1 t:1508305839
coin: BTC cnt:2 t:1501105839
coin: BTC cnt:3 t:1493905839
coin: ETH cnt:0 t:1515505839
coin: ETH cnt:1 t:1508305839
coin: ETH cnt:2 t:1501105839
coin: ETH cnt:3 t:1493905839
coin: XRP cnt:0 t:1515505839
coin: XRP cnt:1 t:1508305839
coin: XRP cnt:2 t:1501105839
coin: XRP cnt:3 t:1493905839
coin: LTC cnt:0 t:1515505839
coin: LTC cnt:1 t:1508305839
coin: LTC cnt:2 t:1501105839
coin: LTC cnt:3 t:1493905839


### Preprocess data & add features

In [3]:
def feature_eng(df):
    try:
        df.drop(['close', 'high', 'low'], axis=1, inplace=True)   #assuming for now high/low not necc.
    except:
        print('columns already removed')

    df['pct_chng'] = np.log(df.open.pct_change()+1)
    df['pct_chng'].fillna(0, inplace=True)
    df['48h_std'] = df.pct_chng.rolling(48).std()
    df['48h_mean'] = df.pct_chng.rolling(48).mean()
    df = df[['open', 'pct_chng', '48h_std', '48h_mean', 'volumeto', 'volumefrom']]
    return df

# reformat & do feature eng. on each individual coin df
df_dict = {c:feature_eng(df_dict[c]) for c in coins}


#df['formtime'] = pd.to_datetime(df.index,unit='s') # for intuition, adds formatted datetime
df_dict['ETH'].head(6)

Unnamed: 0_level_0,open,pct_chng,48h_std,48h_mean,volumeto,volumefrom
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1486702800,11.0,0.0,,,24019.43,2192.62
1486706400,11.0,0.0,,,68823.65,6312.91
1486710000,10.93,-0.006384,,,37842.44,3495.93
1486713600,10.8,-0.011965,,,81628.87,7413.6
1486717200,11.0,0.018349,,,106269.94,9797.39
1486720800,10.84,-0.014652,,,38860.52,3547.69


### Combine all coins into 1 DF

In [4]:
def combineIndivCoinDFs(df_dict):

    ## Combine all individual coin dfs into one large DF for ML
    coins = list(df_dict.keys())
    Nc = len(coins)
    DF = df_dict[coins[0]].rename(index=str, columns={c:coins[0]+c for c in df_dict[coins[0]].columns})
    for i in range(1,Nc):
        DF = DF.join(df_dict[coins[i]].rename(index=str, columns={c:coins[i]+c for c in df_dict[coins[i]].columns}), how='right', ) 
    return DF

X = combineIndivCoinDFs(df_dict)
X.head(10)

Unnamed: 0_level_0,BTCopen,BTCpct_chng,BTC48h_std,BTC48h_mean,BTCvolumeto,BTCvolumefrom,ETHopen,ETHpct_chng,ETH48h_std,ETH48h_mean,...,XRP48h_std,XRP48h_mean,XRPvolumeto,XRPvolumefrom,LTCopen,LTCpct_chng,LTC48h_std,LTC48h_mean,LTCvolumeto,LTCvolumefrom
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1486702800,979.12,0.0,,,1469980.43,1498.02,11.0,0.0,,,...,,,1212.7,193084.79,3.68,0.0,,,7630.74,2070.76
1486706400,982.13,0.003069,,,3372076.2,3485.69,11.0,0.0,,,...,,,656.88,105060.59,3.69,0.002714,,,33710.97,9183.74
1486710000,960.96,-0.021791,,,2230569.69,2307.95,10.93,-0.006384,,,...,,,541.06,86015.01,3.67,-0.005435,,,28656.57,7811.66
1486713600,967.76,0.007051,,,3313643.79,3446.87,10.8,-0.011965,,,...,,,1771.71,281597.4,3.64,-0.008208,,,100742.27,27525.42
1486717200,952.03,-0.016388,,,4486824.68,4699.41,11.0,0.018349,,,...,,,3288.86,522491.99,3.66,0.005479,,,40693.17,11103.72
1486720800,956.54,0.004726,,,2156146.44,2244.72,10.84,-0.014652,,,...,,,1147.13,184293.56,3.66,0.0,,,20089.89,5442.97
1486724400,959.94,0.003548,,,1922342.21,1994.15,11.05,0.019187,,,...,,,2179.13,345929.75,3.68,0.00545,,,33501.02,9021.7
1486728000,965.65,0.005931,,,1923772.98,1988.13,10.98,-0.006355,,,...,,,1197.9,190544.19,3.73,0.013495,,,24773.7,6697.03
1486731600,964.78,-0.000901,,,1422645.77,1471.27,10.99,0.00091,,,...,,,979.12,156824.47,3.68,-0.013495,,,20526.5,5574.67
1486735200,963.73,-0.001089,,,2127100.47,2203.55,11.09,0.009058,,,...,,,164.99,26483.82,3.7,0.00542,,,20426.6,5529.18



### Visualize Data

In [5]:
coin_to_vis = 'ETH'

## eth price, vol over time (candle graph)?
formtime = pd.to_datetime(df_dict[coin_to_vis].index,unit='s') # for intuition, adds formatted datetime

traces=[]
traces += [go.Scatter(x=formtime, y=df_dict['ETH'].open, name=coin_to_vis + ' hourly open')]
layout = go.Layout(
        title=coin_to_vis + ' activity over time',
        xaxis={'title': 'Time'},
        yaxis={'title': 'Price'},
)
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)

## eth auto corr
maxlag = 120
acorr= autocorrelation(df_dict['ETH'].pct_chng, maxlag)

traces=[]
traces += [go.Scatter(y=acorr, name=coin_to_vis + ' hourly open')]
layout = go.Layout(
        title=coin_to_vis + ' % change autocorrelation',
        xaxis={'title': 'Lag (hours)'},
        yaxis={'title': 'Autocorr.'},
)
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)

In [57]:
## x[t] vs x[t-1] plot
corrPlot(df_dict[coin_to_vis].pct_chng,                 # 1D data vector or list of 1D dsata vectors
             df_dict[coin_to_vis].pct_chng.shift(-1),                 # 1D data vector or list of 1D dsata vectors
             names=['x[t]','x[t-1]'],        # names of x, y (ex:['A', 'B']
             maxdata=200000,      # max # of points to plot above histogram (if too high, it will be slow)
             addCorr=True,      # whether to add correlation statistics into plot (R2, spearmanR2, Pvals, & y=mx+b)
             addCorrLine=True,     # whether to plot correlation line
             addXYline=False,      # whether to plot y=x line
             plot=True,         # if false, just returns plotly json object
             title='Lag 1 Correlation', # title of plot
             xlbl='x[t]',           #
             ylbl='x[t-1]')

In [56]:
coin_chng = pd.DataFrame()
for c in coins:
    coin_chng[c] = df_dict[c]['pct_chng']
scattermatrix(coin_chng, title='Coin Hourly % Change Correlations')

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]    [ (1,3) x3,y3 ]    [ (1,4) x4,y4 ]  
[ (2,1) x5,y5 ]    [ (2,2) x6,y6 ]    [ (2,3) x7,y7 ]    [ (2,4) x8,y8 ]  
[ (3,1) x9,y9 ]    [ (3,2) x10,y10 ]  [ (3,3) x11,y11 ]  [ (3,4) x12,y12 ]
[ (4,1) x13,y13 ]  [ (4,2) x14,y14 ]  [ (4,3) x15,y15 ]  [ (4,4) x16,y16 ]



### Random Forest Regression to predict coin pct_change

In [58]:
Xf = copy.deepcopy(X)
features = list(Xf)

# use imputation to account for missing values
imp = sk.preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(Xf)
Xf = imp.transform(Xf)

#Scale data to mean=0, std=1
Xf=sk.preprocessing.scale(Xf)

Xf = pd.DataFrame(Xf, index=X.index, columns=features)

In [59]:
coin_to_predict = 'ETH'
y = X[coin_to_predict + 'pct_chng'].shift(-1)
Xf = Xf.iloc[0:-1,:]
y = y.iloc[0:-1]

In [60]:
# split train/test set
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(Xf, y, test_size=0.2, random_state=1)

In [61]:
# Fit random forest
regr = sk.ensemble.RandomForestRegressor(n_estimators=300, oob_score=True, max_depth=None, random_state=1)
regr.fit(X_train, y_train)

# get train/test predictions
predicted_train = regr.predict(X_train)
predicted_test = regr.predict(X_test)

In [62]:
# Print metrics
r2_trn = sk.metrics.r2_score(y_train, predicted_train)
spearman_trn = sp.stats.spearmanr(y_train, predicted_train)
pearson_trn = sp.stats.pearsonr(y_train, predicted_train)
r2_tst = sk.metrics.r2_score(y_test, predicted_test)
spearman_tst = sp.stats.spearmanr(y_test, predicted_test)
pearson_tst = sp.stats.pearsonr(y_test, predicted_test)
print(f'Out-of-bag R-2 score estimate: {regr.oob_score_:>5.3}')
print(f'Train data R-2 score: {r2_trn:>5.3}')
print(f'Train data Spearman correlation: {spearman_trn[0]:.3}')
print(f'Train data Pearson correlation: {pearson_trn[0]:.3}')
print(f'Test data R-2 score: {r2_tst:>5.3}')
print(f'Test data Spearman correlation: {spearman_tst[0]:.3}')
print(f'Test data Pearson correlation: {pearson_tst[0]:.3}')

Out-of-bag R-2 score estimate: -0.00655
Train data R-2 score: 0.865
Train data Spearman correlation: 0.979
Train data Pearson correlation: 0.98
Test data R-2 score: 0.128
Test data Spearman correlation: 0.182
Test data Pearson correlation: 0.364


### Visualize Accuracy

In [63]:
corrPlot([np.array(y_train),np.array(y_test)],                 # 1D data vector or list of 1D dsata vectors
             [predicted_train,predicted_test],                 # 1D data vector or list of 1D dsata vectors
             names=['Train','Test'],        # names of x, y (ex:['A', 'B']
             maxdata=2010,      # max # of points to plot above histogram (if too high, it will be slow)
             addCorr=True,      # whether to add correlation statistics into plot (R2, spearmanR2, Pvals, & y=mx+b)
             addCorrLine=0,
             addXYline=1,
             plot=True,         # if false, just returns plotly json object
             title='Predicted vs Actual Audience', # title of plot
             xlbl='Actual audience',           #
             ylbl='Predicted audience')          #

show_obs_bar = go.Bar(
    x=list(X),
    y=regr.feature_importances_,
)
layout = go.Layout(
        title='Regression feature Importance',
        xaxis={'title': 'Feature'},
        yaxis={'title': 'Importance'},
        hovermode='closest',
        height =400,
        width = 600,
)
fig = go.Figure(data=[show_obs_bar], layout=layout)
pyo.iplot(fig)