# Visualise and compare prices of any 2 assets in a day-by-day fashion

In [None]:
import numpy as np
import pandas as pd 
import dask.dataframe as dd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Configure seaborn in a single call. `sns.set` re-applies its default style
# ("darkgrid") whenever `style` is omitted, so calling it after
# `sns.set_style('whitegrid')` silently discarded the white grid.
sns.set(style='whitegrid', font_scale=1.4)

from datetime import datetime

from scipy.fft import fft, fftfreq
from sklearn import preprocessing
from scipy import signal

import gc 

Since we care about short-term price movements, let's study the properties of the time series in short, logical time frames, such as day by day.

In [None]:
# Folder holding the competition input files.
INPUT_DIR = '../input/g-research-crypto-forecasting'

# Training data file.
DATA = INPUT_DIR + '/train.csv'

# File the asset lookup table is built from.
# NOTE(review): this resolves to exactly the same file as DATA; a dedicated
# asset-details file was presumably intended -- confirm before changing, since
# the code reading it may rely on the current contents.
DATA_ASSETS = INPUT_DIR + '/train.csv'

In [None]:
# # # # # # # # # Engineering Functions # # # # # # # # # # # 
def downcast_floats(df):
    """Shrink the float columns of ``df`` in place to the narrowest fitting float dtype.

    A float column is cast to float16 when both its minimum and maximum lie
    strictly inside the float16 range, otherwise to float32 when they fit
    there, and is left untouched otherwise (this includes all-NaN and infinite
    columns, whose range checks fail).  Only the value *range* is checked:
    narrowing can still round values (float16 keeps roughly 3 significant
    decimal digits).

    Prints the percentage of memory saved and the resulting size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    bytes_per_gb = 1073741824
    before = df.memory_usage().sum() / bytes_per_gb

    # Index with the column label itself (not str(label)) so that non-string
    # column labels work too.
    for col in df.select_dtypes(include=['float']).columns:
        col_min, col_max = df[col].min(), df[col].max()
        # Try the narrowest dtype first and stop at the first one that fits.
        for target in ('float16', 'float32'):
            limits = np.finfo(target)
            if limits.min < col_min and col_max < limits.max:
                df[col] = df[col].astype(target)
                break

    after = df.memory_usage().sum() / bytes_per_gb
    # Report the share that was SAVED; after/before would be the share that remains.
    saved_pct = (1 - after / before) * 100 if before else 0.0
    print('Memory usage reduced by ' + str(np.round(saved_pct, 1)) + '% to ' + str(after) + ' Gb')
    return df


# # # # # # # # # Data Selection Functions # # # # # # # # # # # 
def select_assets(a1, a2):
    """Extract the trading rows of two assets, each indexed by timestamp.

    Reads two module-level frames: ``asset_details`` (indexed by ticker, with an
    ``Asset_ID`` column) and ``df`` (with ``Asset_ID`` and ``timestamp``
    columns).  For each ticker it prints the number of rows and of missing
    values found.

    Parameters
    ----------
    a1, a2 : str
        Ticker labels present in the index of ``asset_details``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(asset1, asset2)``: independent copies of the matching rows of ``df``,
        indexed by ``timestamp`` and without the ``Asset_ID`` column.

    Raises
    ------
    KeyError
        If a ticker is not in the index of ``asset_details``.
    """
    def _extract(ticker):
        ids = asset_details.loc[ticker, 'Asset_ID']
        # `.loc` gives a scalar when the ticker occurs once in the index and a
        # Series when it is repeated; take the first id explicitly by position
        # instead of relying on `[0]`, which fails on a scalar.
        asset_id = ids.iloc[0] if isinstance(ids, pd.Series) else ids

        asset = df[df['Asset_ID'] == asset_id].set_index("timestamp").copy()
        print(ticker + ' contains ' + str(asset.shape[0]) + ' entries and '
              + str(asset.isna().sum().sum()) + ' missing values')
        return asset.drop('Asset_ID', axis=1)

    return _extract(a1), _extract(a2)


# match row for row two dataframes
def clean_match(df1, df2):
    """Reduce two frames to the complete rows whose index labels they share.

    Rows containing any missing value are dropped from each frame first; both
    frames are then restricted to the intersection of the remaining index
    labels.  The number of shared labels is printed.

    Returns the pair ``(df1, df2)`` after filtering; the inputs are not modified.
    """
    complete1, complete2 = df1.dropna(), df2.dropna()

    # Labels that survive in both frames.
    shared = complete1.index.intersection(complete2.index)
    matched1, matched2 = complete1.loc[shared], complete2.loc[shared]

    print(f'assets matched: intersection of {len(shared)} common rows')
    return matched1, matched2


# create a df with the average volume on each day
# accepts a df whose rows are trading data (needs 'one_day' and 'Volume' columns)
def daily_vol(df, verbose=False):
    """Compute the mean ``Volume`` of every distinct day in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trading rows with a ``one_day`` day-label column and a ``Volume`` column.
    verbose : bool, default False
        When true, also draw a 100-bin histogram of the daily means.

    Returns
    -------
    pandas.DataFrame
        A single ``day_volume`` column indexed by day label (unnamed index),
        with days in order of first appearance in ``df``.
    """
    # One grouped pass instead of re-filtering the whole frame once per day;
    # sort=False keeps the days in order of first appearance.
    day_means = df.groupby('one_day', sort=False)['Volume'].mean()

    df_dvol = day_means.to_frame(name='day_volume')
    df_dvol.index.name = None  # the result index carries no name

    if verbose:
        df_dvol.hist(bins=100)

    return df_dvol


# return the days on which volume was low and the days on which it was high
# accepts the daily volume df as input
def split_days_volume(df):
    """Split days into a low-volume and a high-volume group around the median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``day_volume`` column, indexed by day.

    Returns
    -------
    tuple of pandas.Index
        ``(low, high)``: labels of the days whose volume is at most the median,
        and labels of the days whose volume is above it.
    """
    volumes = df['day_volume']
    # Take the median of the volume column itself so the threshold is a plain
    # scalar, whatever other columns the frame carries.
    criterion = volumes.median()
    return df[volumes <= criterion].index, df[volumes > criterion].index

# # # # # # # # # Date time Functions # # # # # # # # # # # 

# Fill gaps in timeseries by padding - NOT USED CURRENTLY
def fill_time(df, step=60):
    """Fill gaps in a regularly spaced, integer-indexed time series by padding.

    The index is expected to hold increasing integer timestamps.  If any two
    consecutive timestamps are not exactly ``step`` apart, the frame is
    re-indexed onto the regular grid from the first to the last timestamp and
    every inserted row copies the most recent earlier row.  Otherwise a message
    is printed and the frame is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by integer timestamps.
    step : int, default 60
        Expected spacing between consecutive timestamps.

    Returns
    -------
    pandas.DataFrame
        The padded frame, or ``df`` itself when no gap was found.
    """
    spacings = np.diff(df.index.to_numpy())
    # Compare every spacing with the expected step: a uniformly sparse series
    # (e.g. all rows 2 * step apart) has gaps too.
    if (spacings != step).any():
        df = df.reindex(range(df.index[0], df.index[-1] + step, step), method='pad')
    else:
        print('No gaps found in time series')
    return df

# convert unix to datetime data    
def unix2datetime(df):
    """Convert the index of ``df`` from Unix seconds to datetimes, in place.

    Returns the same ``df`` object with its index replaced.
    """
    as_datetimes = pd.to_datetime(df.index, unit='s')
    df.index = as_datetimes
    return df


def add_time_labels(df):
    """Add calendar label columns derived from the datetime index of ``df``.

    Adds, in place, the string columns ``year`` (4 digits), ``month``
    (abbreviated name), ``date`` (2-digit day of month), ``hour`` (2-digit,
    24h), ``Day_of_week`` (abbreviated name) and ``one_day``, the concatenation
    date + month + year that identifies a calendar day.

    Returns the same ``df`` object.
    """
    stamps = df.index
    label_formats = (
        ('year', '%Y'),
        ('month', '%b'),
        ('date', '%d'),
        ('hour', '%H'),
        ('Day_of_week', '%a'),
    )
    for column, fmt in label_formats:
        df[column] = stamps.strftime(fmt)

    # Single key per calendar day, e.g. day + month + year glued together.
    df['one_day'] = df['date'] + df['month'] + df['year']
    return df

# # # # # # # # # # # Feature engineering functions # # # # # # # # # # # 

def log_return(series, periods=1):
    """Return the difference of the natural log of ``series`` over ``periods`` rows.

    The first ``periods`` entries of the result are NaN.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``, row by row."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``, row by row."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def spread(df):
    """Return the row-wise range ``High - Low``."""
    highs, lows = df['High'], df['Low']
    return highs - lows

def mean_trade(df):
    """Return ``Volume`` divided by ``Count``, row by row."""
    volume, count = df['Volume'], df['Count']
    return volume / count

def log_change(series1, series2):
    """Return the natural log of the element-wise ratio ``series1 / series2``."""
    return np.log(series1 / series2)
                

In [None]:
# Load the training data and immediately shrink its float columns to save memory.
df = downcast_floats(pd.read_csv(DATA))

In [None]:

# Build the ticker -> Asset_ID lookup table. Only the Asset_ID column is used
# from this file, so only that column is read: DATA_ASSETS currently points at
# the same file as DATA, and loading every column would keep a second full copy
# of the training data in memory for the rest of the session.
asset_details = pd.read_csv(DATA_ASSETS, usecols=['Asset_ID'])
t = {0: 'BNB', 1: 'BTC', 2: 'BCH', 3: 'ADA', 4: 'DOGE', 5: 'EOS', 6: 'ETH', 7: 'ETC', 8: 'MIOTA', 9: 'LTC',
     10: 'MKR', 11: 'XMR', 12: 'XLM', 13: 'TRX'}
asset_details['ticker'] = asset_details['Asset_ID'].map(t)
asset_details.set_index('ticker', inplace=True)

# Pull out the two assets to compare, then free the full frame.
btc, eth = select_assets('BTC', 'ETH')
del df
gc.collect()

# Instead of filling in missing values, we choose to work on the intersection of
# common time series data between the two coins which are examined.
btc, eth = clean_match(btc, eth)
#btc = fill_time(btc)
#eth = fill_time(eth)


# Candle shadows for both assets.
btc['upperShadow'] = upper_shadow(btc)
btc['lowerShadow'] = lower_shadow(btc)

eth['upperShadow'] = upper_shadow(eth)
eth['lowerShadow'] = lower_shadow(eth)

In [None]:
# Switch both asset frames from a Unix-seconds index to a datetime index.
btc, eth = unix2datetime(btc), unix2datetime(eth)

In [None]:
# Attach the calendar label columns (including the per-day key 'one_day') to both assets.
btc, eth = add_time_labels(btc), add_time_labels(eth)

In [None]:
# Mean volume per day for each asset; verbose=True also draws each histogram.
btc_dvol, eth_dvol = daily_vol(btc, verbose=True), daily_vol(eth, verbose=True)

In [None]:
# split days by relative daily volume (high / low)  // not used for now
#dlvol, dhvol = split_days_volume(btc_dvol)

In [None]:
# Plot one day (for example the day at position 224) for the examined assets.
# btc_dvol and eth_dvol can be used interchangeably here since the two assets
# were matched earlier on.
day = btc_dvol.index[224]

asset1, asset2 = 'BTC', 'ETH'
df1, df2 = btc, eth

feature = 'Close'

# Rows of each asset that fall on the chosen day.
df_focus1 = df1.loc[df1['one_day'] == day]
df_focus2 = df2.loc[df2['one_day'] == day]

# Side-by-side frame with one column per asset, named <feature><ticker>.
col1, col2 = feature + asset1, feature + asset2
df_plot = pd.concat([df_focus1[feature], df_focus2[feature]], axis=1)
df_plot.columns = [col1, col2]

fig, ax = plt.subplots(figsize=(30, 15))  # figure size in inches

# Left y-axis: first asset, raw series (thin) and its one-hour rolling mean (thick).
sns.lineplot(data=df_plot[col1], color="g", linewidth=1.5, ax=ax)
sns.lineplot(data=df_plot[col1].rolling('h').mean(), color="g", linewidth=4.5, label=asset1, ax=ax)

# Right y-axis (sharing the x-axis): second asset, drawn the same way.
ax2 = plt.twinx()
sns.lineplot(data=df_plot[col2], color="b", linewidth=1.5, ax=ax2)
sns.lineplot(data=df_plot[col2].rolling('h').mean(), color="b", linewidth=4.5, label=asset2, ax=ax2)

