In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random

In [2]:
# Number of consecutive rows in each input sequence (used as the window size in preprocess_df).
SEQ_LEN = 60
# How many rows ahead to look when building the "future" price column
# (the sample output below shows rows 60 seconds apart).
FUTURE_PERIOD_PREDICT = 3
# Pair whose "<pair>_close" column is used to build the future price and the target.
RATIO_TO_PREDICT = "LTC-USD"

In [3]:
# Decide whether to buy (1) or not to buy (0)
def classify(current, future):
    """Return 1 ("buy") when the future price is strictly above the current one, else 0.

    Both arguments are converted with ``float()``, so numeric strings are accepted.
    A NaN on either side compares as "not greater" and therefore yields 0.
    """
    price_later = float(future)
    price_now = float(current)
    return 1 if price_later > price_now else 0

In [4]:
def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled (sequence, label) data.

    Steps:
      1. Discard rows whose ``future`` price is unknown, then drop the ``future`` column.
      2. Replace each feature column with its percent change and standardize it.
      3. Build sliding windows of ``SEQ_LEN`` consecutive rows; each window takes the
         ``target`` of its last row as its label.
      4. Balance the two classes by truncating the larger one, and shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``future`` column and a ``target`` column. ``target`` has to be
        the last column once ``future`` is removed, because the windowing code treats
        the final value of every row as the label. Every other column is treated as a
        numeric feature. The caller's frame is not modified.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``np.array`` of the windows (each ``SEQ_LEN`` rows by
        number-of-features), ``y`` is a plain list of the matching labels.

    Notes
    -----
    Shuffling uses the global ``random`` module state; seed it beforehand for
    reproducible output.
    """
    # A row without a known future price has no valid label: a NaN future compares as
    # "not greater" and was therefore labeled 0 upstream. Remove such rows before the
    # helper column is discarded. ``columns=`` is used because pandas 2.x no longer
    # accepts ``axis`` as a positional argument of ``drop``.
    df = df.dropna(subset=["future"]).drop(columns="future")

    for col in df.columns:
        if col == "target":
            continue
        # Work with relative changes rather than absolute levels. A zero followed by a
        # non-zero value (e.g. zero traded volume) gives +/-inf, which scale() rejects,
        # so turn those into NaN and let dropna remove them.
        df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
        df.dropna(inplace=True)
        # Standardize the column (zero mean, unit variance).
        df[col] = preprocessing.scale(df[col].values)

    # Safety net in case scaling produced any NaN.
    df.dropna(inplace=True)

    # Sliding window: the deque keeps only the most recent SEQ_LEN feature rows.
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    for row in df.values:
        prev_days.append(list(row[:-1]))  # everything except the trailing target
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), row[-1]])

    random.shuffle(sequential_data)

    # Split by class so both can be truncated to the same size.
    buys = [pair for pair in sequential_data if pair[1] == 1]
    sells = [pair for pair in sequential_data if pair[1] == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))

    # Recombine and shuffle so the model does not see all of one class first.
    sequential_data = buys[:lower] + sells[:lower]
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y

In [5]:
# Accumulator for the combined table. It starts empty so the loading loop can tell the
# first pair (assigned directly) apart from the later ones (joined onto it).
main_df = pd.DataFrame()

In [6]:
ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]

# Reset the accumulator inside this cell so the cell can be re-run on its own: joining
# onto an already populated main_df fails because the column names would overlap.
main_df = pd.DataFrame()

for ratio in ratios:
    # Each pair lives in its own header-less CSV file.
    dataset = f"crypto_data/{ratio}.csv"
    df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])

    # Prefix the columns we keep with the pair name so they stay distinct after the
    # join, and index by timestamp so the frames align on time.
    df = (
        df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        .set_index("time")
    )

    # Only the close price and the volume are used.
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]

    # The first pair seeds the table; the others are joined onto its index.
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)

In [7]:
# Look-ahead price: for every row, the close of RATIO_TO_PREDICT found
# FUTURE_PERIOD_PREDICT rows later (the last rows get NaN, as nothing follows them).
main_df["future"] = main_df[RATIO_TO_PREDICT + "_close"].shift(periods=-FUTURE_PERIOD_PREDICT)

print(main_df.loc[:, [RATIO_TO_PREDICT + "_close", "future"]].head())

            LTC-USD_close     future
time                                
1528968660      96.580002  96.500000
1528968720      96.660004  96.389999
1528968780      96.570000  96.519997
1528968840      96.500000  96.440002
1528968900      96.389999  96.470001


In [8]:
# Label every row: 1 when the look-ahead price is above the current close, otherwise 0.
main_df["target"] = [
    classify(current, future)
    for current, future in zip(main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"])
]

print(main_df.loc[:, [f"{RATIO_TO_PREDICT}_close", "future", "target"]].head())

            LTC-USD_close     future  target
time                                        
1528968660      96.580002  96.500000       0
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1


In [9]:
# Out-of-sample testing: find the timestamp at which the final 5% of the rows begins.
times = sorted(main_df.index.values)
last5pct = times[-int(len(times) * 0.05)]

print(last5pct)

1534922100


In [10]:
# Chronological split: rows at or after the threshold are held out for validation,
# everything before it stays as training data.
# NOTE: main_df is rebound to the training part here, so running this cell a second time
# without rebuilding main_df leaves validation_main_df empty.
validation_main_df = main_df.loc[main_df.index >= last5pct]
main_df = main_df.loc[main_df.index < last5pct]

In [11]:
# Build the sequence/label sets for training and for out-of-sample validation.
# Each frame is normalized, windowed, balanced and shuffled independently inside preprocess_df.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)