# Imports

In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import keras

!pip install -q git+https://github.com/oanda/oandapy.git
import oandapy as opy

from datetime import datetime as dt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
oanda = opy.API(environment='live')

# Data

In [4]:
# Parameters for the Oanda history download: instrument, first day and last day.
# The last day is today's date, formatted as YYYY-MM-DD like the first one.
instrument = 'USD_JPY'
d1 = '2007-01-01'
d2 = dt.now().strftime('%Y-%m-%d')

In [None]:
# Download the history in one-day chunks (one API request per calendar day).
dates = pd.date_range(start=d1, end=d2, freq='D')

# The daily frames are collected in a list and concatenated once at the end:
# growing a DataFrame with .append inside a loop copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas 2.0.
frames = []
failed_days = []
for day_start, day_end in zip(dates[:-1], dates[1:]):
    # The request uses the 'YYYY-MM-DDTHH:MM:SS' form of each timestamp.
    start = str(day_start).replace(' ', 'T')
    end = str(day_end).replace(' ', 'T')

    try:
        data = oanda.get_history(instrument=instrument, start=start, end=end, granularity='M1')
        candles = data['candles']
    except Exception as exc:
        # Best effort: a failed day must not abort the whole download, but it is
        # recorded so that gaps in the data are visible instead of silent.
        failed_days.append((start, repr(exc)))
        continue

    if candles:
        frames.append(pd.DataFrame(candles))

if failed_days:
    print('Skipped %d of %d daily requests; first failure: %s -> %s'
          % (len(failed_days), len(dates) - 1, failed_days[0][0], failed_days[0][1]))

if not frames:
    raise RuntimeError('No candles were downloaded; check the Oanda credentials and the date range.')

# ignore_index gives the combined frame one unique, consecutive row index instead
# of repeating each day's 0..n index.
df = pd.concat(frames, ignore_index=True)
date = pd.DatetimeIndex(df['time'], tz='UTC')
df['date'] = date

In [None]:
DF = df.copy()

In [None]:
df = df.drop(['complete', 'time', 'closeAsk', 'highAsk', 'lowAsk', 'openAsk'], axis=1)

In [None]:
df.head()

In [None]:
# Names of the lagged copies of each bid price column; position n in every list
# is paired with shifts[n] by the zip below.
cl = ['Close15', 'Close30', 'Close45']
lo = ['Low15', 'Low30', 'Low45']
hi = ['High15', 'High30', 'High45']
op = ['Open15', 'Open30', 'Open45']
# Number of rows each lagged column is shifted down by.
# NOTE(review): the suffixes 15/30/45 read like minutes, but the shifts are 1/2/3
# rows and the download cell requests granularity 'M1' — confirm whether the
# shifts (or the granularity) are the intended ones.
shifts = [1,2,3]

# One pass per lag: add the close, low, high and open of the row `s` positions earlier.
# The loop variables (i, j, k, o, s) remain defined at notebook level afterwards.
for i,j,k,o,s in zip(cl,lo,hi,op,shifts):
  df[i] = df['closeBid'].shift(s)
  df[j] = df['lowBid'].shift(s)
  df[k] = df['highBid'].shift(s)
  df[o] = df['openBid'].shift(s)

In [None]:
df.head()

In [None]:
df['date'] = df['date'].astype(str)

In [None]:
df['d2'] = df['date'].str[14:]

In [None]:
df = df[df['d2'] == '45:00+00:00']

In [None]:
df = df[4:-4]

In [None]:
# Row-wise highest (H) and lowest (L) value over every column except volume and
# the two date helper columns.
# The columns to exclude are passed by keyword: the positional `axis` argument of
# DataFrame.drop is rejected by pandas >= 2.0.
price_columns = df.drop(columns=['volume', 'date', 'd2'])
# assign() returns a new frame, so no values are written into a filtered slice.
df = df.assign(H=price_columns.max(axis=1), L=price_columns.min(axis=1))


In [None]:
df.head()

In [None]:
# DataFrame for the linear regression that forecasts future prices
dflr = df.copy()
# DataFrame for the classification neural network (probabilities)
dfnn = df.copy()

# Regresión Lineal (Pronóstico)

In [None]:
LRresponsesBid = ['FutureClose', 'FutureHigh', 'FutureLow']

In [None]:
LRactualBid = ['closeBid', 'H', 'L']

In [None]:
dflr.head()

In [None]:
for j,l in zip(LRresponsesBid, LRactualBid):
    dflr[j] = dflr[l].shift(-1)

In [None]:
dflr['intercept'] = 1

In [None]:
dflr.head()

In [None]:
def augment_data(DF, response, correl=0.1, datetrans=False, convertdummies=False, dummy_transform=False):
    """
    Try a fixed set of transformations of the predictors and keep the ones whose
    correlation with the response is good enough.

    For every numeric column (other than the response) the square, cube, square
    root, inverse and logarithm are tried; each is kept only if its absolute
    correlation with the response exceeds both ``correl`` and the correlation of
    the untransformed column. In addition, for every numeric column the ratio to
    the previous row (``%(col)``), a 0/1 flag for ratio >= 1 (``>col``) and the
    values of the previous one and two rows (``col2``, ``col3``) are always added.

    Args:
        DF (DataFrame): Input data; it is copied, not modified.
        response (str): Name of the dependent variable (must be a column of DF).
        correl (float): Minimum absolute correlation a candidate needs to be kept.
        datetrans (boolean): If true, derive hour, day and month of every datetime
                             column and the day difference between each pair of
                             datetime columns, subject to the same threshold.
        convertdummies (boolean): If true, one-hot encode the category/object
                                  columns with ``pd.get_dummies``.
        dummy_transform (boolean): If true, also try the product (logical AND) of
                                   every pair of 0/1 columns. Can be slow.
    Returns:
        df (DataFrame): The int/float columns of the augmented frame, with +/-inf
                        replaced by +/-100000.

    """
    # `display` is only a builtin inside IPython; fall back to print elsewhere.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    df = DF.copy()

    numericas = [c for c in df.select_dtypes(include=['int', 'float']).columns if c != response]
    fechas = [c for c in df.select_dtypes(include=['datetime']).columns if c != response]

    newvars = []
    unuseful = []

    def corr_with_response(column):
        # First row of the response column of the 2x2 correlation matrix, i.e. the
        # correlation between `column` and the response. Selected by position
        # with .iloc (plain [0] on a label-indexed Series is deprecated).
        return df[[column, response]].corr()[response].iloc[0]

    def keep_or_discard(column, baseline=None):
        # Classify a freshly added column: kept when it beats the threshold (and
        # the untransformed baseline, when given); a NaN correlation never passes.
        value = corr_with_response(column)
        useful = abs(value) > abs(correl)
        if baseline is not None:
            useful = useful and abs(value) > abs(baseline)
        if useful:
            newvars.append(column)
        else:
            unuseful.append(column)

    if convertdummies:
        cat = list(df.select_dtypes(include=['category', 'object']).columns)
        df = pd.get_dummies(df, columns=cat)

    # Pairwise products of the binary columns; this can take a long time when
    # there are many of them.
    if dummy_transform:
        dummy_vars = [c for c in df.columns
                      if set(df[c].unique()) == {0, 1} and c != response]
        fechas = [c for c in fechas if c not in dummy_vars]
        numericas = [c for c in numericas if c not in dummy_vars]

        for pos, first in enumerate(dummy_vars):
            for second in dummy_vars[pos + 1:]:
                # Product of two 0/1 columns = logical AND
                varname = first + '*' + second
                df[varname] = df[first] * df[second]
                keep_or_discard(varname)

    if datetrans:
        for pos, fecha in enumerate(fechas):
            # Hour, day of month and month of the date
            for prefix, part in (('hora_', 'hour'), ('dia_', 'day'), ('mes_', 'month')):
                varname = prefix + fecha
                df[varname] = getattr(df[fecha].dt, part)
                keep_or_discard(varname)

            for other in fechas[pos + 1:]:
                # Difference between the two dates, in days, where both are present
                varname = fecha + '-' + other
                both = (df[fecha].notnull()) & (df[other].notnull())
                df.loc[both, varname] = (df[fecha] - df[other]).dt.days
                keep_or_discard(varname)

    for col in numericas:
        # Correlation of the untransformed column: every transformation below has
        # to beat it as well as the threshold.
        baseline = corr_with_response(col)

        candidates = (
            (col + '^2', lambda s: s ** 2),
            (col + '^3', lambda s: s ** 3),
            ('sqrt(' + col + ')', np.sqrt),
            ('1/' + col, lambda s: 1 / s),
            ('log(' + col + ')', np.log),
        )
        for varname, transform in candidates:
            df[varname] = transform(df[col])
            keep_or_discard(varname, baseline)

        # Ratio to the previous row; always kept
        ratio = '%(' + col + ')'
        df[ratio] = df[col].div(df[col].shift(1))
        newvars.append(ratio)

        # 1 when the value did not fall with respect to the previous row; always kept
        flag = '>' + col
        df[flag] = 0
        df.loc[df[ratio] >= 1, flag] = 1
        newvars.append(flag)

        # Values of the previous one and two rows, stored as col2 and col3
        for lag in (1, 2):
            df[col + str(lag + 1)] = df[col].shift(lag)

    # Keyword form: the positional axis argument is rejected by pandas >= 2.0
    df = df.drop(columns=unuseful)
    print('Agregamos las siguientes transformaciones:')
    show(newvars)

    df = df.replace(-np.inf, -100000)
    df = df.replace(np.inf, 100000)

    num = list(df.select_dtypes(include=['int', 'float']).columns)

    return df[num]

In [None]:
# One augmented frame per regression target. The other targets are dropped
# before augmenting so that they cannot end up among the predictors.
LRresponses = LRresponsesBid
dfs = [
    augment_data(
        dflr.drop([other for other in LRresponses if other != target], axis=1),
        target,
        correl=0.2,
        datetrans=True,
        convertdummies=False,
        dummy_transform=False,
    )
    for target in LRresponses
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit


def train_test(Data, response, time_series=False):
    """
    Split the data into train and test sets.

    With ``time_series`` false the rows are shuffled by ``train_test_split``
    (``random_state=0``, library default proportions: 75% train / 25% test).
    With ``time_series`` true the order of the rows is preserved and the last of
    the two folds produced by ``TimeSeriesSplit(n_splits=2)`` is used, i.e. the
    model is tested on the final block of rows and trained on everything before.

    Args:
        Data (DataFrame): Data ready for the model
        response (str): Name of the response column
        time_series (boolean): Whether the rows form a time series
    Returns:
        X_train (Array): training set (independent variables)
        X_test (Array): test set (independent variables)
        y_train (Array): training set (dependent variable)
        y_test (Array): test set (dependent variable)

    """
    # drop() already returns a new frame, so the input is not modified. The
    # column is named by keyword: a positional axis is rejected by pandas >= 2.0.
    X = Data.drop(columns=response)
    y = Data[response]

    if time_series:
        # Folds come out in chronological order; only the last one is used.
        train_index, test_index = list(TimeSeriesSplit(n_splits=2).split(X))[-1]
        return (X.values[train_index], X.values[test_index],
                y.values[train_index], y.values[test_index])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    return X_train.values, X_test.values, y_train.values, y_test.values

In [None]:
#!pip install statsmodels
import statsmodels.api as sm

def linreg_model(X_train, y_train):
    """
    Fit an ordinary least squares (OLS) linear regression.

    No constant is added here; the design matrix is used as given.

    Args:
        X_train (Array): training set (independent variables)
        y_train (Array): training set (dependent variable)
    returns:
        lr (model): fitted OLS results object

    """
    return sm.OLS(y_train, X_train).fit()

In [None]:
import matplotlib.pyplot as plt

# Fit one OLS model per regression target and plot its forecast on the test rows.
lrmodels = []
for idx, frame in enumerate(dfs):
    target = LRresponses[idx]
    print(target)

    # Rows with missing values are removed and the cleaned frame replaces the
    # stored one.
    frame = frame.dropna()
    dfs[idx] = frame

    X_train, X_test, y_train, y_test = train_test(frame, target, time_series=True)
    lr = linreg_model(X_train, y_train)
    print(lr.summary())
    lrmodels.append(lr)

    # Forecast (red) against the observed values (blue) over the test rows
    steps = range(len(y_test))
    plt.figure(figsize=(15,4))
    plt.plot(steps, lr.predict(X_test), color='r')
    plt.plot(steps, y_test, color='b')
    plt.show()

In [None]:
# Persist every fitted regression in a file named after its target.
# NOTE(review): these are statsmodels results objects; confirm that '.h5' is the
# intended extension for the format their save() method writes.
for fitted, target in zip(lrmodels, LRresponses):
    fitted.save(target + '.h5')

# Red Neuronal (Clasificación)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import ModelCheckpoint
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot
from keras.models import load_model
from IPython.display import SVG

In [None]:
# Columns the labels are derived from, and the name of the label built from each.
NNactualBid = ['closeBid', 'H', 'L']
responsenn = ['Labelclose', 'labelhigh', 'labellow']

# A label is 1 when the value in the next row is strictly greater than the value
# in the current row, and 0 otherwise.
for price_col, label_col in zip(NNactualBid, responsenn):
    next_move = dfnn[price_col].shift(-1) - dfnn[price_col]
    # Vectorised comparison instead of a per-row apply and a temporary column
    # (whose removal used DataFrame.drop's positional axis, rejected by
    # pandas >= 2.0). The last row has no successor: its move is NaN, so its
    # label is 0.
    dfnn[label_col] = (next_move > 0.0).astype(int)

In [None]:
dfnn.head(6)

In [None]:
# One augmented frame per classification label. The other labels are dropped
# before augmenting so that they cannot end up among the predictors.
dfsn = [
    augment_data(
        dfnn.drop([other for other in responsenn if other != label], axis=1),
        label,
        correl=0.2,
        datetrans=True,
        convertdummies=False,
        dummy_transform=False,
    )
    for label in responsenn
]

In [None]:
pd.options.display.max_columns = 999

In [None]:
def NN(X_train, y_train, neurons, activations, initializer,
       optimizer, epochs, batch, loss, checkpoint=False):
    """
    Build, compile and train a fully connected network.

    Args:
        X_train (Array): Independent variables (training sample), one row per sample
        y_train (Array): Dependent variable (training sample)
        neurons (list): Number of neurons in each layer
        activations (list): Activation function of each layer; needs at least as
                            many entries as ``neurons``
        initializer (str): Kernel initializer
        optimizer (str): Optimizer
        epochs (int): Number of epochs
        batch (int): Batch size
        loss (str): Loss function
        checkpoint (boolean): If true, 10% of the training data is held out for
                              validation and the model is saved to disk every
                              time the validation accuracy improves

    Returns:
        model (model): Trained network

    Raises:
        ValueError: if ``activations`` has fewer entries than ``neurons``
    """
    if len(activations) < len(neurons):
        # Fail before any layer is built, with a message that names the problem.
        raise ValueError('%d layers need %d activations, got %d'
                         % (len(neurons), len(neurons), len(activations)))

    # Number of input features = number of columns. Read from the shape rather
    # than from the length of the second row, which fails on a single sample.
    dim = np.shape(X_train)[1]

    model = Sequential()
    model.add(Dense(neurons[0], input_dim=dim, kernel_initializer=initializer,
                    bias_initializer='zeros', activation=activations[0]))

    for units, activation in zip(neurons[1:], activations[1:]):
        model.add(Dense(units, kernel_initializer=initializer,
                        bias_initializer='zeros', activation=activation))

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    if checkpoint:
        # NOTE(review): the monitored metric is spelled 'val_acc'; confirm this
        # matches the metric name reported by the installed Keras version.
        filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
        saver = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

        model.fit(X_train, y_train, epochs=epochs, batch_size=batch,
                  callbacks=[saver], validation_split=0.1)
    else:
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch)

    return model

In [None]:
def model_precision(y_test, predictions, lim):
    """
    Compute and display classification scores for a given decision threshold.

    The inputs are read through flattened views/copies; their shape is left
    untouched. A score whose denominator is zero is returned as NaN.

    Args:
        y_test (array): Observed values of the dependent variable (0 or 1)
        predictions (array): Predicted scores, one per observation
        lim (float): Between 0 and 1; a score >= lim is classified as positive

    Returns:
        Accuracy (float): (tp+tn)/(tp+tn+fp+fn)
        Precision (float): tp/(tp+fp)
        Recall (float): tp/(tp+fn)
        F1_score (float): harmonic mean of Precision and Recall
        MCC (float): Matthews Correlation Coefficient
                     (tp*tn-fp*fn)/sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))

    """
    # Imported here so the function does not depend on a later notebook cell.
    import math
    # `display` is only a builtin inside IPython; fall back to print elsewhere.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    labels = np.asarray(y_test).reshape(-1)
    scores = np.asarray(predictions).reshape(-1)

    is_pos = labels == 1
    is_neg = labels == 0
    above = scores >= lim
    below = scores < lim

    # Plain Python ints: the product in the MCC denominator can exceed the
    # int64 range for large samples.
    tp = int((is_pos & above).sum())
    fp = int((is_neg & above).sum())
    tn = int((is_neg & below).sum())
    fn = int((is_pos & below).sum())

    def ratio(num, den):
        return num / den if den else float('nan')

    Accuracy = ratio(tp + tn, tp + tn + fp + fn)
    Precision = ratio(tp, tp + fp)
    Recall = ratio(tp, tp + fn)
    if math.isnan(Precision) or math.isnan(Recall):
        F1_score = float('nan')
    elif Precision + Recall == 0:
        F1_score = 0.0
    else:
        F1_score = 2 * Precision * Recall / (Precision + Recall)
    MCC = ratio(tp * tn - fp * fn,
                math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))

    # Built directly from the float scores (in percent), so no float is written
    # into an integer column.
    res = pd.DataFrame(
        {'Score': [100 * Accuracy, 100 * Precision, 100 * Recall, 100 * F1_score, 100 * MCC]},
        index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'MCC'])
    show(res)

    return Accuracy, Precision, Recall, F1_score, MCC

def bucket_scores(y_test, predictions):
    """
    Positive rate per score bucket, in buckets of 10 points.

    Buckets are [0, 0.1), [0.1, 0.2), ..., [0.9, 1.0]; the last one includes its
    upper edge so that a score of exactly 1.0 is counted. The inputs are read
    through flattened views/copies; their shape is left untouched.

    Args:
        y_test (array): Observed values of the dependent variable (0 or 1)
        predictions (array): Predicted scores, one per observation

    Returns:
        res (DataFrame): Per bucket, the number of scores ('Total'), how many of
                         them belong to positive observations ('Positives') and
                         the percentage 'Positive Rate' (NaN for empty buckets)

    """
    scoresindex = ['0-10','10-20','20-30','30-40','40-50','50-60','60-70','70-80','80-90','90-100']

    labels = np.asarray(y_test).reshape(-1)
    scores = np.asarray(predictions).reshape(-1)

    # Edges computed as k/10 rather than by repeatedly adding 0.1, which drifts
    # and leaves the top edge slightly below 1.0.
    edges = np.arange(11) / 10
    total, _ = np.histogram(scores, bins=edges)
    positives, _ = np.histogram(scores[labels == 1], bins=edges)

    res = pd.DataFrame({'Total': total, 'Positives': positives}, index=scoresindex)
    res['Positive Rate'] = res['Positives']/res['Total']*100

    return res

In [None]:
dfsn[0].head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
import math

# For every label: train 10 randomly sized networks, score each on the test rows
# and remember the one with the largest AUC.
nnmodels = []
for i in range(len(dfsn)):
    print(responsenn[i])
    dfsn[i] = dfsn[i].dropna()
    X_train, X_test, y_train, y_test = train_test(dfsn[i], responsenn[i], time_series=True)

    # The scaler is fitted on the training rows only and then applied unchanged
    # to the test rows, so both sets go through the same transformation.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    modelosnn = []
    maxauc = 0
    bestnn = None
    for j in range(10):
        try:
            # Random architecture: k hidden layers of random width (at least 1
            # neuron each), between an input-sized first layer and one output.
            k = round(abs(np.random.randn() * 10))
            hidden = [max(1, round(abs(np.random.randn() * 100))) for _ in range(k)]
            neurons = [X_train.shape[1]] + hidden + [1]
            # One activation per layer: relu everywhere, sigmoid on the output.
            activations = ['relu'] * (len(neurons) - 1) + ['sigmoid']
            initializer = 'he_normal'
            optimizer = 'adam'
            loss = 'binary_crossentropy'
            print('\n Modelo #')
            print(j)
            print('\n Neurons')
            print(neurons)
            print('\n Initializer')
            print(initializer)
            print('\n Optimizer')
            print(optimizer)
            print('\n Loss')
            print(loss)
            mod = NN(X_train, y_train, neurons, activations,
                     initializer, optimizer, epochs=17, batch=512, loss=loss)
            modelosnn.append(mod)
            predictions_test = mod.predict(X_test)

            print('\n -----------------------------------------')
            lim = 0.5
            Accuracy, Precision, Recall, F1_score, MCC = model_precision(y_test,predictions_test,lim)
            bs = bucket_scores(y_test, predictions_test)
            display(bs)

            fpr, tpr, thresholds = roc_curve(y_test, predictions_test, pos_label=None, sample_weight=None, drop_intermediate=True)
            plt.plot(fpr, tpr)
            auc = np.trapz(tpr, fpr)
            print('auc')
            print(auc)

            if auc > maxauc:
                maxauc = auc
                bestnn = mod

            plt.axis([0,1,0,1])
            plt.plot([0,1],[0,1])
            plt.show()
        except Exception as exc:
            # Best effort: one failed candidate must not stop the search, but
            # the failure is reported instead of being silently discarded.
            print('\n Model #%d skipped: %r' % (j, exc))

    # Best network for this label (None if every candidate failed)
    nnmodels.append(bestnn)

In [None]:
# Re-score every model left in modelosnn on the current X_test / y_test and keep
# the one with the largest AUC in bestnn. All ROC curves share one figure.
# maxf1 is initialised here but not used in this cell.
maxf1 = 0
maxauc = 0

plt.figure(figsize=(25,25))
for i in modelosnn:
            
    predictions_train = i.predict(X_train)
    predictions_test = i.predict(X_test)
    
    
    print('\n -----------------------------------------')
    # Scores at or above lim are classified as positive
    lim = 0.5
    Accuracy, Precision, Recall, F1_score, MCC = model_precision(y_test,predictions_test,lim)
    bs = bucket_scores(y_test, predictions_test)
    display(bs)  
    
    fpr, tpr, thresholds = roc_curve(y_test, predictions_test, pos_label=None, sample_weight=None, drop_intermediate=True)
    plt.plot(fpr, tpr)
    # Area under the ROC curve by the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    print('auc')
    print(auc)
    
    # bestnn ends up holding the model object itself (i is a model, not an index)
    if auc > maxauc:
        maxauc = auc
        bestnn = i

# Diagonal drawn as a reference line from (0,0) to (1,1)
plt.axis([0,1,0,1])
plt.plot([0,1],[0,1])
plt.show()

In [None]:
# ROC curve of the selected model (bestnn) on the test rows, drawn against the
# diagonal that the legend labels as a random forecast.
predictions_test = bestnn.predict(X_test)

plt.figure(figsize=(15,10))

fpr, tpr, thresholds = roc_curve(y_test, predictions_test, pos_label=None, sample_weight=None, drop_intermediate=True)
plt.plot(fpr, tpr, color='b', label='NN')

plt.plot([0,1],[0,1], color='black', label='Pronóstico Aleatorio')

plt.legend(prop={'size': 20})

plt.axis([0,1,0,1])

plt.show()

# Area under the ROC curve by the trapezoidal rule
print('auc')
print(np.trapz(tpr, fpr))


In [None]:
# responsenn is the list of label names, so `responsenn + '.h5'` raises a
# TypeError. The models in modelosnn (and therefore bestnn) come from the last
# pass of the training loop over responsenn, so the file is named after the
# last label.
bestnn.save(responsenn[-1] + '.h5')

# Trading

In [None]:
# Most recent 15-minute candles for the instrument.
response = oanda.get_history(instrument="USD_JPY", granularity='M15', since='2018-08-23T01:00:00.00000Z')
# The download cell reads get_history results through the 'candles' key; the
# payload is read the same way here ("prices" would yield None).
prices = response.get("candles")

In [None]:
response

In [None]:
prices