In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd

import ta
#import talib

from oandapyV20 import API
from oandapyV20.contrib.factories import InstrumentsCandlesFactory
import oandapyV20.endpoints.forexlabs as labs
# https://media.readthedocs.org/pdf/oanda-api-v20/latest/oanda-api-v20.pdf
# https://financetrain.com/best-python-librariespackages-finance-financial-data-scientists/
# https://github.com/mrjbq7/ta-lib

from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix

## Parameters

In [49]:
# Global trading / back-test parameters.
# Several of these names are assigned again in the "Parameters loop" cell
# further down, so the values here are only the initial defaults.
# NOTE(review): stop_loss, take_profit and commision look like absolute price
# distances (0.0030 would be 30 pips on EUR/USD) — confirm the unit.
# NOTE(review): `commision` is misspelt; the name is kept because other cells
# may reference it.
stop_loss = 0.0030
take_profit = 0.0070
# Look-ahead horizon, in candles, used by add_change()/filter_data().
periods = 5
commision = 0.0002
number_of_models_to_test = 10
# Number of consecutive day_index values in one training window.
days_to_train = 5
# Candle granularity string passed to hist().
granularity_param = 'H1'

## Necessary functions

In [50]:
# The OANDA access token is a secret and must never be stored in the notebook.
# It is read from the OANDA_ACCESS_TOKEN environment variable; a missing
# variable fails immediately with a KeyError instead of an opaque API error.
client = API(access_token=os.environ['OANDA_ACCESS_TOKEN'])

In [51]:
def hist(api, instrument, start_days, end_days, granularity):
    """Download mid-price candles for ``instrument`` and return them as one frame.

    Parameters
    ----------
    api : object exposing ``request(r)`` (the oandapyV20 ``API`` client).
    instrument : str, instrument name passed to ``InstrumentsCandlesFactory``.
    start_days : number of days before now at which the window starts.
    end_days : number of days before now at which the window ends; an extra
        2 h 4 min is always subtracted from the end.
    granularity : str, candle granularity forwarded to the API.

    Returns
    -------
    pandas.DataFrame with the price columns renamed to ``open``/``high``/
    ``low``/``close`` plus ``time`` (datetime64) and ``vol``. The per-request
    row index is kept as delivered, so index values repeat across chunks.
    An empty frame with those columns is returned when the API yields no
    candles.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    # The trailing "Z" declares UTC, so the clock must be read in UTC as well;
    # a naive local now() would shift the window by the machine's UTC offset.
    now_utc = dt.datetime.now(dt.timezone.utc)
    start_date = (now_utc - dt.timedelta(days=start_days)).strftime(timestamp_format)
    # NOTE(review): the 2 h 4 min margin is kept from the original code;
    # presumably it keeps "to" safely in the past — confirm it is still wanted.
    end_date = (now_utc - dt.timedelta(days=end_days, hours=2, minutes=4)).strftime(timestamp_format)

    params = {
        "from": start_date,
        "to": end_date,
        "granularity": granularity,
    }

    names = {
        'o': 'open',
        'c': 'close',
        'h': 'high',
        'l': 'low',
        'time': 'time',
    }

    df_list = []
    # The factory splits the requested window into several requests.
    for r in InstrumentsCandlesFactory(instrument=instrument, params=params):
        api.request(r)
        candles = pd.DataFrame(r.response['candles'])
        if candles.empty:
            continue
        # 'mid' holds one dict per candle; expand it into one column per key.
        prices = pd.DataFrame(candles['mid'].apply(pd.Series))
        chunk = pd.concat([prices, candles['time'], candles['volume'].rename('vol')], axis=1)
        chunk['time'] = pd.to_datetime(chunk['time'], format='%Y-%m-%dT%H:%M:%S.000000000Z')
        df_list.append(chunk)

    if not df_list:
        # pd.concat([]) raises ValueError; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=['open', 'high', 'low', 'close', 'time', 'vol'])

    # rename() leaves unexpected columns untouched instead of raising KeyError.
    return pd.concat(df_list).rename(columns=names)

In [52]:
def cal(client, instrument, perdiod):
    """Fetch the forex-labs calendar and return the summed impact per timestamp.

    ``perdiod`` (sic) is the look-back period forwarded to the API as
    ``period``. The result has two columns, ``time`` and ``impact``, with one
    row per distinct ``time``; ``impact`` is the sum over that timestamp.
    """
    # PERIOD VALUES
    #3600 - 1 hour
    #43200 - 12 hours
    #86400 - 1 day
    #604800 - 1 week
    #2592000 - 1 month
    #7776000 - 3 months
    #15552000 - 6 months
    #31536000 - 1 year
    # http://developer.oanda.com/rest-live/forex-labs/
    request = labs.Calendar(params={"instrument": instrument, "period": perdiod})
    client.request(request)

    events = pd.DataFrame.from_dict(request.response, orient='columns')

    # NOTE(review): presumably 'timestamp' is epoch seconds; scaling by 1e9
    # turns it into the nanoseconds pd.to_datetime expects — confirm.
    events['timestamp'] = pd.to_datetime(events['timestamp'] * 1000000000)
    impact_by_time = events[['impact', 'timestamp']].rename(columns={'timestamp': 'time'})

    return impact_by_time.groupby('time').sum().reset_index()

In [53]:
def merge(history, calendar):
    """Outer-join candles and calendar impact on ``time``.

    The result is indexed by ``time``, every column is cast to float and
    values missing on either side of the join are replaced by 0.
    """
    joined = history.merge(calendar, how='outer', on='time')
    indexed = joined.set_index('time')
    return indexed.astype(float).fillna(0)

In [54]:
def add_ta(df):
    """Replace every indicator produced by ``ta`` with its relative one-step change.

    After ``ta.add_all_ta_features`` has run, each column other than
    open/high/low/close/vol/impact is swapped for ``<name>_change`` =
    (value - previous value) / previous value. Infinite values become NaN,
    columns with fewer than ``len(df) - 50`` non-NaN values are dropped and
    the first 100 rows are discarded.
    """
    raw_columns = ["open", "high", "low", "close", "vol", "impact"]
    df = ta.add_all_ta_features(df, "open", "high", "low", "close", "vol", fillna=False)

    indicator_names = df.drop(raw_columns, axis=1).columns.tolist()
    for name in indicator_names:
        previous = df[name].shift(1)
        df[name + "_change"] = (df[name] - previous) / previous
        df = df.drop(name, axis=1)

    # A column survives only if at most 50 of its values are missing.
    min_valid_values = len(df) - 50
    cleaned = df.replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.dropna(axis=1, thresh=min_valid_values)

    return cleaned.iloc[100:]

In [138]:
def broaden_impact(df, period):
    """Copy each non-zero ``impact`` value onto the rows that precede it.

    The frame is first sorted by its ``time`` index. Then, for every step
    from 1 to ``period``, a row takes the ``impact`` found ``step`` rows
    later whenever that later value is non-zero. Rows whose ``low`` is 0 are
    dropped from the result. The input frame is not modified.

    Fixes over the previous version:
    * the loop length is the ``period`` argument, not the global ``periods``;
    * the NaN tail produced by ``shift`` no longer counts as "non-zero", so
      the last rows keep their own impact instead of being reset to 0.

    NOTE(review): each step reads the already-broadened column, so a value
    can travel further back than ``period`` rows. That behaviour is kept
    unchanged — confirm it is intended.
    """
    df = df.reset_index().sort_values('time').set_index('time')

    for step in range(1, period + 1):
        # shift() leaves NaN at the end of the series; treat it as "no event".
        upcoming = df['impact'].shift(-step).fillna(0)
        has_upcoming_event = upcoming != 0
        df['impact'] = df['impact'].mask(has_upcoming_event, upcoming)

    df['impact'] = df['impact'].fillna(0)

    return df[df['low'] != 0]

In [56]:
def add_sessions(df):
    """Add 0/1 float flags for the trading session each row's hour falls in.

    Uses ``df.index.hour`` and adds three columns, in this order:
    ``eu_session`` (hours 7-16), ``asia_session`` (hours 21-23 and 0-7) and
    ``us_session`` (hours 12-20). ``df`` is modified in place and also
    returned.

    Fix: the Asian list used to contain hour 24, which never occurs, and
    omitted hour 0, so the midnight candle was never flagged.
    """
    session_hours = [
        ('eu_session', [7, 8, 9, 10, 11, 12, 13, 14, 15, 16]),
        ('asia_session', [21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7]),
        ('us_session', [12, 13, 14, 15, 16, 17, 18, 19, 20]),
    ]

    hour_of_row = df.index.hour
    for column, hours in session_hours:
        # float keeps the dtype the .loc-based version produced (1.0 / 0.0).
        df[column] = hour_of_row.isin(hours).astype(float)

    return df

In [219]:
def add_change(df, periods):
    """Add ``change``: close ``periods`` rows ahead minus the current close.

    The last ``periods`` rows have no future close and get NaN. ``df`` is
    modified in place and returned.
    """
    current_close = df['close']
    future_close = current_close.shift(-periods)
    df['change'] = future_close - current_close

    return df

In [220]:
def label(df, plus_change, minus_change):
    """Turn ``change`` into a class label and drop ``open`` and ``change``.

    label = 2 where change >  plus_change
    label = 1 where change < -minus_change (assigned second, so it wins if
              both conditions hold)
    label = 0 for every row still unlabelled, including NaN changes.

    The ``label`` column is written into ``df`` itself; the returned frame is
    a new one without ``open`` and ``change``.
    """
    rose = df['change'] > plus_change
    fell = df['change'] < -minus_change

    df.loc[rose, 'label'] = 2
    df.loc[fell, 'label'] = 1
    df['label'] = df['label'].fillna(0)

    return df.drop(['open', 'change'], axis=1)

In [212]:
def filter_data(df, periods):
    """Keep only rows that pass all three filters.

    Removed are:
    * Friday rows (dayofweek 4) with hour >= 20 - periods,
    * rows whose ``impact`` is not 0 (calendar events),
    * Monday rows (dayofweek 0) with hour < 7.
    """
    weekday = df.index.dayofweek
    hour = df.index.hour

    friday_evening = (weekday == 4) & (hour >= (20 - periods))
    monday_morning = (weekday == 0) & (hour < 7)
    calendar_free = (df['impact'] == 0).values

    keep = ~friday_evening & calendar_free & ~monday_morning
    return df[keep]

In [60]:
def add_day_counter(df):
    """Add ``day_index`` and ``weekday`` columns derived from the datetime index.

    ``day_index`` numbers the distinct calendar dates of the index in
    ascending order, starting at 0, and is stored as float as before.
    ``weekday`` is ``df.index.dayofweek``. ``df`` is modified in place and
    returned.

    The dates are numbered with a single factorize call instead of one
    full-frame boolean scan per date, and ``day_index`` is now also created
    when the frame is empty.
    """
    # normalize() drops the time of day, so equal dates get equal codes;
    # sort=True makes the codes follow chronological order.
    day_codes, _ = pd.factorize(df.index.normalize(), sort=True)
    df['day_index'] = day_codes.astype(float)

    df['weekday'] = df.index.dayofweek.values

    return df

## Get data for all tests

In [161]:
# 7776000 is the "3 months" period value listed in cal().
calendar = cal(client, 'EUR_USD', 7776000)

In [162]:
# EUR_USD candles from 90 days ago up to (almost) now, at the granularity
# configured in granularity_param.
history = hist(client, 'EUR_USD', 90, 0, granularity_param)

In [163]:
# Outer-join candles and calendar impact on 'time'; gaps on either side become 0.
merged = merge(history, calendar)

In [164]:
# Number of rows whose timestamp occurs more than once in the index.
is_repeated_time = merged.index.duplicated(keep=False)
len(merged.loc[is_repeated_time])

8

In [165]:
# Keep only the first row for every timestamp.
merged = merged[~merged.index.duplicated(keep='first')]

## Parameters loop

In [271]:
# Look-ahead horizon in candles (passed to broaden_impact, add_change and filter_data).
periods = 3
# Number of consecutive day_index values in each training window.
days_to_train = 5
# Thresholds on 'change' used by label(): above plus_change -> 2, below -minus_change -> 1.
plus_change = 0.0010
minus_change = 0.0010

# Hyper-parameter grid for the DecisionTreeClassifier tested below.
tree_criterions = ['entropy','gini']
tree_max_depths = [6,8,10]
tree_min_samples_leafs = [10, 25, 50]

# NOTE(review): the four values below are not read by any cell visible here;
# presumably a later back-test uses them — confirm.
number_of_models_to_test = 1
commision = 0.0002
stop_loss = 0.0030
take_profit = 0.0070

### prepare data

In [272]:
# Feature pipeline; the order matters because each step reads columns
# produced by the previous one.
# 1. spread calendar impact onto the preceding rows and drop rows with low == 0
with_broaden_impact = broaden_impact(merged, periods)
# 2. add indicators from `ta` and convert them to relative changes
with_ta_impact = add_ta(with_broaden_impact)
# 3. session flags (this call and the next one write into the frame in place)
with_ta_impact_sessions = add_sessions(with_ta_impact)
# 4. look-ahead price change over `periods` candles
with_ta_impact_sessions_change = add_change(with_ta_impact_sessions, periods)
# 5. class label from the change; 'open' and 'change' are dropped
labeled = label(with_ta_impact_sessions_change, plus_change, minus_change)
# 6. drop every column that still contains a NaN
final = labeled.dropna(axis=1, how='any')
# 7. day counter first, row filter second: a day index can therefore end up
#    with no rows after filtering
final = add_day_counter(final)
final = filter_data(final, periods)

  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  aroon_up = close.rolling(n).apply(lambda x: float(np.argmax(x) + 1) / n * 100)
  aroon_down = close.rolling(n).apply(lambda x: float(np.argmin(x) + 1) / n * 100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(new_indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A

### check label distribution

In [273]:
# Row count per label value (counted on the 'vol' column).
final.groupby('label')['vol'].count()

label
0.0    507
1.0    157
2.0    189
Name: vol, dtype: int64

### find the best parameters combination
For each parameter combination, train a model on the past X days and check its performance on the next day.
Save the predictions for each day and summarize them into a single score over the whole dataframe at the end.

In [None]:
# Walk-forward evaluation of every (criterion, max_depth, min_samples_leaf)
# combination: train on `days_to_train` consecutive days, predict the next
# day, slide the window by one day, and score all predictions together.
#
# Fixes over the previous version of this cell:
# * the result row is built inside the min_samples_leaf loop, so every
#   combination gets its own row (before, only the last leaf size per
#   criterion/depth pair was recorded);
# * the window counter is no longer overwritten by inner loops that reused `i`;
# * windows without training or test rows are skipped instead of failing in fit();
# * `clf` is reset per combination, so a row never carries a stale model;
# * both estimators are seeded, so re-running the cell gives the same ranking.
df_list = []

# Columns that are not model inputs: raw prices/volume, calendar impact,
# the target itself and the day counter that is only used for slicing.
non_feature_columns = ['close', 'high', 'low', 'vol', 'impact', 'label', 'day_index']
number_of_days = len(final['day_index'].unique())
random_seed = 42

# test each kind of tree
for criterion in tree_criterions:
    for max_depth in tree_max_depths:
        for min_samples_leaf in tree_min_samples_leafs:

            # predictions and true labels collected over all test days
            period_pred_Y = []
            period_test_Y = []
            clf = None
            last_test_day = None

            # on each date range
            for first_day in range(number_of_days - days_to_train):
                test_day = first_day + days_to_train

                train_df = final[(final['day_index'] >= first_day) & (final['day_index'] < test_day)]
                test_df = final[final['day_index'] == test_day]
                # filter_data() runs after add_day_counter(), so a day index
                # may have no rows left.
                if train_df.empty or test_df.empty:
                    continue

                train_Y = train_df['label'].values.tolist()
                train_X = train_df.drop(non_feature_columns, axis=1).values
                test_Y = test_df['label'].values.tolist()
                test_X = test_df.drop(non_feature_columns, axis=1).values

                # keep only the features whose importance in an extra-trees
                # forest is at least 0.3 times the mean importance
                forest = ExtraTreesClassifier(n_estimators=30, random_state=random_seed)
                forest.fit(train_X, train_Y)
                selected = SelectFromModel(forest, prefit=True, threshold="0.3*mean").get_support()

                clf = tree.DecisionTreeClassifier(criterion=criterion,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_samples_leaf,
                                                  random_state=random_seed)
                clf.fit(train_X[:, selected], train_Y)
                pred_Y = clf.predict(test_X[:, selected])

                # add predictions for each 'subperiod'
                period_pred_Y.extend(pred_Y)
                period_test_Y.extend(test_Y)
                last_test_day = test_day

            # dummy score for whole dataframe: share of correct predictions
            # among the rows where the model predicted a non-zero label
            good = 0
            bad = 0
            for predicted, actual in zip(period_pred_Y, period_test_Y):
                if predicted == 0:
                    continue
                if predicted == actual:
                    good += 1
                else:
                    bad += 1
            score = good / (good + bad) if (good + bad) != 0 else 0

            # First field: the last day index that was actually tested (None if
            # no window could be evaluated); clf is the tree fitted on the last
            # window.
            df_list.append([
                last_test_day,
                criterion,
                max_depth,
                min_samples_leaf,
                clf,
                score,
                period_pred_Y,
                period_test_Y,
            ])
        
        

In [None]:
# One row per entry of df_list, best score first.
score_columns = [
    'day_index',
    'criterion',
    'max_depth',
    'min_samples_leaf',
    'clf',
    'score',
    'pred_Y',
    'test_Y',
]
scores = pd.DataFrame(df_list, columns=score_columns).sort_values(by='score', ascending=False)

### Here are the best classifiers, where "the best" simply means the most accurate direction predictions

In [None]:
# Show the three rows with the highest score (scores is sorted descending).
scores.head(3)