In [2]:
import pandas as pd
import numpy as np
import plotly as py
from plotly import tools
import plotly.graph_objs as go
from feature_functions import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from scipy import stats
import scipy.optimize
from scipy.optimize import OptimizeWarning
import warnings
from sklearn.linear_model import LinearRegression
from scipy import stats

# Settings used by the notebook
future = 15
averages = [5, 10, 15, 20, 30, 50, 70, 100, 200, 300]


# Read the hourly price file, give its six columns fixed names and parse the
# timestamp strings (day.month.year hour:minute:second.fraction)
df = pd.read_csv("data/EURUSDhour.csv")
df.columns = ['date', 'open', 'high', 'low', 'close', 'volume']
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y %H:%M:%S.%f')

# Index the rows by timestamp, leaving only the price/volume columns as data
df = df.set_index('date')[['open', 'high', 'low', 'close', 'volume']]

# Remove every row whose values appear more than once (keep=False discards all
# copies, not just the repeats). Intended to drop periods without movement,
# e.g. weekends when the market is closed.
df = df.drop_duplicates(keep=False)
df

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 22:00:00,1.05236,1.05253,1.05221,1.05227,143.5400
2017-01-01 23:00:00,1.05227,1.05426,1.05226,1.05282,253.1400
2017-01-02 00:00:00,1.05283,1.05283,1.05268,1.05281,131.4000
2017-01-02 01:00:00,1.05282,1.05286,1.05225,1.05240,273.9400
2017-01-02 02:00:00,1.05239,1.05240,1.05164,1.05220,258.1400
2017-01-02 03:00:00,1.05219,1.05244,1.05184,1.05226,2288.1799
2017-01-02 04:00:00,1.05226,1.05232,1.05181,1.05193,711.8000
2017-01-02 05:00:00,1.05195,1.05197,1.05136,1.05161,288.2100
2017-01-02 06:00:00,1.05163,1.05176,1.05113,1.05172,390.8800
2017-01-02 07:00:00,1.05171,1.05243,1.05129,1.05162,3070.0701


In [3]:
# Slope Function
def slope(prices, periods):
    """
    Rolling linear-regression slope of the ``high`` column.

    For every look-back length ``p`` in ``periods`` and every row position
    ``j`` with ``p <= j < len(prices) - p``, an ordinary least-squares line is
    fitted through the ``p`` highs that precede row ``j`` (positions ``j - p``
    to ``j - 1``, regressed against ``0 .. p - 1``) and its slope is stored at
    row ``j``. All other rows are NaN. When the data is too short for a given
    ``p`` the whole column is NaN instead of raising.

    :param prices: DataFrame that has a ``high`` column
    :param periods: sequence of look-back lengths (number of rows per fit);
                    a regression needs at least two points to be meaningful

    :return: DataFrame indexed like ``prices`` with one column
             ``'slope high<p>'`` per entry of ``periods``, in the same order
    """

    highs = prices.high.values
    n_rows = len(highs)

    # One pre-filled NaN column per period; rows without a fit simply stay NaN.
    names = ['slope high' + str(period) for period in periods]
    slopes = np.full((n_rows, len(periods)), np.nan)

    for col, period in enumerate(periods):
        # The x values are the same for every window of this length.
        x = np.arange(0, period)

        # NOTE(review): the last `period` rows are left NaN as well, although
        # their look-back windows exist. Kept for backward compatibility --
        # confirm whether that trailing gap is intended.
        for j in range(period, n_rows - period):
            y = highs[j - period: j]
            slopes[j, col] = stats.linregress(x, y=y).slope

    return pd.DataFrame(slopes, index=prices.index, columns=names)

In [4]:
# Look-back lengths (in rows) for which the rolling slope is computed
slopeKey = [3, 4, 5, 10, 20, 30]

# Show the resulting slope table as the cell output
slope(prices=df, periods=slopeKey)

Unnamed: 0_level_0,slope high3,slope high4,slope high5,slope high10,slope high20,slope high30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-01 22:00:00,,,,,,
2017-01-01 23:00:00,,,,,,
2017-01-02 00:00:00,,,,,,
2017-01-02 01:00:00,0.000150,,,,,
2017-01-02 02:00:00,-0.000700,-0.000044,,,,
2017-01-02 03:00:00,-0.000215,-0.000555,-0.000166,,,
2017-01-02 04:00:00,-0.000210,-0.000163,-0.000407,,,
2017-01-02 05:00:00,-0.000040,-0.000158,-0.000144,,,
2017-01-02 06:00:00,-0.000235,-0.000141,-0.000186,,,
2017-01-02 07:00:00,-0.000280,-0.000239,-0.000175,,,


In [None]:
# Repeated hold-one-out evaluation: each run cuts `i * interval` rows off the
# end of the table, trains on everything before the last remaining row and
# scores the prediction for that single row.
interval = 20
run_length = 100
correct_predictions = 0
for i in range(run_length):
    # Features are all columns but the last one; the target is the `result`
    # column (presumably that last column -- TODO confirm).
    X = df_with_averages.iloc[:df_with_averages.shape[0] - (i * interval), :-1]
    y = df_with_averages.result

    # Position of the single held-out row
    split = X.shape[0] - 1
    X_train, X_test = X.iloc[:split, :], X.iloc[split:split + 1, :]
    y_train, y_test = y[:split], y[split:split + 1]

    # Fit a fresh model on the training rows of this run
    clf = GradientBoostingClassifier(random_state=5, learning_rate=0.01, n_estimators=10000)
    clf.fit(X_train, y_train)

    # Score the one-row prediction, print it and add it to the running total
    predicted = clf.predict(X_test)
    score = accuracy_score(y_test, predicted)
    print(str(score))
    correct_predictions += score