In [149]:
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import keras
from sklearn.dummy import DummyClassifier
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from numpy import array

In [121]:
def create_target_label(df):
    
    """
    Build the binary target column 'y' and remove the raw response columns.

    'y' is 1 for rows where the row-wise sum of 'resp', 'resp_1', 'resp_2',
    'resp_3' and 'resp_4' is greater than or equal to zero, and 0 otherwise
    (a sum of exactly zero is therefore labelled 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five response columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame without the five response columns and with an integer
        column 'y' appended. The frame passed in is left untouched, so the
        calling cell can be re-run safely.
    """
    
    resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']
    
    # Compute the label from the input before dropping the columns it depends on.
    is_non_negative = df[resp_cols].sum(axis = 1) >= 0
    
    # drop() returns a new frame, so adding 'y' below does not write into the caller's df.
    labelled = df.drop(resp_cols, axis = 1)
    labelled['y'] = is_non_negative.astype('int64')
    
    return labelled

In [122]:
def load_data(nrows, path = 'C:/Users/mathias.buxhoeveden/Desktop/kagglepm/Data/train.csv'):
    
    """
    Read the first rows of the training CSV and order them by 'ts_id'.

    Parameters
    ----------
    nrows : int or None
        Number of data rows to read from the top of the file (passed straight
        to pandas.read_csv).
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded location
        so existing calls keep working; pass a different path to run the
        notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        The rows that were read, sorted ascending by 'ts_id'. The original row
        index is kept (it is not reset after sorting).
    """
    
    df = pd.read_csv(path, nrows = nrows)
    df = df.sort_values(by = 'ts_id')
    
    return df

In [123]:
def train_test_split(test_share, data):
    
    """
    Chronological (unshuffled) split of `data` into a leading train part and a trailing test part.

    The first int(len(data) * (1 - test_share)) rows form the train set; the
    remaining rows form the test set, from which the 'y' column is dropped.

    NOTE(review): this definition rebinds the name `train_test_split` that is
    also imported from sklearn.model_selection at the top of the notebook.

    Returns a tuple (train_set, test_set).
    """
    
    n_rows = len(data)
    cut = int(n_rows * (1 - test_share))
    
    # Leading block keeps the label, trailing block has it removed.
    head_part = data[0:cut]
    tail_part = data[cut:n_rows].drop('y', axis = 1)
    
    return (head_part, tail_part)

In [150]:
def split_sequence(sequence, n_steps):
    
    """
    Turn a sequence into supervised samples: sliding windows of `n_steps`
    consecutive values, each paired with the value that immediately follows.

    Parameters
    ----------
    sequence : list, numpy array or pandas Series
        Ordered observations. Elements are always taken by position, never by
        index label.
    n_steps : int
        Number of values in each input window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        X holding one window per row and y holding the value that follows each
        window. Both are empty arrays when the sequence has no more than
        `n_steps` elements.
    """
    
    # Convert once so every lookup below is positional. Indexing a pandas
    # Series with a bare integer is label-based, which raises KeyError or
    # returns the wrong element when the index is not 0..n-1.
    values = array(sequence)
    last_ix = len(values) - 1
    
    X, y = list(), list()
    
    for start in range(len(values)):
        
        # position of the target value, one past the end of the window
        end_ix = start + n_steps
        
        # stop once the target would fall outside the sequence
        if end_ix > last_ix:
            break
        
        X.append(values[start:end_ix])
        y.append(values[end_ix])
    
    return array(X), array(y)

In [124]:
# Read the first 100000 rows of the training file.

df = load_data(nrows=100_000)

In [125]:
# Add the binary target column 'y' and drop the raw resp columns.

df = create_target_label(df)

In [126]:
# Fill in missing values with the column mean for now.
# numeric_only=True keeps this cell working if the frame ever contains a
# non-numeric column (DataFrame.mean() raises on those in pandas >= 2.0);
# for an all-numeric frame the result is unchanged.
# NOTE(review): the means are computed on the full frame before the train/test
# split below, so test rows influence the values imputed into the train rows.

df = df.fillna(df.mean(numeric_only=True))

In [127]:
# Hold out the last 30% of the rows as the test set.

train_set, test_set = train_test_split(data=df, test_share=0.3)

In [156]:
X, y = split_sequence(train_set['y'], n_steps=3)

In [161]:
# Define and compile the LSTM network (no data preparation happens in this cell).
# NOTE(review): input_shape=(1,1) declares 1 time step x 1 feature, while
# split_sequence is called with n_steps = 3 elsewhere in this notebook --
# confirm the intended input shape before fitting.

model = Sequential([
    LSTM(50, activation='relu', input_shape=(1,1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

In [162]:
# Toy series used to sanity-check split_sequence.
# NOTE: this cell rebinds X and y, replacing the windows built from train_set.y above.
raw_seq = [10, 20, 30, 40, 50, 60, 70, 80, 90]
# window length (time steps per sample)
n_steps = 3
# build (window, next value) pairs
X, y = split_sequence(raw_seq, n_steps)
# show every window next to its target
for window, target in zip(X, y):
    print(window, target)

[10 20 30] 40
[20 30 40] 50
[30 40 50] 60
[40 50 60] 70
[50 60 70] 80
[60 70 80] 90
