In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
def groupby_Date_Hr(df,delta):
    
    df['time_gmt_datetime'] = pd.Series(index=df.index)
    
    for i in range(df.shape[0]):
        x = df['time_gmt_date'].iloc[i]
        x = datetime.strptime(x,"%Y-%m-%d")
        x = x+timedelta(hours=df['time_gmt_hr'].iloc[i])
        df['time_gmt_datetime'].iloc[i] = x
    
    dates = list(set(df['time_gmt_datetime'].to_list()))
    
    rows = []
    
    for dt in dates:
        
        dt_ = dt-timedelta(hours=delta)
        df_datetime = df.loc[(df['time_gmt_datetime'] > dt_) & (df['time_gmt_datetime'] <= dt)]
        
        text = ""
        for i in range(df_datetime.shape[0]):
            text = text+" "+df_datetime['title'].iloc[i]
            
        rows.append([dt, text])
    
    df_ = pd.DataFrame(rows)
    df_.columns=["time_gmt_date_hr","combined_text"]
    df_.sort_values(by='time_gmt_date_hr', ascending=True, inplace=True)
    
    return df_

def encode_price_direction(df):
    
    for i in range(df.shape[0]):
        
        val = df['price_direction'].iloc[i]
        
        if val == 'up':
            df['price_direction'].iloc[i] = 1
        else: 
            df['price_direction'].iloc[i] = 0
            
    return df

def align_datetime(df, delta):
    
    for i in range(df.shape[0]):
        x = df['date'].iloc[i]
        x = datetime.strptime(x,"%Y-%m-%d %H:%M:%S")
        x = x-timedelta(hours=delta) 
        df['date'].iloc[i] = x
    
    df.rename(columns={"date": "time_gmt_date_hr"}, inplace=True)

    return df

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
    
def plotLossCurve(estimator):
    # Plot model loss over epochs
    plt.plot(estimator.history['loss'])
    plt.plot(estimator.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

def get_vad_features(df):
    
    from tensorflow.keras.models import load_model

    # load model
    model = load_model('Model/model-glove-embedding-uscaled.h5')
    print("Loaded model from disk")
    
    import pickle

    # loading
    with open('Model/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    
    
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    max_length =  200 #Maximum length of each row
    padding_type = "post"
    trunction_type="post"

    X_test_sequences = tokenizer.texts_to_sequences(df['combined_text'])
    X_test_padded = pad_sequences(X_test_sequences,maxlen=max_length,
                                   padding=padding_type, truncating=trunction_type)
    
    vad = model.predict(X_test_padded)
    
    return vad

def logistic_regression(X_train,X_test,y_train,y_test):
    from sklearn.linear_model import LogisticRegression
    
    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, y_train.ravel())
    
    pred = logisticRegr.predict(X_test)
    
    p = logisticRegr.predict_proba(X_test)
    
    pred = p[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    
    auc = metrics.auc(fpr, tpr)
    
    print('Test AUC is {} '.format(auc))
    
def randomforest_classifier(X_train,X_test,y_train,y_test):
    
    from sklearn.ensemble import RandomForestClassifier
    
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train.ravel())
    p = clf.predict_proba(X_test)
    pred = p[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    
    print('Test AUC is {} '.format(auc))

# Delta = 1 Hr

In [4]:
df_news = pd.read_csv('Data/news_data.csv')
df_combined_news = groupby_Date_Hr(df_news, delta=1)
df_combined_news.head(5)

  df['time_gmt_datetime'] = pd.Series(index=df.index)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,combined_text
1917,2015-12-28 00:00:00,japan output retail sales slump dampen recove...
4671,2015-12-28 04:00:00,japan business lobby head won t commit to hig...
4360,2015-12-28 07:00:00,south korea japan agree to irreversibly end c...
5160,2015-12-28 08:00:00,japan firms cold on abe s calls for wage hike...
1711,2015-12-28 10:00:00,for japan and south korea comfort women bronz...


In [7]:
df_usdjpy = pd.read_csv('Data/usdjpy_hourly.csv')
df_usdjpy = encode_price_direction(df_usdjpy)
df_usdjpy.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,date,price,price_direction
0,2016-01-01 00:00:00,120.2665,1
1,2016-01-01 01:00:00,120.2665,1
2,2016-01-01 02:00:00,120.2665,1
3,2016-01-01 03:00:00,120.2665,1
4,2016-01-01 04:00:00,120.2665,1


In [8]:
df_usdjpy_ = align_datetime(df_usdjpy, delta = 1)
df_usdjpy_.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,price,price_direction
0,2015-12-31 23:00:00,120.2665,1
1,2016-01-01 00:00:00,120.2665,1
2,2016-01-01 01:00:00,120.2665,1
3,2016-01-01 02:00:00,120.2665,1
4,2016-01-01 03:00:00,120.2665,1


In [9]:
df_combined_news['time_gmt_date_hr'] = pd.to_datetime(df_combined_news['time_gmt_date_hr'])
df_usdjpy_['time_gmt_date_hr'] = pd.to_datetime(df_usdjpy_['time_gmt_date_hr'])
df_merged = df_combined_news.merge(df_usdjpy_, on =['time_gmt_date_hr'])
df_merged.head()

Unnamed: 0,time_gmt_date_hr,combined_text,price,price_direction
0,2016-01-01 20:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
1,2016-01-01 21:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
2,2016-01-03 21:00:00,japan central bank turns activist investor to...,114.048,1
3,2016-01-05 18:00:00,automakers not silicon valley lead in driverl...,106.335,1
4,2016-01-06 04:00:00,japan says to make firm response to north kor...,109.9575,0


In [10]:
df_merged.shape

(6357, 4)

In [14]:
vad = get_vad_features(df_merged)

Loaded model from disk


In [15]:
from sklearn.model_selection import train_test_split

X = vad
y = df_merged[['price_direction']]

X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [21]:
logistic_regression(X_train,X_test,y_train,y_test)

Test AUC is 0.477940301756322 


In [17]:
randomforest_classifier(X_train,X_test,y_train,y_test)

Test AUC is 0.48345427715920797 


# Delta = 2Hr

In [34]:
df_news = pd.read_csv('Data/news_data.csv')
df_combined_news = groupby_Date_Hr(df_news, delta=2)
df_combined_news.head(5)

  df['time_gmt_datetime'] = pd.Series(index=df.index)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,combined_text
1917,2015-12-28 00:00:00,japan output retail sales slump dampen recove...
4671,2015-12-28 04:00:00,japan business lobby head won t commit to hig...
4360,2015-12-28 07:00:00,south korea japan agree to irreversibly end c...
5160,2015-12-28 08:00:00,south korea japan agree to irreversibly end c...
1711,2015-12-28 10:00:00,for japan and south korea comfort women bronz...


In [37]:
df_usdjpy = pd.read_csv('Data/usdjpy_hourly.csv')
df_usdjpy = encode_price_direction(df_usdjpy)
df_usdjpy.head()

Unnamed: 0,date,price,price_direction
0,2016-01-01 00:00:00,120.2665,1
1,2016-01-01 01:00:00,120.2665,1
2,2016-01-01 02:00:00,120.2665,1
3,2016-01-01 03:00:00,120.2665,1
4,2016-01-01 04:00:00,120.2665,1


In [38]:
df_usdjpy_ = align_datetime(df_usdjpy, delta = 2)
df_usdjpy_.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,price,price_direction
0,2015-12-31 22:00:00,120.2665,1
1,2015-12-31 23:00:00,120.2665,1
2,2016-01-01 00:00:00,120.2665,1
3,2016-01-01 01:00:00,120.2665,1
4,2016-01-01 02:00:00,120.2665,1


In [39]:
df_combined_news['time_gmt_date_hr'] = pd.to_datetime(df_combined_news['time_gmt_date_hr'])
df_usdjpy_['time_gmt_date_hr'] = pd.to_datetime(df_usdjpy_['time_gmt_date_hr'])
df_merged = df_combined_news.merge(df_usdjpy_, on =['time_gmt_date_hr'])
df_merged.head()

Unnamed: 0,time_gmt_date_hr,combined_text,price,price_direction
0,2016-01-01 20:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
1,2016-01-01 21:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
2,2016-01-03 21:00:00,japan central bank turns activist investor to...,113.918,0
3,2016-01-05 18:00:00,automakers not silicon valley lead in driverl...,106.335,1
4,2016-01-06 04:00:00,japan says to make firm response to north kor...,110.0615,1


In [40]:
vad = get_vad_features(df_merged)

Loaded model from disk


In [41]:
from sklearn.model_selection import train_test_split

X = vad
y = df_merged[['price_direction']]

X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [42]:
logistic_regression(X_train,X_test,y_train,y_test)

Test AUC is 0.5162374693126023 


In [43]:
randomforest_classifier(X_train,X_test,y_train,y_test)

Test AUC is 0.46884973404255315 


# Delta = 5Hr

In [44]:
df_news = pd.read_csv('Data/news_data.csv')
df_combined_news = groupby_Date_Hr(df_news, delta=5)
df_combined_news.head(5)

  df['time_gmt_datetime'] = pd.Series(index=df.index)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,combined_text
1917,2015-12-28 00:00:00,japan output retail sales slump dampen recove...
4671,2015-12-28 04:00:00,japan business lobby head won t commit to hig...
4360,2015-12-28 07:00:00,south korea japan agree to irreversibly end c...
5160,2015-12-28 08:00:00,south korea japan agree to irreversibly end c...
1711,2015-12-28 10:00:00,south korea japan agree to irreversibly end c...


In [45]:
df_usdjpy = pd.read_csv('Data/usdjpy_hourly.csv')
df_usdjpy = encode_price_direction(df_usdjpy)
df_usdjpy.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,date,price,price_direction
0,2016-01-01 00:00:00,120.2665,1
1,2016-01-01 01:00:00,120.2665,1
2,2016-01-01 02:00:00,120.2665,1
3,2016-01-01 03:00:00,120.2665,1
4,2016-01-01 04:00:00,120.2665,1


In [46]:
df_usdjpy_ = align_datetime(df_usdjpy, delta = 2)
df_usdjpy_.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,time_gmt_date_hr,price,price_direction
0,2015-12-31 22:00:00,120.2665,1
1,2015-12-31 23:00:00,120.2665,1
2,2016-01-01 00:00:00,120.2665,1
3,2016-01-01 01:00:00,120.2665,1
4,2016-01-01 02:00:00,120.2665,1


In [47]:
df_combined_news['time_gmt_date_hr'] = pd.to_datetime(df_combined_news['time_gmt_date_hr'])
df_usdjpy_['time_gmt_date_hr'] = pd.to_datetime(df_usdjpy_['time_gmt_date_hr'])
df_merged = df_combined_news.merge(df_usdjpy_, on =['time_gmt_date_hr'])
df_merged.head()

Unnamed: 0,time_gmt_date_hr,combined_text,price,price_direction
0,2016-01-01 20:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
1,2016-01-01 21:00:00,airbus to sell three a380 superjumbo jets to ...,120.2665,1
2,2016-01-03 21:00:00,japan central bank turns activist investor to...,113.918,0
3,2016-01-05 18:00:00,automakers not silicon valley lead in driverl...,106.335,1
4,2016-01-06 04:00:00,japan says to make firm response to north kor...,110.0615,1


In [48]:
vad = get_vad_features(df_merged)

Loaded model from disk


In [49]:
from sklearn.model_selection import train_test_split

X = vad
y = df_merged[['price_direction']]

X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [50]:
logistic_regression(X_train,X_test,y_train,y_test)

Test AUC is 0.5112949805992099 


In [51]:
randomforest_classifier(X_train,X_test,y_train,y_test)

Test AUC is 0.4821254559122421 
