In [14]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import spacy

# Need to load the model to get the vectors
nlp = spacy.load('en_core_web_md')

In [15]:
def groupbyDate(df):
    
    dates = list(set(df['time_gmt_date'].to_list()))
    
    rows = []
    
    for dt in dates:
        
        df_date = df[df['time_gmt_date']==dt]
        text = ""
        
        for i in range(df_date.shape[0]):
            text = text+" "+df_date['title'].iloc[i]
            
        rows.append([dt, text])
    
    df_ = pd.DataFrame(rows)
    df_.columns=["date","combined_text"]
    df_.sort_values(by='date', ascending=True, inplace=True)
    
    return df_

def encode_price_direction(df):
    
    for i in range(df.shape[0]):
        
        val = df['price_direction'].iloc[i]
        
        if val == 'up':
            df['price_direction'].iloc[i] = 1
        else: 
            df['price_direction'].iloc[i] = 0
            
    return df

def align_date(df):
    
    for i in range(df.shape[0]):
        x = df['date'].iloc[i]
        x = datetime.strptime(x,"%Y-%m-%d")
        x = x-timedelta(hours=24) 
        df['date'].iloc[i] = x
    return df

In [59]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
    
def plotLossCurve(estimator):
    # Plot model loss over epochs
    plt.plot(estimator.history['loss'])
    plt.plot(estimator.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

def get_vad_features(df):
    
    from tensorflow.keras.models import load_model

    # load model
    model = load_model('Model/model-glove-embedding-uscaled.h5')
    print("Loaded model from disk")
    
    import pickle

    # loading
    with open('Model/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    
    
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    max_length =  200 #Maximum length of each row
    padding_type = "post"
    trunction_type="post"

    X_test_sequences = tokenizer.texts_to_sequences(df['combined_text'])
    X_test_padded = pad_sequences(X_test_sequences,maxlen=max_length,
                                   padding=padding_type, truncating=trunction_type)
    
    vad = model.predict(X_test_padded)
    
    return vad

def logistic_regression(X_train,X_test,y_train,y_test):
    from sklearn.linear_model import LogisticRegression
    
    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, y_train)
    
    pred = logisticRegr.predict(X_test)
    
    p = logisticRegr.predict_proba(X_test)
    
    pred = p[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    
    auc = metrics.auc(fpr, tpr)
    
    print('Test AUC is {} '.format(auc))
    
def randomforest_classifier(X_train,X_test,y_train,y_test):
    
    from sklearn.ensemble import RandomForestClassifier
    
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train.ravel())
    p = clf.predict_proba(X_test)
    pred = p[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    
    print('Test AUC is {} '.format(auc))

In [16]:
df_news = pd.read_csv('Data/news_data.csv')
df_combined_news = groupbyDate(df_news)
df_combined_news.head()

Unnamed: 0,date,combined_text
401,2015-12-28,carl icahn trumps bridgestone s offer for pep...
1318,2015-12-29,china to wait and see if japan sincere on com...
984,2015-12-30,south korean comfort women protest against ac...
1336,2015-12-31,vw scandal could kill u s diesel car market c...
171,2016-01-01,airbus to sell three a380 superjumbo jets to ...


In [17]:
df_usdjpy = pd.read_csv('Data/usdjpy_daily.csv')
df_usdjpy = encode_price_direction(df_usdjpy)
df_usdjpy.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,date,price,price_direction
0,2016-01-01,120.2665,1
1,2016-01-02,120.8575,1
2,2016-01-03,113.918,0
3,2016-01-04,111.6515,0
4,2016-01-05,106.4375,0


In [18]:
df_usdjpy_ = align_date(df_usdjpy)
df_usdjpy_.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,date,price,price_direction
0,2015-12-31 00:00:00,120.2665,1
1,2016-01-01 00:00:00,120.8575,1
2,2016-01-02 00:00:00,113.918,0
3,2016-01-03 00:00:00,111.6515,0
4,2016-01-04 00:00:00,106.4375,0


In [19]:
df_combined_news['date'] = pd.to_datetime(df_combined_news['date'])
df_usdjpy_['date'] = pd.to_datetime(df_usdjpy_['date'])
df_merged = df_combined_news.merge(df_usdjpy_, on =['date'])
df_merged.head()

Unnamed: 0,date,combined_text,price,price_direction
0,2015-12-31,vw scandal could kill u s diesel car market c...,120.2665,1
1,2016-01-01,airbus to sell three a380 superjumbo jets to ...,120.8575,1
2,2016-01-03,japan central bank turns activist investor to...,111.6515,0
3,2016-01-05,automakers not silicon valley lead in driverl...,109.4655,1
4,2016-01-06,japanese u s leaders phone talks over north k...,102.524,0


In [20]:
df_merged.shape

(1719, 4)

In [23]:
vad = get_vad_features(df_merged)

Loaded model from disk


In [38]:
from sklearn.model_selection import train_test_split

X = vad
y = df_merged[['price_direction']]

X_train, X_test , y_train, y_test = train_test_split(X, y , test_size = 0.20)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [60]:
logistic_regression(X_train,X_test,y_train,y_test)

Test AUC is 0.5172508591065292 


  return f(*args, **kwargs)


In [56]:
randomforest_classifier(X_train,X_test,y_train,y_test)

Test AUC is 0.4231099656357389 
