In [26]:
import pickle
import numpy as np
import pandas as pd
from scipy.stats import t
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit

In [2]:
data = pd.read_pickle('../data/processed/feature_encoded_merged_data.pkl')
data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,...,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values,dataset,target,unigram_vec,phrase_vec
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,...,-0.5,2.57,3.01,8.21,0.414034,14.17,train,STAY,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,...,0.16,0.39,7.58,14.93,-1.208981,15.13,train,DOWN,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,...,0.53,0.61,5.15,13.6,-0.25099,12.47,train,STAY,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,...,-1.75,-2.19,-1.37,8.56,0.070178,13.31,train,STAY,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,...,-1.28,2.88,11.54,17.17,0.604141,10.45,train,STAY,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Phrase Selection

In [3]:
# t_test = pd.read_csv('../data/t-test.csv')
# t_test = t_test.dropna(subset=['t values'])
# t_test.shape

In [4]:
# def get_prob(t_val):
#     return t.sf(abs(t_val), 2998)

In [5]:
# t_test['Probabilities'] = t_test['Coefficients'].apply(get_prob)

In [6]:
# phrase_indicies = t_test.sort_values(by = ['Probabilities']).head(2319).index.values

In [7]:
# def select_phrases(phrases):
#     return np.array(phrases)[phrase_indicies]

In [8]:
# %%time

# data['top_phrases'] = data['phrase_vec'].apply(select_phrases)

In [9]:
def select_phrases(phrases):
    return phrases[:2107]

In [10]:
data['top_phrases'] = data['phrase_vec'].apply(select_phrases)

# Splits and Events 

In [11]:
train = data.loc[data['dataset'] == 'train'].copy()
val = data.loc[data['dataset'] == 'val'].copy()
test = data.loc[data['dataset'] == 'test'].copy()

In [12]:
mlb = MultiLabelBinarizer()

all_events = pd.DataFrame(mlb.fit_transform(data['cleaned_event']),
                   columns = mlb.classes_,
                   index = data['cleaned_event'].index)

# EDA by split

In [13]:
pd.DataFrame(data = [data.groupby(data['target']).count()['symbol'] / data.shape[0],
                     train.groupby(train['target']).count()['symbol'] / train.shape[0],
                     val.groupby(val['target']).count()['symbol'] / val.shape[0],
                     test.groupby(test['target']).count()['symbol'] / test.shape[0]],
             index = ["all_data", "train", "val", "test"]
            )

target,DOWN,STAY,UP
all_data,0.38133,0.216474,0.402195
train,0.372236,0.213762,0.414003
val,0.392472,0.215515,0.392013
test,0.388398,0.222859,0.388742


In [14]:
pd.DataFrame(data = [data.groupby(data['target']).mean()['targe_price_change'],
                     train.groupby(train['target']).mean()['targe_price_change'],
                     val.groupby(val['target']).mean()['targe_price_change'],
                     test.groupby(test['target']).mean()['targe_price_change']],
             index = ["all_data", "train", "val", "test"]
            )

target,DOWN,STAY,UP
all_data,-5.416298,-0.032407,5.942713
train,-5.692446,-0.038576,6.178306
val,-4.850204,-0.033597,5.717755
test,-5.458181,-0.019418,5.667296


# Baseline

### Training Set

In [15]:
num_train_X = train[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

In [16]:
uni_scaler = StandardScaler()
uni_scaler.fit(num_train_X)
num_train_X = uni_scaler.transform(num_train_X)

In [17]:
train_events = all_events.iloc[train.index].to_numpy()

In [18]:
base_train_y = train[['target']].to_numpy().ravel()

In [19]:
base_train_X = np.concatenate((train_events, num_train_X), axis = 1)

In [20]:
base_model = RandomForestClassifier(max_depth = 10, n_estimators = 2000)

In [22]:
%%time

base_model = base_model.fit(base_train_X, base_train_y)

CPU times: user 34.3 s, sys: 372 ms, total: 34.6 s
Wall time: 34.7 s


In [27]:
with open('../data/processed/base_model.pckl', 'wb') as f:
        pickle.dump(base_model, f)

In [29]:
sum(base_model.predict(base_train_X) == base_train_y) / len(base_train_y)

0.56365303082388

### Validation Set

In [23]:
num_val_X = val[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

In [24]:
num_val_X = uni_scaler.transform(num_val_X)

In [25]:
val_events = all_events.iloc[val.index].to_numpy()

In [26]:
base_val_y = val[['target']].to_numpy().ravel()

In [27]:
base_val_X = np.concatenate((val_events, num_val_X), axis = 1)

In [28]:
sum(base_model.predict(base_val_X) == base_val_y) / len(base_val_y)

0.5105577232040395

# Unigram Features

### Model Selection

In [15]:
def feature_prep_uni(train, val):
    numerics_train = train[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()
    
    scale = StandardScaler()
    scale.fit(numerics_train)
    numerics_train = scale.transform(numerics_train)
    
    train_unigrams = np.array(train['unigram_vec'].values.tolist())
    train_events = all_events.iloc[train.index].to_numpy()
    
    train_y = train[['target']].to_numpy().ravel()
    train_X = np.concatenate((train_events, numerics_train, train_unigrams), axis = 1)
    
    numerics_val = val[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

    numerics_val = scale.transform(numerics_val)
    
    val_unigrams = np.array(val['unigram_vec'].values.tolist())
    val_events = all_events.iloc[val.index].to_numpy()
    
    val_y = val[['target']].to_numpy().ravel()
    val_X = np.concatenate((val_events, numerics_val, val_unigrams), axis = 1)
    
    
    comb_X = np.concatenate((train_X, val_X), axis = 0)
    comb_y = np.concatenate((train_y, val_y), axis = 0)
    split_index = np.concatenate((np.ones(train.shape[0]), np.zeros(val.shape[0])), axis = 0)
    
    return (comb_X, comb_y, split_index)

In [16]:
comb_X, comb_y, split_index = feature_prep_uni(train, val)

In [17]:
pds = PredefinedSplit(test_fold = split_index)

In [23]:
n_estimators = [2000]
max_depth = [10]
max_features = [250, 750, 1500, 'auto']
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)

In [24]:
uni_model = RandomForestClassifier()

In [25]:
uni_search = GridSearchCV(uni_model, param_grid, scoring = 'accuracy', cv = pds, refit = False)

In [None]:
%%time

uni_search.fit(comb_X, comb_y)

In [None]:
uni_search.best_params_

### Train Set

In [29]:
num_train_X = train[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

uni_scaler = StandardScaler()
uni_scaler.fit(num_train_X)
num_train_X = uni_scaler.transform(num_train_X)

In [30]:
train_unigrams = np.array(train['unigram_vec'].values.tolist())
train_events = all_events.iloc[train.index].to_numpy()

In [31]:
uni_train_y = train[['target']].to_numpy().ravel()

In [32]:
uni_train_X = np.concatenate((train_events, num_train_X, train_unigrams), axis = 1)

In [33]:
uni_model_best = RandomForestClassifier(max_depth = 10, n_estimators = 2000, max_features = 1250)

In [34]:
%%time

uni_model_best.fit(uni_train_X, uni_train_y)

CPU times: user 37min 19s, sys: 5.04 s, total: 37min 24s
Wall time: 37min 26s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=1250,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
sum(uni_model_best.predict(uni_train_X) == uni_train_y) / len(uni_train_y)

0.6659218517245331

### Val Set

In [36]:
num_val_X = val[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

num_val_X = uni_scaler.transform(num_val_X)

In [37]:
val_unigrams = np.array(val['unigram_vec'].values.tolist())
val_events = all_events.iloc[val.index].to_numpy()

In [38]:
uni_val_y = val[['target']].to_numpy().ravel()

In [39]:
uni_val_X = np.concatenate((val_events, num_val_X, val_unigrams), axis = 1)

In [40]:
sum(uni_model_best.predict(uni_val_X) == uni_val_y) / len(uni_val_y)

0.5114757860913473

# Phrase Features

### Model Selection

In [None]:
def feature_prep_phrase(train, val):
    numerics_train = train[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()
    
    scale = StandardScaler()
    scale.fit(numerics_train)
    numerics_train = scale.transform(numerics_train)
    
    train_unigrams = np.array(train['top_phrases'].values.tolist())
    train_events = all_events.iloc[train.index].to_numpy()
    
    train_y = train[['target']].to_numpy().ravel()
    train_X = np.concatenate((train_events, numerics_train, train_unigrams), axis = 1)
    
    numerics_val = val[['Surprise(%)', 'price_change_7', 
              'price_change_30', 'price_change_90', 'price_change_365',
              'prev_vix_values']].to_numpy()

    numerics_val = scale.transform(numerics_val)
    
    val_unigrams = np.array(val['top_phrases'].values.tolist())
    val_events = all_events.iloc[val.index].to_numpy()
    
    val_y = val[['target']].to_numpy().ravel()
    val_X = np.concatenate((val_events, numerics_val, val_unigrams), axis = 1)
    
    
    comb_X = np.concatenate((train_X, val_X), axis = 0)
    comb_y = np.concatenate((train_y, val_y), axis = 0)
    split_index = np.concatenate((np.ones(train.shape[0]), np.zeros(val.shape[0])), axis = 0)
    
    return (comb_X, comb_y, split_index)

In [None]:
comb_X, comb_y, split_index = feature_prep_phrase(train, val)

In [None]:
pds = PredefinedSplit(test_fold = split_index)

In [None]:
n_estimators = [2000]
max_depth = [10]
max_features = [250, 750, 1500, 'auto']
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)

In [None]:
phrase_model = RandomForestClassifier()

In [None]:
phrase_search = GridSearchCV(phrase_model, param_grid, scoring = 'accuracy', cv = pds, refit = False)

In [None]:
%%time

phrase_search.fit(comb_X, comb_y)

In [None]:
phrase_search.best_params_

### Train Set

In [41]:
num_train_X = train[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

phrase_scaler = StandardScaler()
phrase_scaler.fit(num_train_X)
num_train_X = phrase_scaler.transform(num_train_X)

In [42]:
train_events = all_events.iloc[train.index].to_numpy()
train_phrase = np.array(train['top_phrases'].values.tolist())

In [43]:
phrase_train_y = train[['target']].to_numpy().ravel()

In [44]:
phrase_train_X = np.concatenate((train_events, num_train_X, train_phrase), axis = 1)

In [45]:
phrase_model_best = RandomForestClassifier(max_depth = 10, n_estimators = 1000, max_features = 1250)

In [46]:
%%time

phrase_model_best.fit(phrase_train_X, phrase_train_y)

CPU times: user 18min 25s, sys: 4.18 s, total: 18min 29s
Wall time: 18min 30s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=1250,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
sum(phrase_model_best.predict(phrase_train_X) == phrase_train_y) / len(phrase_train_y)

0.6234674000229173

### Val Set

In [48]:
num_val_X = val[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

num_val_X = phrase_scaler.transform(num_val_X)

In [49]:
val_events = all_events.iloc[val.index].to_numpy()
val_phrase = np.array(val['top_phrases'].values.tolist())

In [50]:
phrase_val_y = val[['target']].to_numpy().ravel()

In [51]:
phrase_val_X = np.concatenate((val_events, num_val_X, val_phrase), axis = 1)

In [52]:
sum(phrase_model_best.predict(phrase_val_X) == phrase_val_y) / len(phrase_val_y)

0.5109019967867799

# Test Results

In [53]:
num_train_X = train[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

test_scaler = StandardScaler()

num_test_X = test[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

test_scaler.fit(num_train_X)
num_test_X = phrase_scaler.transform(num_test_X)

In [54]:
test_events = all_events.iloc[test.index].to_numpy()

test_unigrams = np.array(test['unigram_vec'].values.tolist())
test_phrase = np.array(test['top_phrases'].values.tolist())

In [55]:
base_test_X = np.concatenate((test_events, num_test_X), axis = 1)
unigram_test_X = np.concatenate((test_events, num_test_X, test_unigrams), axis = 1)
phrase_test_X = np.concatenate((test_events, num_test_X, test_phrase), axis = 1)

In [56]:
test_y = test[['target']].to_numpy().ravel()

In [57]:
test['base_pred'] = base_model.predict(base_test_X)
print(sum(test['base_pred'] == test_y) / len(test_y))

test['unigram_pred'] = uni_model_best.predict(unigram_test_X)
print(sum(test['unigram_pred'] == test_y) / len(test_y))

test['phrase_pred'] = phrase_model_best.predict(phrase_test_X)
print(sum(test['phrase_pred'] == test_y) / len(test_y))

0.5194313882838473
0.525621919064542
0.5261951163590508


In [58]:
test.to_pickle('../data/model_results.pkl')

# Results Analysis

In [None]:
num_train_X = train[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

phrase_scaler = StandardScaler()
phrase_scaler.fit(num_train_X)
num_train_X = phrase_scaler.transform(num_train_X)

In [None]:
num_data_X = val[['Surprise(%)', 'price_change_7', 
               'price_change_30', 'price_change_90', 'price_change_365',
               'prev_vix_values']].to_numpy()

num_data_X = phrase_scaler.transform(num_val_X)

In [55]:
val['uni_pred'] = uni_model_best.predict(uni_val_X)
uni_val_results = val.loc[val['target'] != val['uni_pred']].copy()
uni_val_results = uni_val_results.groupby(val['target']).count()['symbol'] / uni_val_results.shape[0]
uni_val_results = uni_val_results

In [56]:
val['phrase_pred'] = phrase_model_best.predict(phrase_val_X)
phrase_val_results = val.loc[val['target'] != val['phrase_pred']].copy()
phrase_val_results = phrase_val_results.groupby(val['target']).count()['symbol'] / phrase_val_results.shape[0]
phrase_val_results = phrase_val_results

In [57]:
pd.concat([uni_val_results, phrase_val_results], axis = 1)

Unnamed: 0_level_0,symbol,symbol
target,Unnamed: 1_level_1,Unnamed: 2_level_1
DOWN,0.463989,0.475217
STAY,0.420013,0.411869
UP,0.115997,0.112914
