In [1]:
import pandas as pd
import numpy as np
import pickle 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 1. load & prepare data

In [2]:
# Load the processed dataset from a pickle file into a DataFrame.
# NOTE: pd.read_pickle unpickles arbitrary objects, so only load files from a
# trusted source.
# all_data = pickle.load(open('../data/processed/feature_encoded_merged_data.pkl', 'rb'))
all_data = pd.read_pickle('../data/processed/feature_encoded_merged_data.pkl')

# Show (rows, columns) as a quick sanity check on the load.
all_data.shape

(34891, 19)

In [3]:
# Preview the first two rows of the loaded frame.
all_data.head(2)

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,unigram_vec,phrase_vec
0,2003-11-06,163525,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:CNL/CNL-8K-20031106163525.t...,CNL,14.29,0.48,0.42,16.35,False,370,1.43,2.81,14.2,14.97,-0.122546,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
1,2004-03-09,170611,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:CNL/CNL-8K-20040309170611.t...,CNL,-150.0,-0.06,0.12,17.06,False,494,-0.58,2.48,11.09,78.98,-1.036715,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [4]:
# List the column names of the loaded frame.
all_data.columns

Index(['date', 'time', 'event_type', 'cleaned_event', 'full_text', 'symbol',
       'Surprise(%)', 'Reported EPS', 'Consensus EPS', 'hr', 'pre_market',
       'date_idx', 'price_change_7', 'price_change_30', 'price_change_90',
       'price_change_365', 'targe_price_change', 'unigram_vec', 'phrase_vec'],
      dtype='object')

In [5]:
# def find_label(price_change):
#     if price_change > 1:
#         return 2
#     elif price_change > -1:
#         return 1
#     else:
#         return 0

def find_label(target):
    """Encode a textual target class as an integer label.

    Returns 2 for 'UP', 1 for 'STAY', and 0 for every other value
    (including unexpected or missing values).
    """
    if target == 'UP':
        return 2
    return 1 if target == 'STAY' else 0

# Derive the integer class label from the textual `target` column.
all_data['label'] = all_data['target'].map(find_label)

In [6]:
# def clean_event_type_2(e):
#     result = []
#     for event in e:
#         cleaned = event.replace('2.02', '').strip()
#         if cleaned != '' and cleaned not in result:
#             result.append(cleaned)
#     return result
# all_data.cleaned_event = all_data.cleaned_event.apply(clean_event_type_2)

# all_event_type = []
# for i in train_df.cleaned_event.values:
#     for e in i:
#         if e not in all_event_type:
#             all_event_type.append(e)

In [7]:
# Turn the per-row list of event types into one 0/1 indicator column per
# event type, then attach those columns to the main frame by index.
mlb = MultiLabelBinarizer()
all_events = pd.DataFrame(
    mlb.fit_transform(all_data['cleaned_event']),
    index=all_data.index,
    columns=mlb.classes_,
)
all_data = all_data.merge(all_events, right_index=True, left_index=True)

In [8]:
# Split the frame into train / validation / test using the `dataset` column,
# giving each split a fresh 0..n-1 index.
train_df, val_df, test_df = (
    all_data[all_data['dataset'] == split].reset_index(drop=True)
    for split in ('train', 'val', 'test')
)

In [9]:
# (rows, columns) of the train, validation and test splits.
train_df.shape, val_df.shape, test_df.shape

((17454, 57), (8714, 57), (8723, 57))

# 2. Train

In [10]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_classification

## 2.1. Baseline 1

In [25]:
# Baseline 1: the earnings surprise percentage is the only input feature.
X_train, X_val, X_test = (
    frame[['Surprise(%)']] for frame in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    frame['label'].values for frame in (train_df, val_df, test_df)
)

In [26]:
# Linear SVM on the single-feature baseline; `fit` returns the pipeline itself,
# so the fitted estimator is both stored in `clf` and displayed.
clf = make_pipeline(LinearSVC(C=0.1, tol=1e-5, random_state=0)).fit(X_train, y_train)
clf



Pipeline(memory=None,
         steps=[('linearsvc',
                 LinearSVC(C=0.1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=0,
                           tol=1e-05, verbose=0))],
         verbose=False)

In [27]:
# Training accuracy: fraction of training rows predicted correctly.
(clf.predict(X_train) == y_train).mean()

0.49203620946487914

In [28]:
# Validation accuracy: fraction of validation rows predicted correctly.
(clf.predict(X_val) == y_val).mean()

0.4757860913472573

## 2.2. Baseline 2

In [11]:
# Baseline 2 feature set: numeric columns plus one indicator column per event
# type discovered by the MultiLabelBinarizer.
num_feat = [
    'Surprise(%)',
    'price_change_7',
    'price_change_30',
    'price_change_90',
    'price_change_365',
    'prev_vix_values',
]
cat_feat = [*mlb.classes_]
baseline2_features = [*num_feat, *cat_feat]

# Slice the same feature columns out of every split and pull the labels.
X_train, X_val, X_test = (
    frame[baseline2_features] for frame in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    frame.label.values for frame in (train_df, val_df, test_df)
)

In [12]:

# Numeric columns are passed through unchanged. FunctionTransformer() with no
# `func` is the identity transform; unlike a lambda it can be pickled, so the
# fitted ColumnTransformer can be saved alongside the model.
# NOTE(review): the step is named "scaler" but performs no scaling -- confirm
# whether StandardScaler was intended for the LinearSVC models below.
num_trans = Pipeline(steps=[("scaler", FunctionTransformer())])

# One-hot encode the event indicator columns; categories unseen during fit are
# encoded as all zeros instead of raising.
cat_trans = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
col_trans = ColumnTransformer(
    transformers=[
        ("num", num_trans, num_feat),
        ("cat", cat_trans, cat_feat),
    ]
)

# Fit on the training split only. The frame is sliced from train_df directly
# (rather than via X_train) so this cell still works after X_train has been
# replaced by the transformed array further down.
col_trans.fit(train_df[baseline2_features])

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('scaler',
                                                  FunctionTransformer(accept_sparse=False,
                                                                      check_inverse=True,
                                                                      func=<function <lambda> at 0x7f0804915cb0>,
                                                                      inv_kw_args=None,
                                                                      inverse_func=None,
                                                                      kw_args=None,
                                                                      validate=False))],
                                          verbose=False),
                           

In [13]:
# Apply the fitted ColumnTransformer to every split.
# The inputs are sliced from the source frames instead of reusing X_train /
# X_val / X_test, so re-running this cell does not feed already-transformed
# arrays back into the transformer.
X_train = col_trans.transform(train_df[baseline2_features])
X_val = col_trans.transform(val_df[baseline2_features])
X_test = col_trans.transform(test_df[baseline2_features])

In [14]:
# Linear SVM on the baseline 2 feature matrix; `fit` returns the pipeline
# itself, so the fitted estimator is both stored in `clf` and displayed.
clf = make_pipeline(LinearSVC(C=0.01, tol=1e-5, random_state=0)).fit(X_train, y_train)
clf



Pipeline(memory=None,
         steps=[('linearsvc',
                 LinearSVC(C=0.01, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=0,
                           tol=1e-05, verbose=0))],
         verbose=False)

In [15]:
# Training accuracy: fraction of training rows predicted correctly.
(clf.predict(X_train) == y_train).mean()

0.47725449753638133

In [16]:
# Validation accuracy: fraction of validation rows predicted correctly.
(clf.predict(X_val) == y_val).mean()

0.48232728941932523

In [17]:
# Shape of the transformed training matrix.
X_train.shape

(17454, 69)

In [18]:
# Inspect the transformed training matrix.
X_train

array([[ -3.13,  -0.5 ,   2.57, ...,   0.  ,   1.  ,   0.  ],
       [  0.  ,   0.16,   0.39, ...,   0.  ,   1.  ,   0.  ],
       [  0.  ,   0.53,   0.61, ...,   0.  ,   1.  ,   0.  ],
       ...,
       [  6.25,  -2.26,  -4.17, ...,   0.  ,   1.  ,   0.  ],
       [-11.76,  -3.96, -23.52, ...,   0.  ,   1.  ,   0.  ],
       [ 75.  ,  14.02,  12.5 , ...,   0.  ,   1.  ,   0.  ]])

## 2.3 unigram model

In [19]:
# Stack the per-row unigram vectors of each split into one array per split.
uni_X_train = np.array([np.asarray(vec) for vec in train_df['unigram_vec']])
uni_X_val = np.array([np.asarray(vec) for vec in val_df['unigram_vec']])
uni_X_test = np.array([np.asarray(vec) for vec in test_df['unigram_vec']])

In [20]:
# Append the unigram columns to the right of the baseline 2 feature columns.
X_train_concat = np.hstack((X_train, uni_X_train))
X_val_concat = np.hstack((X_val, uni_X_val))
X_test_concat = np.hstack((X_test, uni_X_test))

In [41]:
# Fit the unigram model. Without this fit, `clf` is still the baseline 2 model
# trained on X_train, whose column count differs from X_train_concat, so the
# accuracy cells below cannot run on a fresh kernel.
clf = make_pipeline(LinearSVC(random_state=0, tol=1e-5, C = 0.001))
clf.fit(X_train_concat, y_train)

In [None]:
# Training accuracy on the baseline + unigram feature matrix.
(clf.predict(X_train_concat) == y_train).mean()

In [28]:
# Validation accuracy on the baseline + unigram feature matrix.
(clf.predict(X_val_concat) == y_val).mean()

0.44365389029148494

## 2.4 phrase model - svc linear kernel

In [20]:
# Stack the per-row phrase vectors of each split into one array per split.
phrase_X_train = np.array([np.asarray(vec) for vec in train_df['phrase_vec']])
phrase_X_val = np.array([np.asarray(vec) for vec in val_df['phrase_vec']])
phrase_X_test = np.array([np.asarray(vec) for vec in test_df['phrase_vec']])

In [21]:
# Append the phrase columns to the right of the baseline 2 feature columns.
# This rebinds the *_concat names previously used by the unigram model.
X_train_concat = np.hstack((X_train, phrase_X_train))
X_val_concat = np.hstack((X_val, phrase_X_val))
X_test_concat = np.hstack((X_test, phrase_X_test))

In [22]:
# clf = make_pipeline(LinearSVC(random_state=0, tol=1e-5, C = 0.1))
# clf.fit(X_train_concat, y_train)

# Sweep the regularisation strength C for the linear SVM and report train and
# validation accuracy for each value. After the loop `clf` holds the model
# fitted with the last C in the list.
for c in [0.0001, 0.001, 0.01, 1, 10]:
    print('=========')
    print('C =', c)
    clf = make_pipeline(LinearSVC(C=c, tol=1e-5, random_state=0)).fit(X_train_concat, y_train)
    train_acc = (clf.predict(X_train_concat) == y_train).mean()
    val_acc = (clf.predict(X_val_concat) == y_val).mean()
    print('Train acc:', round(train_acc, 4))
    print('Val acc:', round(val_acc, 4))
    print()



C = 0.0001




Train acc: 0.4676
Val acc: 0.4193

C = 0.001




Train acc: 0.5301
Val acc: 0.434

C = 0.01




Train acc: 0.5993
Val acc: 0.4334

C = 1




Train acc: 0.601
Val acc: 0.3945

C = 10




Train acc: 0.5319
Val acc: 0.3358



- Phrase model 2: SVC with RBF kernel

In [23]:
from sklearn.svm import SVC

In [45]:
# RBF-kernel SVM on the baseline + phrase feature matrix; `fit` returns the
# pipeline itself, so the fitted estimator is both stored in `clf` and displayed.
clf = make_pipeline(SVC(C=0.1, kernel='rbf', random_state=0)).fit(X_train_concat, y_train)
clf

Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=0, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [46]:
# Training accuracy of the RBF-kernel model.
(clf.predict(X_train_concat) == y_train).mean()

0.4391543485733929

In [47]:
# Validation accuracy of the RBF-kernel model.
(clf.predict(X_val_concat) == y_val).mean()

0.42609593757172365

In [None]:
# Sweep the regularisation strength C for the RBF-kernel SVM and report train
# and validation accuracy for each value. After the loop `clf` holds the model
# fitted with the last C in the list.
for c in [0.0001, 0.001, 0.01, 1, 10]:
    print('=========')
    print('C =', c)
    clf = make_pipeline(SVC(C=c, kernel='rbf', random_state=0)).fit(X_train_concat, y_train)
    train_acc = (clf.predict(X_train_concat) == y_train).mean()
    val_acc = (clf.predict(X_val_concat) == y_val).mean()
    print('Train acc:', round(train_acc, 4))
    print('Val acc:', round(val_acc, 4))
    print()

C = 0.0001
Train acc: 0.414
