# Final Report

In [None]:
import json
import numpy as np
import pandas as pd
import datetime as dt
from scipy.stats import t
from scipy import interp
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')

In [None]:
# Load the notebook configuration. The context manager guarantees the file
# handle is closed (the bare json.load(open(...)) form leaves it open until
# garbage collection).
with open('../config/notebook.json', 'r') as config_file:
    notebook_config = json.load(config_file)

# The 'testing' flag switches every data path below to the test fixtures.
if notebook_config['testing']:
    data_dir = '../test/'
else:
    data_dir = '../data/'

# 1. Result Analysis

In [None]:
# Load the pickled model results. The cells below read the columns 'target',
# 'base_pred', 'unigram_pred' and 'phrase_pred' from this frame.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project's own pipeline.
test_data = pd.read_pickle(data_dir + 'tmp/model_results.pkl')

## 1.1 Accuracy

In [None]:
# Overall accuracy per model: the fraction of rows whose prediction equals the
# target. The comparison is vectorised (.mean() of a boolean Series) rather
# than iterating the Series with the builtin sum(); an empty frame yields nan
# instead of raising ZeroDivisionError.
for display_name, pred_col in [('baseline', 'base_pred'),
                               ('unigram', 'unigram_pred'),
                               ('phrase', 'phrase_pred')]:
    model_accuracy = float((test_data['target'] == test_data[pred_col]).mean())
    print(display_name + ' accuracy: ' + str(model_accuracy))

- **Note**:
    - Adding text info did improve the performance of the models
    - Phrase models and unigram models are not as different as we expected, maybe because there are many high-quality unigrams as well.

In [None]:
# Split the results by the class the *phrase model predicted* (not by the true
# target); each subset is an independent copy.
# NOTE(review): the per-class table built from these subsets therefore measures
# accuracy conditional on the phrase model's prediction -- confirm this is the
# intended grouping.
up_data, stay_data, down_data = (
    test_data.loc[test_data['phrase_pred'] == predicted_class].copy()
    for predicted_class in ('UP', 'STAY', 'DOWN')
)

In [None]:
labels = [up_data, stay_data, down_data]
models = ['base_pred', 'unigram_pred', 'phrase_pred']

# accuracy[i][j] is the fraction of rows in labels[j] where models[i] matches
# the target; an empty subset is reported as 0 instead of dividing by zero.
accuracy = [
    [
        int((subset['target'] == subset[pred_col]).sum()) / len(subset)
        if len(subset) != 0 else 0
        for subset in labels
    ]
    for pred_col in models
]

In [None]:
# One column per model, one row per subset (same order as `labels`).
accuracy_by_model = dict(zip(('baseline', 'unigram', 'phrase'), accuracy))
pd.DataFrame(accuracy_by_model, index=['UP', 'STAY', 'DOWN'])

- **Note**: Each row is the subset of test samples that the phrase model predicted as that class. On the `STAY` subset, our phrase model performs much better than the baseline and unigram models!

# 2. Feature Importance

In [None]:
data = pd.read_pickle(data_dir + 'processed/feature_encoded_merged_data.pkl')

# Partition rows by the value of the 'dataset' column; each split is a copy.
split_column = data['dataset']
train = data[split_column == 'train'].copy()
val = data[split_column == 'val'].copy()
test = data[split_column == 'test'].copy()

# Text feature vocabularies used to label the importance tables below.
unigrams = pd.read_csv(data_dir + 'processed/model_unigrams.csv')
phrases = pd.read_csv(
    data_dir + 'financial_phrases_sample.txt',
    sep='\t',
    header=None,
)

In [None]:
# Multi-hot encode the 'cleaned_event' column: one indicator column per event
# class, with rows kept on the same index as `data`.
mlb = MultiLabelBinarizer()
event_indicators = mlb.fit_transform(data['cleaned_event'])

all_events = pd.DataFrame(
    event_indicators,
    index=data['cleaned_event'].index,
    columns=mlb.classes_,
)

In [None]:
def compute_feature_importance(all_data, train, test, **kwargs):
    """Fit a random forest on the training split and return its feature importances.

    The feature matrix is built column-wise, in this order: multi-hot event
    indicators, standardised numerical features, then the text vector selected
    by ``train_type``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Full data set; its 'cleaned_event' column is used to fit the
        MultiLabelBinarizer so the event columns cover every split.
    train : pandas.DataFrame
        Training rows. Must carry index labels that exist in ``all_data`` (as
        produced by filtering ``all_data``), plus the numerical columns,
        'target', and 'unigram_vec' / 'phrase_vec'.
    test : pandas.DataFrame
        Unused; kept so existing callers keep working.
    **kwargs
        train_type : {'unigram', 'phrase'}
            Required. Selects which text vector column is appended.
        max_features
            Required. Forwarded to ``RandomForestClassifier(max_features=...)``.
        random_state, n_jobs
            Optional. Forwarded to ``RandomForestClassifier``; both default to
            ``None``, which matches the classifier's own defaults.

    Returns
    -------
    numpy.ndarray
        ``feature_importances_`` of the fitted forest.

    Raises
    ------
    KeyError
        If 'train_type' or 'max_features' is missing from ``kwargs``.
    ValueError
        If ``train_type`` is not one of the supported values.
    """
    text_columns = {'unigram': 'unigram_vec', 'phrase': 'phrase_vec'}

    # Validate up front: previously an unknown train_type fell through both
    # branches and failed at the return statement with an unbound `model`.
    train_type = kwargs['train_type']
    if train_type not in text_columns:
        raise ValueError(
            'train_type must be one of {}, got {!r}'.format(sorted(text_columns), train_type)
        )
    max_features = kwargs['max_features']

    numerical_columns = ['Surprise(%)', 'price_change_7',
                         'price_change_30', 'price_change_90', 'price_change_365',
                         'prev_vix_values']
    num_train = StandardScaler().fit_transform(train[numerical_columns].to_numpy())

    mlb = MultiLabelBinarizer()
    all_events = pd.DataFrame(mlb.fit_transform(all_data['cleaned_event']),
                              columns = mlb.classes_,
                              index = all_data.index)
    # Select the training rows by index *label*. The earlier positional lookup
    # (.iloc with index labels) was only correct for a default RangeIndex.
    train_events = all_events.loc[train.index].to_numpy()

    train_y = train['target'].to_numpy()

    # Each cell of the text column holds one vector; stack them into a 2-D array.
    train_text = np.array(train[text_columns[train_type]].values.tolist())
    train_X = np.concatenate((train_events, num_train, train_text), axis = 1)

    model = RandomForestClassifier(max_depth = 10,
                                   n_estimators = 2000,
                                   max_features = max_features,
                                   random_state = kwargs.get('random_state'),
                                   n_jobs = kwargs.get('n_jobs'))
    model.fit(train_X, train_y)

    return model.feature_importances_

In [None]:
# Value passed as RandomForestClassifier(max_features=...): 50 when the
# notebook runs in testing mode, 1250 otherwise.
max_features = 50 if notebook_config['testing'] else 1250

In [None]:
# %%time
# (The cell magic above is disabled; uncomment it to time this cell.)
# Fits a 2000-tree random forest on events + numerical + unigram features and
# keeps its feature importances.

uni_importance = compute_feature_importance(data, train, test, train_type = 'unigram', max_features = max_features)

In [None]:
# %%time
# (The cell magic above is disabled; uncomment it to time this cell.)
# Fits a 2000-tree random forest on events + numerical + phrase features and
# keeps its feature importances.

phrase_importance = compute_feature_importance(data, train, test, train_type = 'phrase', max_features = max_features)

In [None]:
# Feature names for the first two blocks of the model input, in the same order
# compute_feature_importance concatenates them: event indicators, then numericals.
numerical_columns = [
    'Surprise(%)',
    'price_change_7',
    'price_change_30',
    'price_change_90',
    'price_change_365',
    'prev_vix_values',
]
events = np.array(all_events.columns)
numerical = np.array(numerical_columns)

## 2.1 Unigram Model Feature Importance

In [None]:
# Pair every model input column (events, numericals, unigrams) with its importance.
uni_feature_names = np.concatenate((events, numerical, unigrams.values.ravel()))
uni_feature_importance = pd.DataFrame({
    'feature': uni_feature_names,
    'importance': uni_importance,
})

# Show the 20 most important features.
(
    uni_feature_importance
    .sort_values(by = 'importance', ascending = False)
    .head(20)
    .reset_index(drop = True)
)

## 2.2 Phrase Model Feature Importance

In [None]:
# Build the full list of feature names in model-input order: events, numericals,
# then phrases (column 1 of the tab-separated phrase file).
phrase_feature_names = np.concatenate((events, numerical, phrases[1].values.ravel()))

# The phrase file can list a different number of phrases than the model used,
# so trim both sides to their common length. Previously the names were only
# built when the file had more phrases than the model had features; otherwise
# `phrase_features` was never assigned and the DataFrame below raised NameError.
# NOTE(review): trimming assumes the phrase vector uses the first phrases of
# the file, in file order -- confirm against the feature-encoding step.
n_phrase_features = min(len(phrase_feature_names), len(phrase_importance))
phrase_features = phrase_feature_names[:n_phrase_features]

phrase_feature_importance = pd.DataFrame({'feature': phrase_features,
                                          'importance': phrase_importance[:n_phrase_features]})
phrase_feature_importance.sort_values(by = 'importance', ascending = False).head(20).reset_index(drop = True)

- **Summary**: As the two tables above show, the most important features in our phrase model are quite similar to those in the unigram model. In fact, none of the text features made it into the top 5 :/