# Natural Language Processing 

## Project 1: Fine-Grained Sentiment Analysis on Financial Microblogs

In [1]:
import os
import os.path as osp
import json
import numpy as np
import pandas as pd
import re
from autocorrect import spell

In [2]:
datadir = 'data/'
savedir = 'models_6/'
if not osp.exists(savedir): os.mkdir(savedir)

In [3]:
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, MaxPooling1D, Flatten
from keras.layers import LSTM, GRU, Conv1D
from keras.layers import Input, Dense, Dropout, BatchNormalization, Activation
from keras.layers.merge import Concatenate
from keras.models import Sequential, Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

Using TensorFlow backend.


## Read Data

In [4]:
with open(datadir + 'training_set.json', 'r') as f:
    training_set = pd.DataFrame(json.load(f))

with open(datadir + 'test_set.json', 'r') as f:
    test_set = pd.DataFrame(json.load(f))
    
with open(datadir + 'NTUSD_Fin_word_v1.0.json', 'r') as f:
    word_dict = pd.DataFrame(json.load(f)).set_index('token')

with open(datadir + 'NTUSD_Fin_hashtag_v1.0.json', 'r') as f:
    hashtag_dict = pd.DataFrame(json.load(f)).set_index('token')

## Preprocessing

In [5]:
train_raw = training_set['tweet'].tolist()
train_Y = np.array(training_set['sentiment'].astype(np.float32).tolist())
train_len = len(train_raw)

In [6]:
test_raw = test_set['tweet'].tolist()
test_Y = np.array(test_set['sentiment'].astype(np.float32).tolist())
test_len = len(test_raw)

** Clean Corpus **

In [7]:
def clean_corpus(corpus):
    for i, data in enumerate(corpus):
        data = re.sub(r'\$[A-Za-z0-9]*[ ,]?', '', data) # remove $ target
        data = re.sub(r'@[a-zA-Z0-9]*', '', data) # remove @ tag
        data = re.sub(r'http.*[a-zA-Z0-9]?', '', data) # remove url
        data = re.sub(r'&#39;', '\'', data) # fix '
        data = re.sub(r'[0-9.,]*[0-9]+', '', data) # remove numbers
        data = re.sub(r'(~?&[a-z]*;)', '', data) # remove Latex
        data = re.sub(r'["$%&()*+\-/:;<=>@[\]^_`{|}~…—\n\t•]|[.]+\.', ' ', data) # remove characters
        data = re.sub(r'#', ' #', data) # split continuous hashtag
        data = re.sub(r' +', ' ', data) # remove space redundancy
        corpus[i] = data
    return corpus

In [8]:
def general_sentiment(corpus):
    S = []
    for data in corpus:
        S_data = 0.0
        sentences = re.split('[.,!?]', re.sub(r'#[A-Za-z]*', '', data))
        for sentence in sentences:
            S_sentence, v = 1., 0.
            words = [w for w in re.split(' ', sentence) if w != '']
            for word in words:
                try:
                    s = word_dict.loc[word.lower()]['market_sentiment']
                    S_sentence *= s
                    v += 1
                except:
                    pass
            if v > 0:
                S_data += np.sign(S_sentence) * np.abs(S_sentence) ** (1/v)
        S.append(S_data)
    return np.array(S)

In [9]:
corpus = clean_corpus(train_raw + test_raw)
senti = general_sentiment(corpus)

train_X = corpus[:train_len]
test_X = corpus[train_len:]
train_S = senti[:train_len]
test_S = senti[train_len:]

## Construct Word Embedding

** Tokenize words **

In [10]:
filters = '!"$%&()*+,-./:;<=>?@[\]^_`{|}~'
tokenizer = Tokenizer(filters=filters)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

In [11]:
with open(savedir + 'word_index.json', 'w') as f:
    json.dump(word_index, f)

** Text to index sequences **

In [12]:
train_Xi = tokenizer.texts_to_sequences(train_X)
test_Xi = tokenizer.texts_to_sequences(test_X)

** Pad index sequences **

In [13]:
train_Xi = pad_sequences(train_Xi)
maxlen = train_Xi.shape[1]
test_Xi = pad_sequences(test_Xi, maxlen=maxlen)

In [14]:
num_words = len(word_index) + 1
emb_dim = 300 + 3
embedding_matrix = np.zeros((num_words, emb_dim))

for (word, index) in word_index.items():
    try:
        if word[0] == '#':
            content = hashtag_dict.loc[word[1:]]    
        else:
            content = word_dict.loc[word]
        
        bear = content['bear_cfidf'] / 100
        bull = content['bull_cfidf'] / 100
        sentiment = content['market_sentiment']
        word_vec = content['word_vec']
        embedding_matrix[index] = np.asarray(word_vec + [bear, bull, sentiment], dtype=np.float32)
    except:
        continue

## Construct Models

### Loss

In [15]:
def f1(Y, Yp):
    thresh = 0.0
    Yp = K.cast(K.greater(Yp, thresh), dtype='float32')
    Y = K.cast(K.greater(Y, thresh), dtype='float32')
    tp = K.sum(Y * Yp)
    
    precision = tp / (K.sum(Yp))
    recall = tp / (K.sum(Y))
    result = 2 * ((precision * recall) / (precision + recall))
    return result

### 1. GRU

In [16]:
in_emb = Input(shape=(maxlen,), name='Input-Text')
in_senti = Input(shape=(1,), name='Input-Sentiment')

x = Embedding(num_words,
              emb_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False,
              name='Embedding')(in_emb)
x = GRU(128, activation='relu', dropout=0.2, name='GRU')(x)
x = Dense(64, activation='relu', name='Dense')(x)
x = Dropout(0.2, name='Dropout')(x)
x = Concatenate(name='Concatenate')([x, in_senti])
out = Dense(1, activation='tanh', name='Tanh')(x)

gru = Model(inputs=[in_emb, in_senti], outputs=out)
gru.compile(optimizer='adam', loss='mse', metrics=[f1])
#gru.summary()

** Train GRU **

In [17]:
model_path = savedir + 'GRU.h5'

es = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')
cp = ModelCheckpoint(monitor='val_loss', save_best_only=True, save_weights_only=False,
                     mode='min', filepath=model_path)

history = gru.fit([train_Xi, train_S], train_Y, validation_data=([test_Xi, test_S], test_Y),
                  epochs=30, verbose=1, batch_size=32, callbacks=[es, cp])

hist = history.history

Train on 1396 samples, validate on 634 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 00016: early stopping


** Save GRU Model **

In [18]:
best_val_mse = np.min(hist['val_loss'])
print('Best MSE:', best_val_mse)
gru_path = savedir + 'GRU_%.4f.h5' % best_val_mse
if osp.exists(model_path): os.rename(model_path, gru_path)

Best MSE: 0.077510135454


### 2. LSTM

In [19]:
in_emb = Input(shape=(maxlen,), name='Input')
in_senti = Input(shape=(1,), name='Input-Sentiment')

x = Embedding(num_words,
              emb_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False,
              name='Embedding')(in_emb)
x = LSTM(128, activation='relu', dropout=0.2, name='LSTM')(x)
x = Dense(64, activation='relu', name='Dense')(x)
x = Dropout(0.2, name='Dropout')(x)
x = Concatenate(name='Concatenate')([x, in_senti])
out = Dense(1, activation='tanh', name='Tanh')(x)

lstm = Model(inputs=[in_emb, in_senti], outputs=out)
lstm.compile(optimizer='adam', loss='mse', metrics=[f1])
#lstm.summary()

** Train LSTM **

In [20]:
model_path = savedir + 'LSTM.h5'

es = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')
cp = ModelCheckpoint(monitor='val_loss', save_best_only=True, save_weights_only=False,
                     mode='min', filepath=model_path)

history = lstm.fit([train_Xi, train_S], train_Y, validation_data=([test_Xi, test_S], test_Y),
                   epochs=30, verbose=1, batch_size=32, callbacks=[es, cp])

hist = history.history

Train on 1396 samples, validate on 634 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 00020: early stopping


** Save LSTM Model **

In [21]:
best_val_mse = np.min(hist['val_loss'])
print('Best MSE:', best_val_mse)
lstm_path = savedir + 'LSTM_%.4f.h5' % best_val_mse
if osp.exists(model_path): os.rename(model_path, lstm_path)

Best MSE: 0.0709879824228


### 3. Conv1D

In [34]:
in_emb = Input(shape=(maxlen,), name='Input')
in_senti = Input(shape=(1,), name='Input-Sentiment')

x = Embedding(num_words,
              emb_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False,
              name='Embedding')(in_emb)
x = Conv1D(128, 3, activation='relu', name='Conv1D')(x)
x = Flatten()(x)
x = Dense(64, activation='relu', name='Dense')(x)
x = Dropout(0.2, name='Dropout')(x)
x = Concatenate(name='Concatenate')([x, in_senti])
out = Dense(1, activation='tanh', name='Tanh')(x)

conv = Model(inputs=[in_emb, in_senti], outputs=out)
conv.compile(optimizer='adam', loss='mse', metrics=[f1])
conv.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
Input (InputLayer)               (None, 29)            0                                            
____________________________________________________________________________________________________
Embedding (Embedding)            (None, 29, 303)       1092012     Input[0][0]                      
____________________________________________________________________________________________________
Conv1D (Conv1D)                  (None, 27, 128)       116480      Embedding[0][0]                  
____________________________________________________________________________________________________
flatten_3 (Flatten)              (None, 3456)          0           Conv1D[0][0]                     
___________________________________________________________________________________________

** Train Conv **

In [23]:
model_path = savedir + 'CONV.h5'

es = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')
cp = ModelCheckpoint(monitor='val_loss', save_best_only=True, save_weights_only=False,
                     mode='min', filepath=model_path)

history = conv.fit([train_Xi, train_S], train_Y, validation_data=([test_Xi, test_S], test_Y),
                   epochs=30, verbose=1, batch_size=32, callbacks=[es, cp])

hist = history.history

Train on 1396 samples, validate on 634 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 00020: early stopping


** Save Conv Model **

In [24]:
best_val_mse = np.min(hist['val_loss'])
print('Best MSE:', best_val_mse)
conv_path = savedir + 'CONV_%.4f.h5' % best_val_mse
if osp.exists(model_path): os.rename(model_path, conv_path)

Best MSE: 0.0735768427138


## Test

In [25]:
def compute_mse(Y, Yp):
    return np.mean((Y - Yp) ** 2)

def classify_sentiments(Y, thres):
    res = np.zeros(len(Y))
    res[Y > thres] = 1
    res[Y < thres] = -1
    res[(res != 1) & (res != -1)] = 0
    return res

def compute_acc(Y, Yp, thres, Cgt=None):
    if Cgt is None: Cgt = classify_sentiments(Y, thres)
    Cp = classify_sentiments(Yp, thres)
    
    true = (Cgt == Cp)
    tp, tn = true & (Cgt > 0), true & (Cgt < 0)
    
    acc = np.sum(true) / len(Y)
    
    precision = np.sum(tp) / np.sum(Cp[Cp > 0])
    recall = np.sum(tp) / np.sum(Cgt[Cgt > 0])
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return [acc, f1]

** Load Best Model **

In [26]:
gru = load_model(gru_path, custom_objects={'f1': f1})
test_gru = gru.predict([test_Xi, test_S]).flatten()

lstm = load_model(lstm_path, custom_objects={'f1': f1})
test_lstm= lstm.predict([test_Xi, test_S]).flatten()

conv = load_model(conv_path, custom_objects={'f1': f1})
test_conv = conv.predict([test_Xi, test_S]).flatten()

** Bullish / Bearish / Neutral **

In [27]:
thres = 0.0
Cgt = classify_sentiments(test_Y, thres)

res = []
res.append(['GRU'] + compute_acc(test_Y, test_gru, thres, Cgt))
res.append(['LSTM'] + compute_acc(test_Y, test_lstm, thres, Cgt))
res.append(['CONV'] + compute_acc(test_Y, test_conv, thres, Cgt))

res_df = pd.DataFrame(res, columns=['Model', 'Accuracy', 'F1 score'])
res_df

Unnamed: 0,Model,Accuracy,F1 score
0,GRU,0.804416,0.854742
1,LSTM,0.826498,0.875878
2,CONV,0.802839,0.856132


## Ensemble

In [28]:
def ensemble(model_paths):
    test_ens = np.zeros(test_len)
    for path in model_paths:
        print('model: %s' % path)
        model = load_model(path, custom_objects={'f1': f1})
        test_ens += model.predict([test_Xi, test_S]).flatten()
    return test_ens / len(model_paths)

In [29]:
model_list = [gru, lstm, conv]
model_paths = [gru_path, lstm_path, conv_path]

#test_ens = (test_gru + test_lstm + test_conv) / 3
test_ens = ensemble(model_paths)
mse_ens = compute_mse(test_Y, test_ens)
acc_ens, f1_ens = compute_acc(test_Y, test_ens, thres, Cgt)
res_string = 'mse: %.4f, acc: %.4f, f1: %.4f' % (mse_ens, acc_ens, f1_ens)
with open(savedir + 'res.txt', 'w') as f:
    f.write(res_string)
    print(res_string)

model: models_6/GRU_0.0775.h5
model: models_6/LSTM_0.0710.h5
model: models_6/CONV_0.0736.h5
mse: 0.0673, acc: 0.8312, f1: 0.8790


** Show difference **

In [30]:
diff = pd.DataFrame([])
diff['tweet'] = pd.Series(corpus[-test_len:])
diff['snippet'] = test_set['snippet']
diff['ground truth'] = pd.Series(test_Y)
diff['prediction'] = pd.Series(test_ens)
diff['Correct?'] = pd.Series(['x' if d else 'o' for d in np.abs(np.sign(test_Y) - np.sign(test_ens))])

In [31]:
pd.set_option('display.max_rows', 1000)
diff

Unnamed: 0,tweet,snippet,ground truth,prediction,Correct?
0,ooks pretty bullish for now. from a short term...,ooks pretty bullish for now,0.323,-0.005806,x
1,"looks really interesting on drop, grabbed some...","[looks really interesting on drop, grabbed som...",0.579,0.416405,o
2,covered some shorts for pts,covered some shorts,0.294,-0.039397,x
3,Watching W triple top forming.,triple top forming.,0.028,0.302616,o
4,Whole Foods shareholders vote down activist in...,Whole Foods shareholders vote down activist in...,-0.076,0.229382,x
5,has returned to my short watchlist. Still too ...,Still too early for an entry,-0.387,0.171539,x
6,Today I bought more,Today I bought more,0.087,0.165196,o
7,Short Setups Looking Nice Really Nice,Short Setups Looking Nice....Really Nice,-0.464,-0.4676,o
8,Tesla is recalling Model X cars,"Tesla is recalling 2,700 Model X cars",-0.291,-0.316396,o
9,Thank you Google Alphabet and Facebook stocks!...,What a nice reversal.,0.318,0.187781,o
