In [1]:
import tensorflow as tf
import pandas as pd

import nltk
import numpy as np

import tensorflow_hub as hub
import tensorflow_text as text


In [2]:
# Load the financial sentiment CSV. Column names are supplied explicitly via
# `names`, and bytes that are not valid UTF-8 are substituted instead of raising
# (encoding_errors="replace").
# NOTE(review): if the file is not actually UTF-8 encoded, non-ASCII characters
# in the text will be silently replaced — confirm the file's real encoding.
df = pd.read_csv("financial_data_kaggle.csv",
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

In [3]:
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
df["sentiment"].value_counts()

neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64

In [5]:
# Boolean mask that is True for rows labelled "positive" or "negative"
# (every other label, e.g. "neutral", is False). Uses the vectorized
# Series.isin instead of a per-element Python lambda.
filter_ = df["sentiment"].isin(["positive", "negative"])

In [6]:
# Keep only the rows selected by `filter_` (positive / negative sentiment).
# TODO: handle the neutral rows later — one idea is to give them a target of 0.5.
# .copy() produces an independent frame so the column assignments made further
# down do not operate on a slice of the original frame.
df = df[filter_].copy()

In [7]:
df["sentiment"].value_counts()

positive    1363
negative     604
Name: sentiment, dtype: int64

In [8]:
# Integer encoding for the two remaining sentiment labels:
# "positive" becomes 1 and "negative" becomes 0.
mapping_ = {
    "positive": 1,
    "negative": 0,
}

In [9]:
df["sentiment"] = df["sentiment"].map(mapping_)

In [10]:
def preprocess_text(input_text: str) -> str:
    """Lower-case ``input_text`` and apply a fixed series of substring replacements.

    The replacements are applied one after another, each on the output of the
    previous one, so their order matters (for example a backtick is first
    turned into an apostrophe, and a space followed by an apostrophe is then
    turned into a space followed by a double quote).

    Args:
        input_text: The raw text to normalise.

    Returns:
        The lower-cased text with every replacement applied.
    """
    # (old, new) pairs, listed in the exact order they must be applied.
    substitutions = (
        ('<br />', ' '),
        ('`', "'"),
        ('´', "'"),
        (" '", ' "'),
        ("-", " - "),
        ("/", " "),
        ("_", " "),
    )
    cleaned = input_text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [11]:
df["text"] = df["text"].map(preprocess_text)

In [12]:
tokenizer_func = nltk.word_tokenize

In [13]:
def texts_to_sequences(corpus, vocabulary_index, tok_func, oov_index=1):
    """Convert every document in ``corpus`` into a list of vocabulary indices.

    Args:
        corpus: Iterable of strings, one per document.
        vocabulary_index: Mapping from token to integer index; only its
            ``.get(token, default)`` method is used.
        tok_func: Callable that splits one document string into tokens.
        oov_index: Index assigned to tokens that are missing from
            ``vocabulary_index``. Defaults to 1, the value that was
            previously hard-coded.

    Returns:
        A list with one inner list of indices per document, in corpus order.
        An empty corpus yields an empty list.
    """
    # The loop variable is named `document` rather than `text` so it does not
    # shadow the `text` alias this notebook gives to tensorflow_text.
    return [
        [vocabulary_index.get(token, oov_index) for token in tok_func(document)]
        for document in corpus
    ]

In [14]:
import json

# Load the vocabulary (token -> index mapping used by texts_to_sequences).
# The encoding is given explicitly so decoding does not depend on the
# platform's default text encoding.
with open('vocab_index.json', 'r', encoding='utf-8') as fp:
    vocabulary_index = json.load(fp)

In [15]:
model = tf.keras.models.load_model('glove_word_averaging_model.keras')

In [16]:
corpus = df["text"]

In [17]:
X_eval = texts_to_sequences(corpus, vocabulary_index, tokenizer_func)

In [18]:
# Bring every index sequence to a common length of 512 so the batch is rectangular.
# NOTE(review): 512 presumably matches the sequence length the model was
# trained with — confirm against the training notebook.
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=512)

In [19]:
predictions = model.predict(X_eval)



In [20]:
df["preds"] = predictions

In [21]:
df.head()

Unnamed: 0,sentiment,text,preds
2,0,the international electronic industry company ...,0.673213
3,1,with the new production plant the company woul...,0.715204
4,1,"according to the company ""s updated strategy f...",0.194155
5,1,"financing of aspocomp ""s growth aspocomp is ag...",0.487524
6,1,"for the last quarter of 2010 , componenta ""s n...",0.817759


In [22]:
# Threshold the predicted scores at 0.5 to obtain hard 0/1 labels; astype is
# the vectorized way to turn the boolean mask into integers.
df["pred_discrete"] = (df["preds"] > 0.5).astype(int)

In [23]:
print("accuracy is", np.mean(df["pred_discrete"] == df["sentiment"]))

accuracy is 0.5892221657346213


In [None]:
# Download the model archive from the Dropbox link I shared
# and extract it with: tar -xvf fname.tar.gz
# The load call below should point at the directory
# produced by the extraction.

In [24]:
# Load the SavedModel directory (named as a BERT model trained on IMDB).
# Note: this rebinds `model`, replacing the Keras model loaded earlier in the
# notebook, so the earlier prediction cells must not be re-run after this one.
model = tf.saved_model.load('bert_trained_on_imdb.saved_model')

In [25]:
def batchify(lst, batch_size=16):
    """Yield consecutive slices of ``lst`` holding at most ``batch_size`` items.

    Args:
        lst: Any object that supports ``len()`` and slice indexing.
        batch_size: Maximum number of items per slice; must be a positive
            integer.

    Yields:
        ``lst[start:start + batch_size]`` for ``start`` = 0, ``batch_size``,
        2 * ``batch_size``, ... The final slice may be shorter. Nothing is
        yielded for an empty ``lst``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would never advance past the end of `lst`, so the
    # generator would otherwise loop forever.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(lst), batch_size):
        yield lst[start:start + batch_size]

In [31]:
batches = list(batchify(corpus, batch_size=32))

In [32]:
from tqdm import tqdm

# Run the loaded model one batch at a time and collect a flat array of
# outputs per batch; the arrays are joined into one in a later cell.
all_preds = []
for batch in tqdm(batches): 
    # training=False is passed explicitly — presumably to request
    # inference-mode behaviour from the loaded model; confirm.
    preds = model.__call__(batch, training=False)
    # .numpy() converts the result to a NumPy array and ravel() flattens it to 1-D.
    all_preds.append(preds.numpy().ravel())

100%|██████████| 62/62 [00:08<00:00,  7.31it/s]


In [34]:
# Join the per-batch arrays into a single array of scores.
# NOTE(review): this rebinds `all_preds` from a list of arrays to one array, so
# the cell is not safe to re-run on its own — re-run the prediction loop first.
all_preds = np.concatenate(all_preds)

In [35]:
df["bert_preds"] = all_preds

In [36]:
# Threshold the BERT scores at 0.5 to obtain hard 0/1 labels; astype is the
# vectorized way to turn the boolean mask into integers.
df["bert_preds_discrete"] = (df["bert_preds"] > 0.5).astype(int)

In [37]:
print("bert model accuracy is", np.mean(df["bert_preds_discrete"] == df["sentiment"]))

bert model accuracy is 0.668530757498729
