In [34]:
import tensorflow as tf
import pandas as pd

import nltk
import numpy as np


In [7]:
# Load the labelled sentences. `names=` is supplied without `header=`, so the
# first line of the CSV is read as data; bytes that are not valid UTF-8 are
# replaced rather than raising.
df = pd.read_csv(
    "financial_data_kaggle.csv",
    names=["sentiment", "text"],
    encoding="utf-8",
    encoding_errors="replace",
)

In [8]:
# Preview the first rows to check that both columns loaded as expected.
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [9]:
# Class balance across all sentiment labels, before any filtering.
df["sentiment"].value_counts()

neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64

In [10]:
# Boolean mask: True for rows labelled positive or negative, False otherwise.
filter_ = df["sentiment"].isin(["positive", "negative"])

In [11]:
# Keep only the positive/negative rows for now; neutrals are set aside and
# could later be given a target of 0.5. `.copy()` detaches the result so the
# column assignments that follow do not write into a view of the original.
df = df.loc[filter_].copy()

In [12]:
# Class balance after filtering; only positive and negative should remain.
df["sentiment"].value_counts()

positive    1363
negative     604
Name: sentiment, dtype: int64

In [13]:
# Integer encoding of the two sentiment classes kept for evaluation.
mapping_ = dict(positive=1, negative=0)

In [14]:
# Encode the string labels as integers via mapping_. Values absent from the
# dict become NaN, so re-running this cell on already-encoded labels would
# blank the column; reload df first if it needs to be run again.
df["sentiment"] = df["sentiment"].map(mapping_)

In [16]:
def preprocess_text(input_text: str) -> str:
    """Lower-case ``input_text`` and apply a fixed list of literal substitutions.

    The substitutions run strictly in the order listed, and the order matters:
    ``<br />`` has to be handled before the bare ``/`` rule, and backticks and
    acute accents have to become apostrophes before the space-apostrophe rule
    is applied.

    Args:
        input_text: Raw text to normalise.

    Returns:
        The lower-cased text with every substitution applied.
    """
    # (old, new) pairs, applied top to bottom.
    substitutions = (
        ('<br />', ' '),
        ('`', "'"),
        ('´', "'"),
        (" '", ' "'),
        ("-", " - "),
        ("/", " "),
        ("_", " "),
    )
    normalised = input_text.lower()
    for old, new in substitutions:
        normalised = normalised.replace(old, new)
    return normalised

In [17]:
# Normalise every text with preprocess_text before tokenisation; this
# overwrites the raw "text" column in place.
df["text"] = df["text"].map(preprocess_text)

In [18]:
# Tokenizer handed to texts_to_sequences.
# NOTE(review): presumably this has to be the same tokenizer that was used
# when vocab_index.json and the model were built — confirm.
tokenizer_func = nltk.word_tokenize

In [19]:
def texts_to_sequences(corpus, vocabulary_index, tok_func, oov_index=1):
    """Convert each text in ``corpus`` into a list of vocabulary indices.

    Args:
        corpus: Iterable of strings to encode.
        vocabulary_index: Mapping from token to integer index.
        tok_func: Callable that splits one text into an iterable of tokens.
        oov_index: Index substituted for any token that is missing from
            ``vocabulary_index``. Defaults to 1, the value that used to be
            hard-coded, so existing three-argument calls behave as before.

    Returns:
        A list with one list of indices per input text, in corpus order.
    """
    return [
        [vocabulary_index.get(token, oov_index) for token in tok_func(text)]
        for text in corpus
    ]

In [20]:
import json

# Read the token -> index vocabulary used by texts_to_sequences. The encoding
# is stated explicitly because open() otherwise decodes with the platform's
# locale encoding, which would mis-read non-ASCII tokens on non-UTF-8 systems.
with open('vocab_index.json', 'r', encoding='utf-8') as fp:
    vocabulary_index = json.load(fp)

In [23]:
# Load the previously saved Keras model from disk.
model = tf.keras.models.load_model('glove_word_averaging_model.keras')

In [24]:
# Encode each preprocessed text as a list of vocabulary indices.
X_eval = texts_to_sequences(df["text"], vocabulary_index, tokenizer_func)

In [25]:
# Bring every sequence to a fixed length of 512; padding/truncation side and
# pad value are left at the library defaults.
# NOTE(review): 512 presumably matches the input length the model was trained
# with — confirm.
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=512)

In [26]:
# Score the padded sequences with the loaded model.
predictions = model.predict(X_eval)



In [27]:
# Store the model outputs as a new column.
# NOTE(review): assumes predictions holds exactly one score per row, in the
# same row order as df — confirm.
df["preds"] = predictions

In [28]:
# Spot-check the integer labels next to the raw model scores.
df.head()

Unnamed: 0,sentiment,text,preds
2,0,the international electronic industry company ...,0.673213
3,1,with the new production plant the company woul...,0.715204
4,1,"according to the company ""s updated strategy f...",0.194155
5,1,"financing of aspocomp ""s growth aspocomp is ag...",0.487524
6,1,"for the last quarter of 2010 , componenta ""s n...",0.817759


In [29]:
# Threshold the scores at 0.5: strictly above -> 1, otherwise 0, using the
# same 1 = positive / 0 = negative encoding as mapping_.
df["pred_discrete"] = (df["preds"] > 0.5).map(int)

In [35]:
# Accuracy = share of rows whose thresholded prediction equals the label.
print("accuracy is", (df["pred_discrete"] == df["sentiment"]).mean())

accuracy is 0.5892221657346213
