In [1]:
import tensorflow as tf
import pandas as pd

import nltk
import numpy as np

import tensorflow_hub as hub
import tensorflow_text as text


In [2]:
df = pd.read_csv("financial_data_kaggle.csv",
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

In [3]:
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
df["sentiment"].value_counts()

neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64

In [5]:
filter_ = df["sentiment"].map(lambda x: x in {"positive", "negative"})

In [6]:
# let's deal with neutrals later, probably can assign them to 0.5
df = df[filter_].copy()

In [7]:
df["sentiment"].value_counts()

positive    1363
negative     604
Name: sentiment, dtype: int64

In [8]:
mapping_ = {
    "positive": 1,
    "negative": 0,
}

In [9]:
df["sentiment"] = df["sentiment"].map(mapping_)

In [3]:
def preprocess_text(input_text: str) -> str:
    s = input_text.lower()
    s = (s
         .replace('<br />', ' ')
         .replace('`', "'")
         .replace('´',"'")
         .replace(" '", ' "')
         .replace("-", " - ")
         .replace("/", " ")
         .replace("_", " ")
        )
    return s

In [11]:
df["text"] = df["text"].map(preprocess_text)

In [12]:
tokenizer_func = nltk.word_tokenize

In [13]:
def texts_to_sequences(corpus, vocabulary_index, tok_func):
    corpus_tokens = []
    for text in corpus:
        tokens = tok_func(text)
        indicies = [vocabulary_index.get(x, 1) for x in tokens]
        corpus_tokens.append(indicies)
    return corpus_tokens

In [14]:
import json

with open('vocab_index.json', 'r') as fp:
    vocabulary_index = json.load(fp)

In [15]:
model = tf.keras.models.load_model('glove_word_averaging_model.keras')

In [16]:
corpus = df["text"]

In [17]:
X_eval = texts_to_sequences(corpus, vocabulary_index, tokenizer_func)

In [18]:
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=512)

In [19]:
predictions = model.predict(X_eval)



In [20]:
df["preds"] = predictions

In [21]:
df.head()

Unnamed: 0,sentiment,text,preds
2,0,the international electronic industry company ...,0.673213
3,1,with the new production plant the company woul...,0.715204
4,1,"according to the company ""s updated strategy f...",0.194155
5,1,"financing of aspocomp ""s growth aspocomp is ag...",0.487524
6,1,"for the last quarter of 2010 , componenta ""s n...",0.817759


In [22]:
df["pred_discrete"] = (df["preds"] > 0.5).map(int)

In [23]:
print("accuracy is", np.mean(df["pred_discrete"] == df["sentiment"]))

accuracy is 0.5892221657346213


In [26]:
df_test = pd.read_csv("./imdb_full_dataset_test.csv")

corpus = df_test["text"]
X_eval = texts_to_sequences(corpus, vocabulary_index, tokenizer_func)
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=512)
predictions = model.predict(X_eval)

df_test["preds"] = predictions

df_test["pred_discrete"] = (df_test["preds"] > 0.5).map(int)

print("accuracy is", np.mean(df_test["pred_discrete"] == df_test["label"]))

accuracy is 0.87936


In [27]:
model = tf.keras.models.load_model('glove_lstm_model.keras')

In [28]:
df_test = pd.read_csv("./imdb_full_dataset_test.csv")

corpus = df_test["text"]
X_eval = texts_to_sequences(corpus, vocabulary_index, tokenizer_func)
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=256)
predictions = model.predict(X_eval)

df_test["preds"] = predictions

df_test["pred_discrete"] = (df_test["preds"] > 0.5).map(int)

print("accuracy is", np.mean(df_test["pred_discrete"] == df_test["label"]))

accuracy is 0.86532


In [29]:
corpus = df["text"]
X_eval = texts_to_sequences(corpus, vocabulary_index, tokenizer_func)
X_eval = tf.keras.preprocessing.sequence.pad_sequences(X_eval, maxlen=256)
predictions = model.predict(X_eval)
df["preds"] = predictions
df["pred_discrete"] = (df["preds"] > 0.5).map(int)
print("accuracy is", np.mean(df["pred_discrete"] == df["sentiment"]))

accuracy is 0.5683782409761058


In [None]:
# download from dropbox link I shared:
# and exctract using tar -xvf fname.tar.gz 
# should point the load to the directory coming
# out of the untar. 

In [2]:
model = tf.saved_model.load('bert_trained_on_imdb.saved_model')

In [4]:
def batchify(lst, batch_size=16): 
    indx_ = 0
    
    while indx_ < len(lst):
        yield lst[indx_: indx_ + batch_size]
        indx_ += batch_size

In [5]:
from tqdm import tqdm

def bert_batch_predict(corpus):
    batches = list(batchify(corpus, batch_size=32))
    
    all_preds = []
    
    for batch in tqdm(batches): 
        preds = model.__call__(batch, training=False)
        all_preds.append(preds.numpy().ravel())
    
    return np.concatenate(all_preds)

In [17]:
all_preds = bert_batch_predict(list(df["text"]))

100%|██████████| 62/62 [00:34<00:00,  1.79it/s]


In [18]:
df["bert_preds"] = all_preds

In [19]:
df["bert_preds_discrete"] = (df["bert_preds"] > 0.5).map(int)

In [20]:
print("bert model accuracy is", np.mean(df["bert_preds_discrete"] == df["sentiment"]))

bert model accuracy is 0.668530757498729


In [6]:
!ls

aclImdb					   glove.6B.zip
aclImdb_v1.tar.gz			   glove_lstm_model.keras
baseline.ipynb				   glove_vecs.bin
bert_trained_on_imdb.saved_model	   glove_word_averaging_model.h5
bert_trained_on_imdb.saved_model.tar.gz    glove_word_averaging_model.keras
bert_train_on_imdb.ipynb		   imdb_full_dataset.csv
CEO CFO Remarks.py			   imdb_full_dataset_test.csv
earning_call_train.zip			   imdb_full_dataset_train.csv
Earning Call Transcript - Training Data    infer_financial_sentiments.ipynb
Earning Call Transcript - Validation Data  random_word_averaging_model.keras
earning_call_validation.zip		   random_word_lstm_model.keras
eps_logistic_regression.ipynb		   run_model_on_our_data.ipynb
financial_data_kaggle.csv		   train-00000-of-00001.parquet
financial_labels.csv			   using_glove.ipynb
glove.42B.300d.txt			   vocab_index.json
glove.42B.300d.zip


In [7]:
df = pd.read_csv("./imdb_full_dataset_test.csv")

In [8]:
df["text"] = df["text"].map(preprocess_text)
all_preds = bert_batch_predict(list(df["text"]))

100%|██████████| 782/782 [07:31<00:00,  1.73it/s]


In [9]:
df["bert_preds"] = all_preds
df["bert_preds_discrete"] = (df["bert_preds"] > 0.5).map(int)
print("bert model accuracy is", np.mean(df["bert_preds_discrete"] == df["label"]))

bert model accuracy is 0.91776


In [21]:
import json

training_earning_call_transcript_structured = json.load(
    open("../extract_data_scripts/training_earning_call_transcript_structured.json")
)
validation_earning_call_transcript_structured = json.load(
    open("../extract_data_scripts/validation_earning_call_transcript_structured.json")
)

In [22]:
# counter = 0
# from collections import Counter

# part_counter = Counter()
# for k, v in training_earning_call_transcript_structured.items():
#     try:
#         paragraphs = v['paragraphs']
#         participants = v['participants']
#         part_counter.update(participants.values())
#     except:
#         print(f"couldn't find paragraph or participant for {k}")
#         counter += 1
#         continue
    

In [23]:
# part_counter

In [24]:
# so the one we will consider are

participant_canonical = {
    'Chief Financial Officer': 'CFO',
    'CFO': 'CFO',
    'President and CEO': 'CEO',
    'President and Chief Executive Officer': 'CEO',
    'CEO': 'CEO',
    'President': 'CEO',
    'Chairman and Chief Executive Officer': 'CEO',
    'Executive Vice President and Chief Financial Officer': 'CFO',
    'President & Chief Executive Officer': 'CEO',
    'EVP and CFO': 'CFO',
    'President & CEO': 'CEO',
    'Chairman and CEO': 'CEO',
    'Senior Vice President and Chief Financial Officer': 'CFO',
    'EVP & CFO': 'CFO',
    'Executive Vice President & Chief Financial Officer': 'CFO',
    'Chairman & Chief Executive Officer': 'CEO',
    'Chairman, President and Chief Executive Officer': 'CEO',
    'SVP and CFO': 'CFO',
    'Chairman & CEO': 'CEO',
    'Chairman, President & CEO': 'CEO',
    'Senior Vice President & Chief Financial Officer': 'CFO',
    'Chief Executive Officer and President': 'CEO',
    'Chairman, President and CEO': 'CEO',
    'President, Chief Executive Officer': 'CEO',
    'Chair and Chief Executive Officer': 'CEO',
    'Senior Executive Vice President and Chief Financial Officer': 'CFO',
    'Group Chief Financial Officer': 'CFO',
    'President, CEO': 'CEO',
    'Executive Vice President, Chief Financial Officer': 'CFO',
    'SVP & CFO': 'CFO',
    'Vice President and Chief Financial Officer': 'CFO',
    'Chief Financial Officer and Treasurer': 'CFO',
    'Executive Vice President, Chief Financial Officer and Treasurer': 'CFO',
    'EVP and Chief Financial Officer': 'CFO',
    'CEO and President': 'CEO',
    'Chief Financial Officer and Chief Operating Officer': 'CFO',
    'Chairman, President & Chief Executive Officer': 'CEO',
    'Chairman, President, and CEO': 'CEO',
    'CEO & Director': 'CEO',
    'Chief Executive': 'CEO',
    'SVP & Chief Financial Officer': 'CFO',
    'Chief Execuitve Officer': 'CEO',
    'Head of Financial Control and Capital': 'CFO',
    'Senior Managing Corporate Officer and Group Chief Financial': 'CFO',
    'Acting Chief Financial Officer': 'CFO',
}

In [25]:
def canonicalize(x):
    if 'CEO' in x:
        return 'CEO'
    
    if 'Chief Executive Officer' in x:
        return 'CEO'
    
    if 'CFO' in x:
        return 'CFO'
    
    # Group Chief Financial
    
    return participant_canonical.get(x, x)

In [32]:
def bert_batch_predict(corpus):
    batches = list(batchify(corpus, batch_size=32))
    
    all_preds = []
    
    for batch in batches: 
        preds = model.__call__(batch, training=False)
        all_preds.append(preds.numpy().ravel())
    
    return np.concatenate(all_preds)

In [33]:
counter = 0

for k, v in tqdm(training_earning_call_transcript_structured.items()):
    try:
        paragraphs = v['paragraphs']
        participants = v['participants']
        
        if not participants:
            # print(f"participant empty for {k}")
            counter += 1
            continue
        
        v['_participants'] = {k: canonicalize(v) for k, v in participants.items()}
        
        if 'CEO' not in v['_participants']:
            if 'CFO' not in v['_participants']:
                counter += 1
                continue
                # print(f"couldn't find CEO/CFO participant for {k}")
                # print(participants)
        
        ceo_paragraphs = []
        cfo_paragraphs = []
        for item in paragraphs:
            speaker = item['speaker']
            if v['_participants'].get(speaker) == 'CEO':
                ceo_paragraphs.append(item['text'])
            if v['_participants'].get(speaker) == 'CFO':
                cfo_paragraphs.append(item['text'])
        
        if len(ceo_paragraphs) == 0:
            if len(cfo_paragraphs) == 0:
                counter += 1
                print(f"no ceo cfo speak found for {k}")
                continue
                
        v['_ceo_paragraphs'] = ceo_paragraphs
        v['_cfo_paragraphs'] = cfo_paragraphs
        
        if len(ceo_paragraphs) > 0:
            x = bert_batch_predict(ceo_paragraphs)
            x = [float(v) for v in x]
            v['_ceo_sentiments'] = x
            v['_ceo_avg_sentiment'] = float(np.mean(x))
            v['_ceo_median_sentiment'] = float(np.median(x))
            v['_ceo_min_sentiment'] = float(np.min(x))
            v['_ceo_max_sentiment'] = float(np.max(x))
        
        if len(cfo_paragraphs) > 0:
            x = bert_batch_predict(cfo_paragraphs)
            x = [float(v) for v in x]
            v['_cfo_sentiments'] = x
            v['_cfo_avg_sentiment'] = float(np.mean(x))
            v['_cfo_median_sentiment'] = float(np.median(x))
            v['_cfo_min_sentiment'] = float(np.min(x))
            v['_cfo_max_sentiment'] = float(np.max(x))
        
    except:
        # print(f"couldn't find paragraph or participant for {k}")
        counter += 1
        continue

 40%|███▉      | 270/683 [06:50<13:05,  1.90s/it]

no ceo cfo speak found for FJTSF


 42%|████▏     | 288/683 [07:16<09:09,  1.39s/it]

no ceo cfo speak found for GPI


 54%|█████▍    | 370/683 [09:15<08:59,  1.72s/it]

no ceo cfo speak found for LTOUF


 70%|██████▉   | 476/683 [11:34<04:22,  1.27s/it]

no ceo cfo speak found for PCRFY


 78%|███████▊  | 536/683 [12:55<04:01,  1.64s/it]

no ceo cfo speak found for SSNLF


 79%|███████▉  | 542/683 [12:59<02:12,  1.07it/s]

no ceo cfo speak found for SARTF
no ceo cfo speak found for SDMHF


 87%|████████▋ | 594/683 [14:11<02:48,  1.90s/it]

no ceo cfo speak found for TTDKY


100%|█████████▉| 680/683 [16:31<00:05,  1.83s/it]

no ceo cfo speak found for ELF


100%|██████████| 683/683 [16:35<00:00,  1.46s/it]


In [39]:
with open("../extract_data_scripts/training_earning_call_transcript_structured_with_sentiment.json", "w") as fp:
    json.dump(training_earning_call_transcript_structured, fp)

In [35]:
counter = 0

for k, v in tqdm(validation_earning_call_transcript_structured.items()):
    try:
        paragraphs = v['paragraphs']
        participants = v['participants']
        
        if not participants:
            # print(f"participant empty for {k}")
            counter += 1
            continue
        
        v['_participants'] = {k: canonicalize(v) for k, v in participants.items()}
        
        if 'CEO' not in v['_participants']:
            if 'CFO' not in v['_participants']:
                counter += 1
                continue
                # print(f"couldn't find CEO/CFO participant for {k}")
                # print(participants)
        
        ceo_paragraphs = []
        cfo_paragraphs = []
        for item in paragraphs:
            speaker = item['speaker']
            if v['_participants'].get(speaker) == 'CEO':
                ceo_paragraphs.append(item['text'])
            if v['_participants'].get(speaker) == 'CFO':
                cfo_paragraphs.append(item['text'])
        
        if len(ceo_paragraphs) == 0:
            if len(cfo_paragraphs) == 0:
                counter += 1
                print(f"no ceo cfo speak found for {k}")
                continue
                
        v['_ceo_paragraphs'] = ceo_paragraphs
        v['_cfo_paragraphs'] = cfo_paragraphs
        
        if len(ceo_paragraphs) > 0:
            x = bert_batch_predict(ceo_paragraphs)
            x = [float(v) for v in x]
            v['_ceo_sentiments'] = x
            v['_ceo_avg_sentiment'] = float(np.mean(x))
            v['_ceo_median_sentiment'] = float(np.median(x))
            v['_ceo_min_sentiment'] = float(np.min(x))
            v['_ceo_max_sentiment'] = float(np.max(x))
        
        if len(cfo_paragraphs) > 0:
            x = bert_batch_predict(cfo_paragraphs)
            x = [float(v) for v in x]
            v['_cfo_sentiments'] = x
            v['_cfo_avg_sentiment'] = float(np.mean(x))
            v['_cfo_median_sentiment'] = float(np.median(x))
            v['_cfo_min_sentiment'] = float(np.min(x))
            v['_cfo_max_sentiment'] = float(np.max(x))
        
    except:
        # print(f"couldn't find paragraph or participant for {k}")
        counter += 1
        continue

 11%|█         | 18/163 [00:24<03:25,  1.42s/it]

no ceo cfo speak found for BMA


 15%|█▌        | 25/163 [00:32<02:41,  1.17s/it]

no ceo cfo speak found for BB


 94%|█████████▍| 154/163 [03:33<00:13,  1.48s/it]

no ceo cfo speak found for XFLT


 99%|█████████▉| 161/163 [03:43<00:03,  1.55s/it]

no ceo cfo speak found for ARGX


100%|██████████| 163/163 [03:43<00:00,  1.37s/it]


In [42]:
with open("../extract_data_scripts/validation_earning_call_transcript_structured_with_sentiment.json", "w") as fp:
    json.dump(validation_earning_call_transcript_structured, fp)