## Imports

In [1]:
import os
import re
from collections import Counter 
import pandas as pd
import numpy as np
from pprint import pprint

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from notebookjs import execute_js

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setting SPACY configs

nlp_spacy = spacy.load("en_core_web_lg")
nlp_spacy.max_length = 1500000

# Setting NLTK configs
stop_words = stopwords.words('english')
ps = WordNetLemmatizer()

## Initializing D3JS source

In [3]:
d3_path = "https://d3js.org/d3.v7.min.js"
code_vis_box_plot  = open("d3js_javascript_files/vis_box_plot.js", "r").read()
code_vis_bubble_chart  = open("d3js_javascript_files/vis_bubble_chart.js", "r").read()
code_vis_grouped_bar_chart  = open("d3js_javascript_files/vis_grouped_bar_chart.js", "r").read()
code_vis_pie_charts  = open("d3js_javascript_files/vis_pie_charts.js", "r").read()
code_vis_simple_bar_charts  = open("d3js_javascript_files/vis_simple_bar_charts.js", "r").read()
code_vis_small_multiple_area_chart  = open("d3js_javascript_files/vis_small_multiple_area_chart.js", "r").read()
code_viz_lda_topics_matrix = open("d3js_javascript_files/viz_lda_topics_matrix.js", "r").read()

## Initializing FINBERT

In [4]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

## Reading Earnings Calls Transcripts

In [5]:
transcript_data_list = []
path = "/Users/raghavsikaria/Sangharsh/NYU Sem 2/VML/project/transcript_data"

In [6]:
for filename in os.listdir(path):
    filepath = os.path.join(path, filename)
    print(f"filename: {filename}")
    
    with open(filepath, mode='r') as f:
        content = f.read()
        transcript_data_list.append({"company": filename.split(".")[0], "transcript": content})

filename: wfc.txt
filename: gs.txt
filename: usb.txt
filename: jpm.txt
filename: cof.txt
filename: ms.txt
filename: tfc.txt
filename: td.txt
filename: c.txt
filename: bac.txt
filename: pnc.txt


In [7]:
df = pd.DataFrame(transcript_data_list)
df

Unnamed: 0,company,transcript
0,wfc,John Campbell\n\nGood morning. Thank you for j...
1,gs,Carey Halio\n\nGood morning. This is Carey Hal...
2,usb,"George Anderson\n\nThank you, Brad. Good morni..."
3,jpm,"Jeremy Barnum\n\nThanks, and good morning, eve..."
4,cof,"Jeff Norris\n\nThanks very much, Amy, and welc..."
5,ms,"James Gorman\n\nGood morning, everyone and tha..."
6,tfc,"Ankur Vyas\n\nThank you, Allay, and good morni..."
7,td,"Brooke Hales\n\nThank you, Operator. Good afte..."
8,c,"Jen Landis\n\nThank you, operator. Good mornin..."
9,bac,"Lee McEntire\n\nThank you, Catherine. Good mor..."


## Applying techniques for basic text analysis

In [8]:
df['word_count'] = df['transcript'].apply(lambda x: len(str(x).split(" ")))
df['char_count'] = df['transcript'].str.len()
df['stopwords'] = df['transcript'].apply(lambda x: len([x for x in x.split() if x in stop_words]))
df['numerics'] = df['transcript'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df

Unnamed: 0,company,transcript,word_count,char_count,stopwords,numerics
0,wfc,John Campbell\n\nGood morning. Thank you for j...,4237,27344,1531,15
1,gs,Carey Halio\n\nGood morning. This is Carey Hal...,2585,16623,911,15
2,usb,"George Anderson\n\nThank you, Brad. Good morni...",2341,15185,846,15
3,jpm,"Jeremy Barnum\n\nThanks, and good morning, eve...",1991,12535,702,5
4,cof,"Jeff Norris\n\nThanks very much, Amy, and welc...",2424,15581,835,35
5,ms,"James Gorman\n\nGood morning, everyone and tha...",2137,13983,732,10
6,tfc,"Ankur Vyas\n\nThank you, Allay, and good morni...",4445,28748,1621,41
7,td,"Brooke Hales\n\nThank you, Operator. Good afte...",3973,26737,1266,31
8,c,"Jen Landis\n\nThank you, operator. Good mornin...",4797,29936,1849,24
9,bac,"Lee McEntire\n\nThank you, Catherine. Good mor...",6242,38407,2393,45


## Preprocessing data and using FinBERT to get sentiment scores

In [9]:
def process_transcript_corpus(raw_transcript_corpus):
    ptc = re.sub('[^a-zA-Z]', ' ', raw_transcript_corpus)
    ptc = ptc.lower()
    ptc = ptc.split()
    processed_corpus = [ps.lemmatize(word) for word in ptc if not word in stop_words]
    processed_corpus = ' '.join(processed_corpus)
    return processed_corpus

def top_10_words(processed_corpus):
    word_counts = Counter(processed_corpus.split(" ")) 
    return word_counts.most_common(10)

def tokenize_sentences(raw_corpus):
    return sent_tokenize(raw_corpus)

def apply_finbert(tokenized_raw_sentences):
    return nlp(tokenized_raw_sentences)

def sentence_counter(tokenized_raw_sentences):
    return len(tokenized_raw_sentences)

def score_array_converter(finbert_scores):
    label_score_mapper = {'Neutral': 0, 'Positive': 1, 'Negative': -1}
    scores = []
    for item in finbert_scores:
        scores.append(label_score_mapper[item['label']])
    return scores

def score_array_cumulative(score_array):
    prefixed_sum_score = []
    prefixed_sum_score.append(score_array[0])
    
    for i in range(1, len(score_array)):
        prefixed_sum_score.append(score_array[i] + prefixed_sum_score[i-1])
    return prefixed_sum_score

def count_sentiment_totals(finbert_score_array): 
    sentiment_totals = {1: 0, -1: 0, 0: 0}
    for i in finbert_score_array:
        sentiment_totals[i] += 1
    return sentiment_totals

lda_corpus_list = []
def process_for_lda(raw_transcript):
    # Code source: https://highdemandskills.com/topic-modeling-lda/#h3-2
    # all credits to the author

    raw_transcript = raw_transcript.strip()  # Remove white space at the beginning and end
    raw_transcript = raw_transcript.replace('\n', ' ') # Replace the \n (new line) character with space
    raw_transcript = raw_transcript.replace('\r', '') # Replace the \r (carriage returns -if you're on windows) with null
    raw_transcript = raw_transcript.replace(' ', ' ') # Replace " " (a special character for space in HTML) with space. 
    while '  ' in raw_transcript:
        raw_transcript = raw_transcript.replace('  ', ' ') 
        
    proc_spacy_transcript = nlp_spacy(raw_transcript)
    temp_list = []
    
    for token in proc_spacy_transcript:
        if token.is_stop == False and token.is_punct == False and (token.pos_ == "NOUN" or token.pos_ == "ADJ" or token.pos_ =="VERB"):
            temp_list.append(token.lemma_.lower())
    lda_corpus_list.append(temp_list)
    return temp_list

In [10]:
df['p_transcript']=df['transcript'].apply(process_transcript_corpus)
df['top_10_words']=df['p_transcript'].apply(top_10_words)
df['tokenized_raw_sentences']=df['transcript'].apply(tokenize_sentences)
df['finbert_scores']=df['tokenized_raw_sentences'].apply(apply_finbert)
df['number_tokenized_raw_sentences']=df['tokenized_raw_sentences'].apply(sentence_counter)
df['finbert_score_array']=df['finbert_scores'].apply(score_array_converter)
df['finbert_score_array_cumulative']=df['finbert_score_array'].apply(score_array_cumulative)
df['sentiment_totals']=df['finbert_score_array'].apply(count_sentiment_totals)
df['processed_corpus_for_lda']=df['transcript'].apply(process_for_lda)

In [11]:
companies_full_name = ['Wells Fargo & Co', 'Goldman Sachs Group Inc', 'US Bancorp', 'JPMorgan Chase & Co', 'Capital One Financial Corp.', 'Morgan Stanley', 'Truist Financial Corp', 'Toronto-Dominion Bank', 'Citigroup Inc', 'Bank of America Corp', 'PNC']

In [12]:
df['full_company_name'] = companies_full_name

In [13]:
df

Unnamed: 0,company,transcript,word_count,char_count,stopwords,numerics,p_transcript,top_10_words,tokenized_raw_sentences,finbert_scores,number_tokenized_raw_sentences,finbert_score_array,finbert_score_array_cumulative,sentiment_totals,processed_corpus_for_lda,full_company_name
0,wfc,John Campbell\n\nGood morning. Thank you for j...,4237,27344,1531,15,john campbell good morning thank joining call ...,"[(quarter, 71), (year, 59), (loan, 45), (ago, ...","[John Campbell\n\nGood morning., Thank you for...","[{'label': 'Neutral', 'score': 0.6510444283485...",198,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, ...","{1: 78, -1: 38, 0: 82}","[good, morning, thank, join, today, ceo, discu...",Wells Fargo & Co
1,gs,Carey Halio\n\nGood morning. This is Carey Hal...,2585,16623,911,15,carey halio good morning carey halio head inve...,"[(billion, 39), (year, 36), (quarter, 31), (re...","[Carey Halio\n\nGood morning., This is Carey H...","[{'label': 'Positive', 'score': 0.536954581737...",139,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, -...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -1, -1, -...","{1: 53, -1: 28, 0: 58}","[good, morning, fourth, quarter, earning, conf...",Goldman Sachs Group Inc
2,usb,"George Anderson\n\nThank you, Brad. Good morni...",2341,15185,846,15,george anderson thank brad good morning everyo...,"[(quarter, 34), (deposit, 28), (billion, 26), ...","[George Anderson\n\nThank you, Brad., Good mor...","[{'label': 'Neutral', 'score': 0.9224075675010...",110,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, ...","{1: 34, -1: 3, 0: 73}","[thank, good, morning, today, prepared, remark...",US Bancorp
3,jpm,"Jeremy Barnum\n\nThanks, and good morning, eve...",1991,12535,702,5,jeremy barnum thanks good morning everyone pre...,"[(year, 72), (billion, 40), (quarter, 38), (dr...","[Jeremy Barnum\n\nThanks, and good morning, ev...","[{'label': 'Neutral', 'score': 0.8495514988899...",112,"[0, 0, 0, 0, -1, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1...","[0, 0, 0, 0, -1, -1, -1, -1, 0, -1, -1, -1, -1...","{1: 42, -1: 24, 0: 46}","[thank, good, morning, presentation, available...",JPMorgan Chase & Co
4,cof,"Jeff Norris\n\nThanks very much, Amy, and welc...",2424,15581,835,35,jeff norris thanks much amy welcome everybody ...,"[(quarter, 86), (year, 35), (first, 33), (basi...","[Jeff Norris\n\nThanks very much, Amy, and wel...","[{'label': 'Neutral', 'score': 0.9985010623931...",140,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,...","{1: 52, -1: 25, 0: 63}","[thank, welcome, quarter, earning, conference,...",Capital One Financial Corp.
5,ms,"James Gorman\n\nGood morning, everyone and tha...",2137,13983,732,10,james gorman good morning everyone thank joini...,"[(year, 31), (billion, 30), (revenue, 29), (qu...","[James Gorman\n\nGood morning, everyone and th...","[{'label': 'Neutral', 'score': 0.6943262815475...",120,"[0, 1, 1, 0, 1, -1, -1, -1, 0, 1, 0, 1, 1, 1, ...","[0, 1, 2, 2, 3, 2, 1, 0, 0, 1, 1, 2, 3, 4, 5, ...","{1: 56, -1: 26, 0: 38}","[good, morning, thank, join, quarter, eventful...",Morgan Stanley
6,tfc,"Ankur Vyas\n\nThank you, Allay, and good morni...",4445,28748,1621,41,ankur vyas thank allay good morning everyone w...,"[(truist, 41), (quarter, 41), (client, 34), (d...","[Ankur Vyas\n\nThank you, Allay, and good morn...","[{'label': 'Neutral', 'score': 0.9455332159996...",196,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 1, 1,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 2,...","{1: 90, -1: 21, 0: 85}","[thank, good, morning, quarter, earning, today...",Truist Financial Corp
7,td,"Brooke Hales\n\nThank you, Operator. Good afte...",3973,26737,1266,31,brooke hale thank operator good afternoon welc...,"[(quarter, 57), (year, 54), (bank, 44), (td, 3...","[Brooke Hales\n\nThank you, Operator., Good af...","[{'label': 'Neutral', 'score': 0.9762963652610...",203,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{1: 95, -1: 25, 0: 83}","[thank, good, afternoon, welcome, quarter, inv...",Toronto-Dominion Bank
8,c,"Jen Landis\n\nThank you, operator. Good mornin...",4797,29936,1849,24,jen landis thank operator good morning thank j...,"[(client, 45), (revenue, 38), (quarter, 35), (...","[Jen Landis\n\nThank you, operator., Good morn...","[{'label': 'Neutral', 'score': 0.9504846930503...",236,"[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, ...","[0, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 7, 8, ...","{1: 115, -1: 27, 0: 94}","[thank, operator, good, morning, thank, join, ...",Citigroup Inc
9,bac,"Lee McEntire\n\nThank you, Catherine. Good mor...",6242,38407,2393,45,lee mcentire thank catherine good morning welc...,"[(quarter, 98), (deposit, 61), (billion, 60), ...","[Lee McEntire\n\nThank you, Catherine., Good m...","[{'label': 'Neutral', 'score': 0.9740446805953...",309,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","{1: 128, -1: 32, 0: 149}","[thank, good, morning, thank, join, review, qu...",Bank of America Corp


## Finding top 10 terms which are present in all earnings calls transcripts

In [14]:
all_top_10_terms = []
for i, c_d in enumerate(list(df['top_10_words'])):
    all_top_10_terms.extend([t[0] for t in c_d])
all_top_10_terms = list(set(all_top_10_terms))
# all_top_10_terms

In [15]:
total_count_all_top_10_terms = Counter()  
for term in all_top_10_terms:
    count_for_term = 0
    for doc_id in list(df['top_10_words']):
        if term in [t[0] for t in doc_id]:
            count_for_term += 1
    total_count_all_top_10_terms[term] = count_for_term

In [16]:
total_count_all_top_10_terms.most_common(10)

[('quarter', 11),
 ('year', 9),
 ('deposit', 7),
 ('billion', 7),
 ('client', 5),
 ('first', 4),
 ('revenue', 4),
 ('business', 4),
 ('loan', 3),
 ('rate', 3)]

## Applying TF-IDF to Earnings Calls Transcripts corpus

In [17]:
corpus = list(df['p_transcript'])
# corpus

In [18]:
len(corpus)

11

In [19]:
tfidf_v=TfidfVectorizer(ngram_range=(1,3))
X=tfidf_v.fit_transform(corpus).toarray()

In [20]:
X.shape

(11, 38972)

In [21]:
tfidf_v.get_feature_names_out()[:40]

array(['abated', 'abated cet', 'abated cet ratio', 'abated period',
       'abated period end', 'ability', 'ability attract',
       'ability attract asset', 'ability complete',
       'ability complete work', 'ability continue',
       'ability continue deliver', 'ability convert',
       'ability convert new', 'ability realize',
       'ability realize significant', 'able', 'able discus',
       'able discus specific', 'absence', 'absence goodwill',
       'absence goodwill impairment', 'absent', 'absent geopolitical',
       'absent geopolitical surprise', 'absent retailer',
       'absent retailer partner', 'absolute', 'absolute level',
       'absolute level interest', 'absolutely', 'absolutely paramount',
       'absolutely paramount mark', 'absorb', 'absorb temporary',
       'absorb temporary upfront', 'absorbed', 'absorbed partner',
       'absorbed partner impact', 'absorbing'], dtype=object)

In [22]:
tfidf_v.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [23]:
tfidf_df = pd.DataFrame(X, columns=tfidf_v.get_feature_names_out())
tfidf_df

Unnamed: 0,abated,abated cet,abated cet ratio,abated period,abated period end,ability,ability attract,ability attract asset,ability complete,ability complete work,...,zelle grew,zelle grew past,zelle interaction,zelle interaction continue,zelle particular,zelle particular underscoring,zelle remember,zelle remember back,zelle transaction,zelle transaction crossed
0,0.018331,0.009165,0.009165,0.009165,0.009165,0.012315,0.0,0.0,0.009165,0.009165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.009068,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.010045,0.014953,0.014953,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0063,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009378,0.009378,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007765,0.007765,0.007765,0.007765,0.0,0.0,0.007765,0.007765,0.007765,0.007765


## Finding top 10 terms by maximum TF-IDF score across corpus

In [24]:
top_10_tf_idf_values = []
for i in range(0,11):
    top_10_tf_idf_values.append(tfidf_df.iloc[i].nlargest(10).to_frame().T.to_dict())

# top_10_tf_idf_values

In [25]:
all_top_10_tfidf_terms = []
for i, c_d in enumerate(top_10_tf_idf_values):
    all_top_10_tfidf_terms.extend(list(c_d.keys()))

all_top_10_tfidf_terms = list(set(all_top_10_tfidf_terms))
# all_top_10_tfidf_terms

In [26]:
total_count_c = Counter()  
for tfidf_term in all_top_10_tfidf_terms:
    count_for_term = 0
    for doc_id in range(0,11):
        if tfidf_term in list(top_10_tf_idf_values[doc_id].keys()):
            count_for_term += 1
    total_count_c[tfidf_term] = count_for_term

In [27]:
total_count_c.most_common(10)

[('quarter', 11),
 ('year', 8),
 ('billion', 6),
 ('deposit', 5),
 ('revenue', 4),
 ('client', 4),
 ('first', 3),
 ('adjusted', 3),
 ('slide', 3),
 ('first quarter', 3)]

## Applying LDA to conduct Topic Modelling

In [28]:
NUM_topics = 5
ID2word = corpora.Dictionary(lda_corpus_list)
train_corpus_bow = [ID2word.doc2bow(doc) for doc in lda_corpus_list]

TFIDF = models.TfidfModel(train_corpus_bow)
train_corpus_tfidf = TFIDF[corpus]

NameError: name 'corpora' is not defined

In [None]:
lda_bow_model = gensim.models.LdaMulticore(corpus=train_corpus_bow, num_topics=NUM_topics, id2word=ID2word, passes=100)
lda_bow_model.print_topics(num_words=5)

In [None]:
coherence_model_lda_bow = gensim.models.CoherenceModel(model=lda_bow_model, texts=lda_corpus_list, dictionary=ID2word, coherence='c_v')
coherence_lda_bow = coherence_model_lda_bow.get_coherence()
print('Coherence Score: ', coherence_lda_bow)

In [None]:
lda_tfidf_model = gensim.models.LdaMulticore(corpus=train_corpus_tfidf, num_topics=NUM_topics, id2word=ID2word, passes=100)
lda_tfidf_model.print_topics(num_words=5)

In [None]:
coherence_model_lda_tfidf = gensim.models.CoherenceModel(model=lda_tfidf_model, texts=lda_corpus_list, dictionary=ID2word, coherence='c_v')
coherence_lda_tfidf = coherence_model_lda_tfidf.get_coherence()
print('Coherence Score: ', coherence_lda_tfidf)

## Functions for generating data for all visualizations

In [None]:
def get_data_for_viz_number_of_characters():
    company_character_count_data = []
    for index, row in df.iterrows():
        company_character_count_data.append({"company": row['full_company_name'], "number_of_characters":row['char_count']})

    company_character_count_data = sorted(company_character_count_data, key=lambda d: d['number_of_characters']) 
    return company_character_count_data

def get_data_for_viz_number_of_sentences():
    company_sentence_count_data = []
    for index, row in df.iterrows():
        company_sentence_count_data.append({"company": row['full_company_name'], "number_of_sentences":row['number_tokenized_raw_sentences']})

    company_sentence_count_data = sorted(company_sentence_count_data, key=lambda d: d['number_of_sentences']) 
    return company_sentence_count_data

def get_data_for_viz_number_of_words():
    maximum_number_words = max(list(df['word_count']))
    word_types = ['Words', 'Stop Words']
    
    grouped_data = []
    for index, row in df.iterrows():
        grouped_data.append(
            {
                "company": row['full_company_name'], 
                "words": [
                    {"word": "Words", "count": row['word_count']}, 
                    {"word": "Stop Words", "count": row['stopwords']}
                ]
            })
    
    grouped_data = sorted(grouped_data, key=lambda d: d['words'][0]['count']) 
    company_names = [i['company'] for i in grouped_data]
    return company_names, maximum_number_words, word_types, grouped_data

def get_data_for_viz_prefix_sum_sentiments():
    company_prefix_sum_data = []
    grid = []
    n_rows = 6
    n_cols = 2
            
    for i in range(0, n_rows):
        for j in range(0, n_cols):
            grid.append([i, 0 if not j else 1])
    
    for index, row in df.iterrows():
        
        company_prefix_sum_data.append(
            {
                "company": row['full_company_name'],
                "prefix_sum_sentiments": [
                    {"number": index, "prefix_sum": i} for index, i in enumerate(row['finbert_score_array_cumulative'])
                ],
                "row": grid[index][0],
                "col": grid[index][1]
            }
        )
        
    return company_prefix_sum_data

def get_data_for_viz_top_10_words_by_frequency():
    top_10_words_by_frequency = []
    for row in total_count_all_top_10_terms.most_common(10):
        top_10_words_by_frequency.append({"word": row[0], "number_of_transcripts":row[1]})
    return top_10_words_by_frequency

def get_data_for_viz_top_10_words_by_tfidf():
    top_10_words_by_tfidf = []
    for row in total_count_c.most_common(10):
        top_10_words_by_tfidf.append({"word": row[0], "number_of_transcripts":row[1]})
    return top_10_words_by_tfidf

def get_data_for_viz_sentiment_counts():
    sentiment_counts = []
    for index, row in df.iterrows():
        pct_of_positive = float(row['sentiment_totals'][1])/(row['sentiment_totals'][1]+row['sentiment_totals'][-1]+row['sentiment_totals'][0])
        sentiment_counts.append(
            {
                "company": row['full_company_name'], 
                "sentiment_count": [
                    {"sentiment": "Positive", "count":row['sentiment_totals'][1]}, 
                    {"sentiment": "Negative", "count":row['sentiment_totals'][-1]}, 
                    {"sentiment": "Neutral", "count":row['sentiment_totals'][0]}
                ],
                "pct_of_positive": pct_of_positive
            }
        )
    sentiment_counts = sorted(sentiment_counts, key=lambda d: d['pct_of_positive'])
    company_names = [i['company'] for i in sentiment_counts]
    return sentiment_counts, company_names

def get_data_for_parts_of_transcript_boxplot():
    box_plot_data = []
    box_plot_numbers = []
    
    for index, row in df.iterrows():
        company_name = row['full_company_name']
        parts_of_transcript = np.array_split(row['finbert_score_array'], 4)
        
        box_plot_data.append({"company": company_name, "part_of_transcript": "1st 5 sentences", "s_value": np.mean(row['finbert_score_array'][:5])})
        box_plot_data.append({"company": company_name, "part_of_transcript": "Last 5 sentences", "s_value": np.mean(row['finbert_score_array'][-5:])})
        box_plot_data.append({"company": company_name, "part_of_transcript": "1st quarter of transcript", "s_value": np.mean(parts_of_transcript[0])})
        box_plot_data.append({"company": company_name, "part_of_transcript": "2nd quarter of transcript", "s_value": np.mean(parts_of_transcript[1])})
        box_plot_data.append({"company": company_name, "part_of_transcript": "3rd quarter of transcript", "s_value": np.mean(parts_of_transcript[2])})
        box_plot_data.append({"company": company_name, "part_of_transcript": "4th quarter of transcript", "s_value": np.mean(parts_of_transcript[3])})
        
    df_stat = pd.DataFrame(box_plot_data)
    df_stat_group = df_stat.groupby(['part_of_transcript'])
    
    
    for group_id in ['1st 5 sentences', '1st quarter of transcript', '2nd quarter of transcript', '3rd quarter of transcript', '4th quarter of transcript', 'Last 5 sentences']:
        temp_list = list(df_stat_group.get_group(group_id)['s_value'])
        temp_list.sort()
        q75, q25 = np.percentile(temp_list, [75 ,25])
        box_plot_numbers.append({"key": group_id, "value": {
            "q1": q25, 
            "q3": q75, 
            "interQuantileRange": q75 - q25, 
            "median": np.median(temp_list), 
            "min": min(temp_list),
            "max": max(temp_list)
        }
    })    
    
    return box_plot_data, box_plot_numbers

def get_data_for_viz_lda_topics_matrix():
    data = []
    topics = set()
    words = set()
    max_value = -1

    for i in lda_bow_model.print_topics(num_words=5):
        tn = re.sub('[^a-zA-Z0-9.]', ' ', i[1])
        tnl = tn.split()
        res = [[tnl[i], tnl[i + 1]] for i in range(0, len(tnl), 2)]

        for index, topic_word in enumerate(res):
            topics.add(f"Topic {i[0]+1}")
            words.add(f"Word {index+1}")
            if max_value < float(topic_word[0]):
                max_value = float(topic_word[0])
            data.append({"topic": f"Topic {i[0]+1}", "word": f"Word {index+1}", "value": topic_word[1], "significance_score": float(topic_word[0])})

    topics = sorted(list(topics))
    words = sorted(list(words))
    
    return data, topics, words, max_value

In [None]:
data_for_parts_of_transcript_boxplot = get_data_for_parts_of_transcript_boxplot()
# data_for_parts_of_transcript_boxplot

In [None]:
data_for_viz_sentiment_counts = get_data_for_viz_sentiment_counts()
# data_for_viz_sentiment_counts

In [None]:
data_for_viz_top_10_words_by_tfidf = get_data_for_viz_top_10_words_by_tfidf()
# data_for_viz_top_10_words_by_tfidf

In [None]:
data_for_viz_top_10_words_by_frequency = get_data_for_viz_top_10_words_by_frequency()
# data_for_viz_top_10_words_by_frequency

In [None]:
data_for_viz_prefix_sum_sentiments = get_data_for_viz_prefix_sum_sentiments()
# data_for_viz_prefix_sum_sentiments

In [None]:
data_for_viz_number_of_words = get_data_for_viz_number_of_words()
# data_for_viz_number_of_words

In [None]:
data_for_viz_number_of_characters = get_data_for_viz_number_of_characters()
# data_for_viz_number_of_characters

In [None]:
data_for_viz_number_of_sentences = get_data_for_viz_number_of_sentences()
# data_for_viz_number_of_sentences

In [None]:
data_for_viz_lda_topics_matrix = get_data_for_viz_lda_topics_matrix()
# data_for_viz_lda_topics_matrix

## Generating all visualizations using D3JS

In [None]:
execute_js(
    library_list=[d3_path, code_vis_simple_bar_charts], 
    main_function="viz_number_of_sentences",
    data_dict={"viz_data": data_for_viz_number_of_sentences}
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_simple_bar_charts], 
    main_function="viz_number_of_characters",
    data_dict={"viz_data": data_for_viz_number_of_characters}
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_grouped_bar_chart], 
    main_function="viz_number_of_words",
    data_dict={
        "company_names": data_for_viz_number_of_words[0], 
        "maximum_number_words": data_for_viz_number_of_words[1], 
        "word_types": data_for_viz_number_of_words[2], 
        "grouped_data": data_for_viz_number_of_words[3]
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_small_multiple_area_chart], 
    main_function="viz_prefix_sum_sentiments",
    data_dict={
        "data_for_viz_prefix_sum_sentiments": data_for_viz_prefix_sum_sentiments
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_bubble_chart], 
    main_function="viz_top_10_words",
    data_dict={
        "data_for_viz_top_10_words": data_for_viz_top_10_words_by_frequency,
        "mode": "freq"
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_bubble_chart], 
    main_function="viz_top_10_words",
    data_dict={
        "data_for_viz_top_10_words": data_for_viz_top_10_words_by_tfidf,
        "mode": "tfidf"
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_pie_charts], 
    main_function="viz_sentiment_counts",
    data_dict={
        "data_for_viz_sentiment_counts": data_for_viz_sentiment_counts[0],
        "company_name": data_for_viz_sentiment_counts[1]
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_vis_box_plot], 
    main_function="viz_parts_of_transcript_boxplot",
    data_dict={
        "data_for_parts_of_transcript_boxplot": data_for_parts_of_transcript_boxplot[0],
        "data_for_parts_of_transcript_boxplot_specifics": data_for_parts_of_transcript_boxplot[1],
    }
)

In [None]:
execute_js(
    library_list=[d3_path, code_viz_lda_topics_matrix], 
    main_function="viz_lda_topics",
    data_dict={
        "data": data_for_viz_lda_topics_matrix[0],
        "topics": data_for_viz_lda_topics_matrix[1],
        "words": data_for_viz_lda_topics_matrix[2],
        "max_value": data_for_viz_lda_topics_matrix[3]
    }
)