### Imports

In [1]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
import os
from tabulate import tabulate
from gensim.models.phrases import Phrases, Phraser
nltk.download('stopwords')
import pickle
import itertools  #used for flattening lists of lists
import math
import csv
from help_functions import test_dictionary
from help_functions import get_next_qtr
import matplotlib.pyplot as plt
from collections import Counter



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Stop Words, CSV Files, and Year List, SNL_Map

In [2]:
# Build the stop-word list: NLTK's English stop words minus a few words that
# are kept because they can be meaningful in this corpus.
nltk_stop_words = set(stopwords.words('english'))
words_to_remove = ['below', 'haven']   # words deliberately NOT treated as stop words
stop_words = list(filter(lambda word: word not in words_to_remove, nltk_stop_words))
print(stop_words)

### create list of csv files and years
# os.listdir() returns entries in arbitrary order, but the "Run Models" cell
# labels csv_files[i] with years_list[i]; sorting makes that pairing deterministic.
# NOTE(review): assumes the file names in data/ sort chronologically, one per year — confirm.
csv_files = sorted(os.path.join('data', file) for file in os.listdir('data'))
years_list = list(range(2009, 2025))
print('')
print(csv_files)

### create a look up file to get ciq id and map to related snl id
snl_ciq_map = {}
# newline='' is what the csv module asks for when handing it a file object.
with open('snldata/SNl_CIQ_MAP.csv', mode='r', newline='') as file:
    reader = csv.reader(file)
    next(reader) #skips first line
    for row in reader:
        # Key: column 1 with its first two characters dropped, as int; value: column 0 as int.
        # NOTE(review): presumably the two characters are an ID prefix — confirm against the file.
        snl_ciq_map[int(row[1][2:])] = int(row[0])

### import csv as dataframes for transcript/company ids and loan data/snl ids
delta_df = pd.read_csv('snldata/transcript_loans.csv', encoding='utf-8')
print('')
print(tabulate(delta_df.head(), headers='keys', tablefmt='pretty'))

print('')
print('mapping files have been loaded in')

['before', 'between', 'about', 'o', "shan't", 'now', 'him', 'is', "you've", 'had', 'whom', 'out', "she's", 'wouldn', 'again', 'hasn', "it's", 'ours', 'how', "hadn't", 'shouldn', "don't", 'he', 'their', 'why', 'more', "that'll", 'mustn', 'there', 'i', 'our', 'are', 'was', "mightn't", 'd', 'isn', 'off', 'until', 'can', 'wasn', 'further', "wasn't", 'herself', 'to', 'yours', 'nor', "aren't", 'weren', 'yourselves', 'than', 'too', 'then', 'your', 'over', 'me', 'but', 've', 'doesn', 'aren', 'ain', 'after', 'yourself', "mustn't", 'were', 'don', 'a', 'with', "couldn't", 'couldn', 'his', 'own', "didn't", 'some', "needn't", 'these', 'from', 'through', 'down', "doesn't", 'hers', 'this', 'few', 'all', 'no', "won't", 'here', 'who', "you'd", 'its', 'just', 'it', 't', "isn't", 'will', 'has', 'on', 'under', 'same', 'having', 'in', "weren't", 'she', 'and', 'only', 'such', 'shan', 'didn', 'ourselves', 'does', 'while', 'any', 'because', 'be', 'doing', 'we', 'being', 'the', 'both', 'most', "should've", "yo

## Class Definition for Object Storage

In [3]:
class model_bundles:
    """Holds everything produced for one year and registers itself by year.

    Attributes set on each instance: year, model, uncertainty_wordlist,
    corpus_bigrams, bigram_dict, flat_corpus_bigrams and idf_dict (the result
    of create_idf_dict(self)).
    """

    # Class-level registry shared by all instances: year -> model_bundles instance.
    bundle_dict = {}

    def __init__(self, year, model, uncertainty_wordlist, corpus_bigrams, bigram_dict, flat_corpus_bigrams):
        """Store the given artefacts, compute the IDF table, then register under `year`."""
        self.year, self.model = year, model
        self.uncertainty_wordlist = uncertainty_wordlist
        self.corpus_bigrams, self.flat_corpus_bigrams = corpus_bigrams, flat_corpus_bigrams
        self.bigram_dict = bigram_dict

        # create_idf_dict reads uncertainty_wordlist and bigram_dict, so it runs
        # only after those attributes exist.
        self.idf_dict = create_idf_dict(self)

        # A later bundle for the same year replaces the earlier entry.
        model_bundles.bundle_dict[year] = self

class allyr_model:
    """Holds the model and corpus artefacts for the combined (all-years) corpus.

    Carries the same attributes as model_bundles except `year`, and is not
    added to any registry.
    """

    def __init__(self, model, uncertainty_wordlist, corpus_bigrams, bigram_dict, flat_corpus_bigrams):
        """Store the given artefacts and compute the IDF table from them."""
        self.model = model
        self.uncertainty_wordlist = uncertainty_wordlist
        self.corpus_bigrams, self.flat_corpus_bigrams = corpus_bigrams, flat_corpus_bigrams
        self.bigram_dict = bigram_dict

        # create_idf_dict reads uncertainty_wordlist and bigram_dict set above.
        self.idf_dict = create_idf_dict(self)



## Functions

### Split Text Function

In [4]:
def split_text_to_sentences_words(text):
    """Split raw text into sentences, each represented as a list of word tokens.

    Args:
        text: The text to tokenize. Anything that is not a `str` (for example
            the float NaN pandas uses for an empty CSV cell) is treated as
            having no content.

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        from nltk.word_tokenize. Returns [] for non-string input.
    """
    # get_corpus applies this to a DataFrame column, where missing cells arrive
    # as NaN rather than str, so bail out before handing them to the tokenizer.
    if not isinstance(text, str):
        return []

    # Split into sentences
    sentences = nltk.sent_tokenize(text)

    # Split each sentence into a list of words
    return [nltk.word_tokenize(sentence) for sentence in sentences]

### Get Corpus, Process Data, and Create Bigrams

In [5]:
#takes a single csv file and returns a dictionary transcript_id: [[word, word, word],[word, word, word]]
def get_corpus(csv_file):
    """Read one transcript CSV and group its tokenized text by transcript id.

    The file is read as UTF-8 and must provide TRANSCRIPTID and COMPONENTTEXT
    columns. Each COMPONENTTEXT value is passed through
    split_text_to_sentences_words; rows sharing a TRANSCRIPTID have their
    sentence lists concatenated in row order.

    Returns:
        dict mapping TRANSCRIPTID -> list of sentences (each a list of tokens).
    """
    frame = pd.read_csv(csv_file, encoding='utf-8')
    frame['COMPONENTTEXT_SPLIT'] = frame['COMPONENTTEXT'].apply(split_text_to_sentences_words)

    unprocessed_dict = {}
    for record in frame.itertuples():
        key = record.TRANSCRIPTID
        sentences = record.COMPONENTTEXT_SPLIT
        try:
            # Transcript already seen: append this component's sentences.
            unprocessed_dict[key].extend(sentences)
        except KeyError:
            # First component for this transcript.
            unprocessed_dict[key] = sentences
    return unprocessed_dict

#takes unprocessed dict and returns processed dict
def process_the_data(unprocessed_dict, stop_words):
    """Normalize every token of every transcript and drop stop words.

    Each token is lowercased, then stripped of punctuation: a hyphen with no
    word character on either side is removed, as is any character that is not
    a word character, whitespace or a hyphen. Tokens that end up empty or that
    are in `stop_words` are discarded.

    Note that the stop-word test runs AFTER punctuation stripping, so a token
    such as "don't" is compared as "dont".

    Args:
        unprocessed_dict: transcript id -> list of sentences (lists of tokens).
        stop_words: iterable of stop words to remove.

    Returns:
        dict with the same keys; each value keeps one list per input sentence
        (possibly empty) holding the surviving tokens.
    """
    # Set lookup instead of scanning the caller's container for every token.
    stop_lookup = set(stop_words)
    # Compiled once; removes punctuation but keeps in-word hyphens.
    punctuation = re.compile(r'(?<!\w)-(?!\w)|[^\w\s-]')

    processed_dict = {}
    for transcriptid, text in unprocessed_dict.items():
        p_text = []
        for sentence in text:
            p_sentence = []
            for word in sentence:
                p_word = punctuation.sub('', word.lower())
                # Skip tokens that were punctuation only, and stop words.
                if p_word and p_word not in stop_lookup:
                    p_sentence.append(p_word)
            p_text.append(p_sentence)
        processed_dict[transcriptid] = p_text
    return processed_dict


"""
The function below creates:
corpus_bigrams: list of sentences for the full corpus, to run through word2vec
dict_bigrams: links each transcript id to its bigram text, for the uncertainty calc
flat_corpus_bigrams: flat version of corpus_bigrams, for counting word appearances easily
"""

def create_bigrams(processed_dict, min_count=10, threshold = 100):
    """Fit a gensim phrase model on the processed corpus and apply it to every sentence.

    Args:
        processed_dict: transcript id -> list of token lists.
        min_count: passed to gensim Phrases as `min_count`.
        threshold: passed to gensim Phrases as `threshold`.

    Returns:
        (phrases, bigram_phraser, corpus_bigrams, dict_bigrams, flat_corpus_bigrams)
        where dict_bigrams maps transcript id -> phrased sentences,
        corpus_bigrams is all phrased sentences in one list, and
        flat_corpus_bigrams is corpus_bigrams flattened to single tokens.
        Both training input and corpus_bigrams start with one placeholder
        sentence [''].
    """
    # Training sentences: a placeholder sentence followed by every transcript's sentences.
    training_sentences = [['']] + [
        sentence
        for transcript_sentences in processed_dict.values()
        for sentence in transcript_sentences
    ]

    phrases = Phrases(training_sentences, min_count=min_count, threshold=threshold, scoring='default')
    bigram_phraser = Phraser(phrases)

    # Apply the phraser per transcript so ids stay linked to their text.
    dict_bigrams = {
        transcript_id: [bigram_phraser[sentence] for sentence in text]
        for transcript_id, text in processed_dict.items()
    }

    corpus_bigrams = [['']]
    for bigram_text in dict_bigrams.values():
        corpus_bigrams.extend(bigram_text)

    flat_corpus_bigrams = []
    for sentence in corpus_bigrams:
        flat_corpus_bigrams.extend(sentence)

    return phrases, bigram_phraser, corpus_bigrams, dict_bigrams, flat_corpus_bigrams



### Create IDF dictionary

In [6]:
def create_idf_dict(bundle):
    """Compute log2 inverse document frequency for each uncertainty word.

    A "document" is one transcript in `bundle.bigram_dict`. For a word found in
    `t` of the `n` transcripts the value is log2(n / t).

    Args:
        bundle: object exposing `bigram_dict` (transcript id -> list of token
            lists) and `uncertainty_wordlist` (iterable of words).

    Returns:
        dict mapping each word in `uncertainty_wordlist` to its IDF. A word
        that occurs in no transcript gets 0.0 instead of raising
        ZeroDivisionError; its term frequency is zero in every transcript of
        this bundle, so the value never contributes to a tf-idf sum here.
    """
    n = len(bundle.bigram_dict)

    # Flatten each transcript to a vocabulary set once, rather than rebuilding
    # and scanning the flattened token list for every word.
    transcript_vocabularies = [
        set(itertools.chain.from_iterable(bigrams))
        for bigrams in bundle.bigram_dict.values()
    ]

    idf_dict = {}
    for word in bundle.uncertainty_wordlist:
        t = sum(1 for vocabulary in transcript_vocabularies if word in vocabulary)
        idf_dict[word] = math.log2(n / t) if t else 0.0

    return idf_dict



### Calculate Uncertainty scores - regular and soto variant

In [7]:
def calc_uncertainty_reg(bundle, delta_df, column_name):
    """Write the share of uncertainty tokens per transcript into `delta_df`.

    For every transcript in `bundle.bigram_dict` the score is
    (tokens that appear in bundle.uncertainty_wordlist) / (total tokens).
    The score is assigned in place to `column_name` on the rows of `delta_df`
    whose 'transcript_id' equals the transcript id.

    Transcripts with no tokens are skipped (their rows are left untouched)
    instead of raising ZeroDivisionError.

    Returns:
        None; `delta_df` is modified in place.
    """
    # Set lookup instead of scanning the word list for every token.
    uncertainty_terms = set(bundle.uncertainty_wordlist)

    for transcript_id, bigrams in bundle.bigram_dict.items():
        transcript_text = list(itertools.chain.from_iterable(bigrams))
        if not transcript_text:
            continue

        uncty_cnt = sum(1 for word in transcript_text if word in uncertainty_terms)
        uncty_score = uncty_cnt / len(transcript_text)
        try:
            delta_df.loc[delta_df['transcript_id'] == transcript_id, column_name] = uncty_score
        except KeyError:
            # Kept from the original best-effort behaviour: a KeyError here
            # (e.g. no 'transcript_id' column) is ignored for this transcript.
            continue

def calc_uncertainty_soto(bundle, delta_df, column_name):
    """Write a tf-idf based uncertainty score per transcript into `delta_df`.

    For every transcript in `bundle.bigram_dict`:
        score = sum over words in bundle.uncertainty_wordlist that occur in the
                transcript of (count / total tokens) * bundle.idf_dict[word],
                divided by the number of distinct tokens in the transcript.
    The score is assigned in place to `column_name` on the rows of `delta_df`
    whose 'transcript_id' equals the transcript id.

    The word list and IDF table come from `bundle` itself, so per-year bundles
    are scored with their own vocabulary rather than a global model's.
    Transcripts with no tokens are skipped instead of raising ZeroDivisionError.

    Returns:
        None; `delta_df` is modified in place.
    """
    for transcript_id, bigrams in bundle.bigram_dict.items():
        word_count_dict = Counter(itertools.chain.from_iterable(bigrams))
        d = sum(word_count_dict.values())   # total tokens in the transcript
        if d == 0:
            continue

        tf_idf_sum = 0
        for word in bundle.uncertainty_wordlist:
            t = word_count_dict.get(word, 0)
            if t:
                tf = t / d
                tf_idf_sum += tf * bundle.idf_dict[word]

        # len(word_count_dict) is the number of distinct tokens.
        uncty_score = tf_idf_sum / len(word_count_dict)
        try:
            delta_df.loc[delta_df['transcript_id'] == transcript_id, column_name] = uncty_score
        except KeyError:
            # Kept from the original best-effort behaviour: a KeyError here
            # (e.g. no 'transcript_id' column) is ignored for this transcript.
            continue



### Extend yearly corpus to full corpus

In [8]:
def extend_to_full(model_bundle_dict):
    """Concatenate the per-year corpora of all bundles into all-years structures.

    Args:
        model_bundle_dict: mapping whose values expose `corpus_bigrams`,
            `flat_corpus_bigrams` and `bigram_dict`. Values are visited in the
            mapping's iteration order.

    Returns:
        (corpus_bigrams_allyrs, flat_corpus_allyrs, bigram_dict_allyrs): the
        two lists are the per-bundle lists joined end to end; the dict is the
        union of the per-bundle dicts, later bundles overwriting equal keys.
    """
    bundles = list(model_bundle_dict.values())

    corpus_bigrams_allyrs = [sentence for bundle in bundles for sentence in bundle.corpus_bigrams]
    flat_corpus_allyrs = [token for bundle in bundles for token in bundle.flat_corpus_bigrams]

    bigram_dict_allyrs = {}
    for bundle in bundles:
        bigram_dict_allyrs.update(bundle.bigram_dict)

    return corpus_bigrams_allyrs, flat_corpus_allyrs, bigram_dict_allyrs


### Word2Vec Model

In [9]:

def apply_word2vec(corpus_bigrams):
    """Build a vocabulary from `corpus_bigrams` and train a gensim Word2Vec model on it.

    Args:
        corpus_bigrams: list of sentences (lists of tokens); its length is
            passed to training as `total_examples`.

    Returns:
        The trained gensim Word2Vec model.
    """
    hyperparameters = dict(
        vector_size=150,  # number of features in each word vector
        window=10,        # context window size (in each direction)
        min_count=5,      # words must appear this many times to be in the vocab
        workers=10,       # training thread count
        sg=1,             # 0: CBOW, 1: skip-gram
        hs=0,             # 0: negative sampling, 1: hierarchical softmax
        negative=5,       # number of negative samples
    )
    model = gensim.models.Word2Vec(**hyperparameters)

    # progress_per tweaks how often progress is reported while scanning the corpus.
    model.build_vocab(corpus_bigrams, progress_per=20000)

    # 10 training passes; progress reported every 10 seconds.
    model.train(
        corpus_bigrams,
        total_examples=len(corpus_bigrams),
        epochs=10,
        report_delay=10.0,
    )

    return model

## Run Models

In [10]:
# Train one Word2Vec model per CSV file and register a model_bundles entry for it.
# File number i is labelled years_list[i], so the order of csv_files decides
# which year each corpus is stored under.
for index, file in enumerate(csv_files):
    corpus_dict = get_corpus(file)
    processed_corpus_dict = process_the_data(corpus_dict, stop_words)
    phrases, bigram_phraser, corpus_bigrams, bigram_dict, flat_corpus_bigrams = create_bigrams(processed_corpus_dict)
    trained_model = apply_word2vec(corpus_bigrams)

    # The 100 nearest neighbours of 'uncertainty' become this file's uncertainty word list.
    similar_words = trained_model.wv.most_similar('uncertainty', topn=100)
    word_list = [word for word, number in similar_words]

    # Instantiating the bundle stores it in model_bundles.bundle_dict under its year.
    model_bundles(year=years_list[index], model=trained_model, uncertainty_wordlist=word_list,
               corpus_bigrams=corpus_bigrams, bigram_dict=bigram_dict, flat_corpus_bigrams = flat_corpus_bigrams)




print('done!!')

done!!


In [11]:
# Combine every registered yearly bundle into one corpus and train a single model on it.
corpus_bigrams_allyrs,flat_corpus_allyrs,bigram_dict_allyrs = extend_to_full(model_bundles.bundle_dict)
# Note: this rebinds trained_model, similar_words and word_list from the per-year loop.
trained_model = apply_word2vec(corpus_bigrams_allyrs)
# The all-years word list uses the top 150 neighbours (the per-year lists use 100).
similar_words = trained_model.wv.most_similar('uncertainty', topn=150)
word_list = [word for word, number in similar_words]

full_model = allyr_model(model=trained_model, uncertainty_wordlist=word_list,
               corpus_bigrams=corpus_bigrams_allyrs, bigram_dict = bigram_dict_allyrs, flat_corpus_bigrams = flat_corpus_allyrs)

print('done!!')

done!!


## Calc Uncertainty Scores

In [38]:
# This cell used to repeat the body of calc_uncertainty_reg inline, once for the
# yearly bundles and once for the all-years model; it now calls the helper so
# the scoring logic lives in one place. Target columns are unchanged.
for year, bundle in model_bundles.bundle_dict.items():
    calc_uncertainty_reg(bundle, delta_df, 'year_uncertainty_score')

calc_uncertainty_reg(full_model, delta_df, 'full_uncertainty_score')




In [12]:
# Fill the four score columns of delta_df. The call order below is also the
# order in which the columns are first written.

# Plain share-of-uncertainty-tokens score, per yearly bundle ...
for year, bundle in model_bundles.bundle_dict.items():
    calc_uncertainty_reg(bundle, delta_df, 'year_uncertainty_score_reg')


# ... and for the all-years model.
calc_uncertainty_reg(full_model, delta_df, 'full_uncertainty_score_reg')

# tf-idf ("soto") variant, per yearly bundle ...
for year, bundle in model_bundles.bundle_dict.items():
    calc_uncertainty_soto(bundle, delta_df, 'year_uncertainty_score_soto')


# ... and for the all-years model.
calc_uncertainty_soto(full_model, delta_df, 'full_uncertainty_score_soto')

In [29]:
# for year, bundle in model_bundles.bundle_dict.items():
#     idf_dict = {}
#     n = len(bundle.bigram_dict)
#     for word in bundle.uncertainty_wordlist:
#         t = 0
#         for transcript_id, bigrams in bundle.bigram_dict.items():
#             transcript_text = list(itertools.chain.from_iterable(bigrams))
#             if word in transcript_text:
#                 t+=1

#         idf= math.log2(n/t)
#         idf_dict[word] = idf

#     bundle.idf_dict = idf_dict


# idf_dict = {}
# n = len(full_model.bigram_dict)
# for word in full_model.uncertainty_wordlist:
#     t = 0
#     for transcript_id, bigrams in full_model.bigram_dict.items():
#         transcript_text = list(itertools.chain.from_iterable(bigrams))
#         if word in transcript_text:
#             t+=1

#     idf= math.log2(n/t)
#     idf_dict[word] = idf

# full_model.idf_dict = idf_dict



In [45]:
# Inline tf-idf ("soto") scoring for the yearly bundles and the all-years model.
# The original cell wrote the per-year scores to 'full_uncertainty_score_soto'
# and the all-years scores to 'year_uncertainty_score_soto'; the labels are
# now matched to the source, as in the calc_uncertainty_soto calls.
soto_targets = [(bundle, 'year_uncertainty_score_soto') for bundle in model_bundles.bundle_dict.values()]
soto_targets.append((full_model, 'full_uncertainty_score_soto'))

for bundle, soto_column in soto_targets:
    for transcript_id, bigrams in bundle.bigram_dict.items():
        word_count_dict = Counter(itertools.chain.from_iterable(bigrams))
        d = sum(word_count_dict.values())   # total tokens in the transcript
        if d == 0:
            # An empty transcript has no defined score (and would divide by zero).
            continue

        tf_idf_sum = 0
        for word in bundle.uncertainty_wordlist:
            t = word_count_dict.get(word, 0)
            if t:
                tf = t/d
                idf = bundle.idf_dict[word]
                tf_idf_sum += tf*idf

        # Normalise by the number of distinct tokens in the transcript.
        uncty_score = tf_idf_sum/len(word_count_dict)
        try:
            delta_df.loc[delta_df['transcript_id'] == transcript_id, soto_column] = uncty_score
        except KeyError:
            # Best-effort, as before: ignore a KeyError for this transcript.
            continue

In [13]:
# Show the first rows of delta_df, including the score columns written above.
print(tabulate(delta_df.head(), headers='keys', tablefmt='pretty'))

+---+---------------+----------+---------+---------+------+--------------------+-------------------+-------------+------------+--------------------+-----------+----------------------------+----------------------------+-----------------------------+-----------------------------+
|   | transcript_id |  ciq_id  | snl_id  | quarter | year | total_asset_before | total_asset_after | loan_before | loan_after |     l2a_delta      | loan_diff | year_uncertainty_score_reg | full_uncertainty_score_reg | year_uncertainty_score_soto | full_uncertainty_score_soto |
+---+---------------+----------+---------+---------+------+--------------------+-------------------+-------------+------------+--------------------+-----------+----------------------------+----------------------------+-----------------------------+-----------------------------+
| 0 |    1790209    | 13314302 | 4055785 | Q4 2019 | 2019 |      12269288      |     12159919      |  10240434   |  10500284  | 0.0288763191369147 |  259850   |   

## Get Word Lists

In [None]:
# Print each year's uncertainty word list.
# `model_year` is not defined anywhere in this notebook; the year -> bundle
# registry populated by the "Run Models" cell is model_bundles.bundle_dict.
for key, value in model_bundles.bundle_dict.items():

    print(f'{key}: {value.uncertainty_wordlist}')

In [None]:
# idf_scores = {}
# n = len(years_list)

# for object in model_year.years.values():
#     for word in object.uncertainty_wordlist:
#         if word not in idf_scores.keys():
#             count = 0

#             for object in model_year.years.values():
#                 if word in object.flat_corpus_bigrams:
#                     count += 1



#         # print(f'{word}:({n}/{1+count})')
#         idf = math.log(n/(1+count))
#         idf_scores[word] = idf

# print(idf_scores)

# IDF of each uncertainty word, treating every year's word list as one "document":
# idf = ln(n / (1 + number of yearly word lists containing the word)).
# Fixes relative to the original cell:
#   * the count is computed for every word that gets a score (previously a word
#     that was already scored was overwritten using the previous word's count);
#   * the inner loop no longer rebinds the outer loop variable / builtin `object`;
#   * `model_year` is not defined in this notebook, so the year -> bundle
#     registry model_bundles.bundle_dict is used.
idf_scores = {}
n = len(years_list)
yearly_bundles = list(model_bundles.bundle_dict.values())

for bundle in yearly_bundles:
    for word in bundle.uncertainty_wordlist:
        if word in idf_scores:
            continue

        count = sum(1 for other in yearly_bundles if word in other.uncertainty_wordlist)

        print(f'{word}:({n}/{1+count})')
        idf_scores[word] = math.log(n/(1+count))

# print(idf_scores)



In [None]:
# Empty year x word table: one row per year, one column per scored word.
# tfidf_df and empty_df are two names for the SAME DataFrame object.
tfidf_df = empty_df = pd.DataFrame(index=years_list, columns=idf_scores.keys())
print(tfidf_df.head())

In [12]:
from collections import Counter


In [None]:
# Union of all yearly uncertainty word lists, in first-seen order.
# `model_year` is not defined in this notebook; model_bundles.bundle_dict is the
# year -> bundle registry. A companion set avoids rescanning the growing list.
combined_uncertainty_wordlist = []
seen_uncertainty_words = set()

for year in years_list:
    for uncertainty_word in model_bundles.bundle_dict[year].uncertainty_wordlist:
        if uncertainty_word not in seen_uncertainty_words:
            seen_uncertainty_words.add(uncertainty_word)
            combined_uncertainty_wordlist.append(uncertainty_word)

print(len(combined_uncertainty_wordlist))

In [14]:
## count total instances of word
for year in years_list:
    word_counts = Counter((model_year.years[year].flat_corpus_bigrams))
    for uncertainty_word in combined_uncertainty_wordlist:
        count = word_counts[uncertainty_word]
        tfidf_df.at[year, uncertainty_word] = count

In [None]:
# Show the years used as row labels / bundle keys.
print(years_list)

In [None]:
# For each year, print the uncertainty words whose count in that year is more
# than 3x the word's mean count across all years in tfidf_df.
# `model_year` is not defined in this notebook; model_bundles.bundle_dict is the
# year -> bundle registry.
for year in years_list:
    print(year)
    for word in model_bundles.bundle_dict[year].uncertainty_wordlist:
        average_count = tfidf_df[word].mean()
        year_count = tfidf_df.at[year, word]

        if year_count/average_count > 3:
            print(f"{word} | {year_count} / {average_count}")
    print()
    print('---')

In [None]:
# Show the tokens surrounding the first occurrence of 'qt' in the 2024 corpus.
# Fixes relative to the original cell:
#   * `start`/`end` were computed but never used, and `end` was capped with the
#     length of word_list rather than of the corpus being sliced;
#   * slicing from `index - 10` directly goes negative (and wraps around) when
#     the token is among the first ten;
#   * `model_year` is not defined in this notebook, so the year -> bundle
#     registry model_bundles.bundle_dict is used.
flat_tokens_2024 = model_bundles.bundle_dict[2024].flat_corpus_bigrams
index = flat_tokens_2024.index('qt')
print(index)
start = max(0, index - 10)
end = min(len(flat_tokens_2024), index + 11)   # 10 tokens before, the token itself, 10 after

print(flat_tokens_2024[start:end])

In [30]:
# Overwrite tfidf_df with tf-idf values, where tf is the word's share of the
# year's uncertainty word list and idf comes from idf_scores.
# `model_year` is not defined in this notebook; model_bundles.bundle_dict is the
# year -> bundle registry. list.count replaces the manual counting loop.
for word in tfidf_df.columns:
    idf = idf_scores[word]
    for year in years_list:
        year_wordlist = model_bundles.bundle_dict[year].uncertainty_wordlist

        doc_length = len(year_wordlist)
        count = year_wordlist.count(word)

        tf = count/doc_length
        tf_idf = tf * idf

        tfidf_df.at[year, word] = tf_idf







In [None]:
# Show the first rows of the year x word table.
print(tfidf_df.head())

In [None]:
# def top_10_columns_by_row(row):
#     # Sort values in descending order and get top 10 columns
#     return row.nlargest(10).index.tolist()

# df['top_10_columns'] = tfidf_df.apply(top_10_columns_by_row, axis=1)
# print(df[['top_10_columns']])

# For every row (year) of tfidf_df, list its highest-valued columns (words).
for index, row in tfidf_df.iterrows():
    # Sort row values in descending order and print the top 20 with column names
    print(f"Row {index}:")
    sorted_row = row.sort_values(ascending=False)

    # Print column names and values for the top 20 items
    for col, value in sorted_row.head(20).items():
        print(f"{col}, Value: {value}")

In [None]:
# Peek at the 2010 corpus: sentences 1-4 (index 0 is the [''] placeholder that
# create_bigrams puts first) and the total number of sentences.
# `model_year` is not defined in this notebook; model_bundles.bundle_dict is the
# year -> bundle registry.
print(model_bundles.bundle_dict[2010].corpus_bigrams[1:5])
print(len(model_bundles.bundle_dict[2010].corpus_bigrams))

In [26]:
# scores[year][i] = number of yearly word lists that contain the i-th word of
# that year's uncertainty word list.
# Fixes relative to the original cell:
#   * the inner loop reused the name `model`, rebinding the outer loop variable
#     while it was still being iterated over;
#   * `model_year` is not defined in this notebook, so the year -> bundle
#     registry model_bundles.bundle_dict is used.
scores = {}

for year, bundle in model_bundles.bundle_dict.items():
    scores[year] = [
        sum(1 for other in model_bundles.bundle_dict.values() if word in other.uncertainty_wordlist)
        for word in bundle.uncertainty_wordlist
    ]


In [None]:
# Print, per year, the uncertainty words that appear in fewer than two yearly
# word lists (i.e. only in that year's own list).
# `model_year` is not defined in this notebook; model_bundles.bundle_dict is the
# year -> bundle registry.
for year, score_list in scores.items():
    print(year)
    for index, score in enumerate(score_list):
        if score < 2:
            print(f'{model_bundles.bundle_dict[year].uncertainty_wordlist[index]} : {score}')
    print()
    print()

In [None]:
# List the 100 nearest neighbours of 'uncertainty' in whatever model
# `trained_model` currently refers to.
# Note: this rebinds the globals similar_words and word_list.
similar_words = trained_model.wv.most_similar('uncertainty', topn=100)
word_list = [word for word, number in similar_words]

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    print(f"{word}: Similarity = {similarity:.4f}")

In [None]:


# Scatter plot of uncertainty score vs loan movement
# The scoring cells write 'year_uncertainty_score_reg', 'full_uncertainty_score_reg',
# 'year_uncertainty_score_soto' and 'full_uncertainty_score_soto'; no cell in this
# notebook creates a column named 'uncertainty_score', so the column is now chosen
# explicitly.
# NOTE(review): confirm which of the four scores this plot is meant to show.
score_column = 'full_uncertainty_score_reg'

plt.scatter(delta_df[score_column], delta_df['l2a_delta'])
plt.xlabel('Uncertainty Score')
plt.ylabel('Loan Movement')
plt.title('Uncertainty Score vs Loan Movement')
plt.show()

correlation = delta_df[score_column].corr(delta_df['l2a_delta'])
print(f"Correlation between Uncertainty Score and Loan Movement: {correlation}")