### Imports

In [1]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
import os
from tabulate import tabulate
from gensim.models.phrases import Phrases, Phraser
nltk.download('stopwords')
import pickle
import itertools  #used for flattening lists of lists
import math




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Stop Words, CSV Files, and Year List

In [2]:
nltk_stop_words = set(stopwords.words('english'))
words_to_remove = ['below', 'haven']
stop_words = list(filter(lambda word: word not in words_to_remove, nltk_stop_words))
print(stop_words)


csv_files = [os.path.join('data', file) for file in os.listdir('data')]
years_list = list(range(2009, 2025))
print('')
print(csv_files)


['after', 'he', "didn't", "that'll", 'whom', 'shan', 'hadn', "hasn't", 'about', 'she', 'yours', 'herself', 'only', 'had', "you'll", 'if', 'from', "mustn't", 'been', 'o', 'doesn', 'hasn', 'over', "wouldn't", 'of', 'we', 'more', 'further', 'in', 'and', 'which', 'yourselves', "doesn't", 'him', "couldn't", 'such', 'her', "aren't", 'between', "should've", 'off', 'to', 'yourself', 'that', 'do', 'few', 'a', 'because', 'i', 'is', 'has', 'who', 'any', 'having', 'again', 'all', 'aren', 'too', 'are', 'above', 'wouldn', 'at', 'against', 'during', 'before', 'up', 'then', 'ma', "mightn't", "haven't", 'what', 'where', 'ain', 'our', 'on', 'so', 'both', 'there', 'an', 'as', 'why', 'wasn', 'be', 'into', 'not', 'but', "won't", 'some', 'this', 'my', 'or', 'me', "it's", 'those', 'should', "needn't", "don't", 'once', 've', 'than', 'myself', 'same', "you've", 'doing', 'with', 'weren', 'ours', 'them', 'themselves', "hadn't", 'won', "weren't", 'm', 'hers', 'theirs', "she's", 'shouldn', 'mightn', 'when', 'will'

## Class Definition for Object Storage

In [7]:
class model_year:
  years = {}

  def __init__(self, year, model, uncertainty_wordlist, corpus_bigrams, bigram_dict, flat_corpus_bigrams):

    self.year = year
    self.model = model
    self.uncertainty_wordlist = uncertainty_wordlist
    self.corpus_bigrams = corpus_bigrams
    self.bigram_dict = bigram_dict
    self.flat_corpus_bigrams = flat_corpus_bigrams

    model_year.years[year] = self



## Functions

### Split Text Function

In [None]:
def split_text_to_sentences_words(text):
    # Split into sentences
    sentences = nltk.sent_tokenize(text)

    # Split each sentence into a list of words
    sentences_words = [nltk.word_tokenize(sentence) for sentence in sentences]

    return sentences_words

### Get Corpus, Process Data, and Create Bigrams

In [8]:


def get_corpus(csv_file):

   df = pd.read_csv(csv_file, encoding='utf-8')

   df['COMPONENTTEXT_SPLIT'] = df['COMPONENTTEXT'].apply(split_text_to_sentences_words)
   # print(f'split text has been applied:')
   # print(tabulate(df.head(2), headers='keys', tablefmt='pretty'))

   unprocessed_dict = {}
   for row in df.itertuples():
      if row.TRANSCRIPTID not in unprocessed_dict:
         unprocessed_dict[row.TRANSCRIPTID] = row.COMPONENTTEXT_SPLIT
      else:
         unprocessed_dict[row.TRANSCRIPTID].extend(row.COMPONENTTEXT_SPLIT)
   return(unprocessed_dict)

def process_the_data(unprocessed_dict, stop_words):
   num_tokens_before = 0
   num_tokens_after = 0
   processed_dict = {}

   for transcriptid, text in unprocessed_dict.items():
      p_text = []
      for sentence in text:
         p_sentence = []
         for word in sentence:
            num_tokens_before += 1
            p_word = word.lower()                                  #lowercase the text
            p_word = re.sub(r'(?<!\w)-(?!\w)|[^\w\s-]', '', p_word)  #remove punctuation but keep hyphens
            if p_word in stop_words or len(p_word) == 0:             #ignore if word has no length (ie was punctuation only) or in stop words
               continue
            p_sentence.append(p_word)
         p_text.append(p_sentence)
         num_tokens_after += len(p_sentence)
      processed_dict[transcriptid] = p_text

   # print(f'Number of Tokens before processing: {num_tokens_before:,}')
   # print(f'Number of Tokens after processing: {num_tokens_after:,}')
   # print(f'Process Sentence Examples:')

   # for index, (key, value) in enumerate(processed_dict.items()):
   #    print(f'{key}: {value}')
   #    if index == 2:
   #       break

   return processed_dict

def create_bigrams(processed_dict, min_count=10, threshold = 100):

   processed_corpus = [['']]
   for value in processed_dict.values():
      processed_corpus.extend(value)

   phrases = Phrases(processed_corpus, min_count, threshold, scoring='default')
   bigram_phraser = Phraser(phrases)

   corpus_bigrams = [['']]

   dict_bigrams = {}
   for transcript_id, text in processed_dict.items():
      bigram_text = [bigram_phraser[sentence] for sentence in text]
      dict_bigrams[transcript_id] = bigram_text
      corpus_bigrams.extend(bigram_text)


   flat_corpus_bigrams = [item for sublist in corpus_bigrams for item in sublist]

   # print('bigrams created')
   return phrases, bigram_phraser, corpus_bigrams, dict_bigrams, flat_corpus_bigrams



### Word2Vec Model

In [9]:

def apply_word2vec(corpus_bigrams):
   model = gensim.models.Word2Vec (
    vector_size=150,    # Number of features in word vector

    window=10,   # Context window size (in each direction). Default is 5


    min_count=5, # Words must appear this many times to be in vocab.
                 #   Default is 5

    workers=10,  # Training thread count

    sg=1,        # 0: CBOW, 1: Skip-gram.

    hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
                 #   Default is 0, NS

    negative=5   # Nmber of negative samples
                 #   Default is 5
   )

   model.build_vocab(
    corpus_bigrams,
    progress_per=20000  # Tweaks how often progress is reported
   )

   print('Training the model...')

   model.train(
    corpus_bigrams,
    total_examples=len(corpus_bigrams),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
   )

   print(' Modeling Training Done.')
   print('')

   return(model)

## Run Models

In [10]:
for index, file in enumerate(csv_files):
    corpus_dict = get_corpus(file)
    processed_corpus_dict = process_the_data(corpus_dict, stop_words)
    phrases, bigram_phraser, corpus_bigrams, dict_bigrams, flat_corpus_bigrams = create_bigrams(processed_corpus_dict)
    trained_model = apply_word2vec(corpus_bigrams)

    similar_words = trained_model.wv.most_similar('uncertainty', topn=100)
    word_list = [word for word, number in similar_words]


    model_year(year=years_list[index], model=trained_model, uncertainty_wordlist=word_list,
               corpus_bigrams=corpus_bigrams, bigram_dict = dict_bigrams, flat_corpus_bigrams = flat_corpus_bigrams)


print('done!!')

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

Training the model...
 Modeling Training Done.

done!!


## Get Word Lists

In [11]:
for key, value in model_year.years.items():

    print(f'{key}: {value.uncertainty_wordlist}')

2009: ['turbulence', 'obstacles', 'looming', 'removes', 'safety', 'harsh', 'keenly', 'instability', 'stressful', 'rebounds', 'economic', 'crystal_ball', 'evolution', 'turndown', 'amidst', 'guarded', 'governed', 'prudent', 'stimulus', 'disruptions', 'overriding', 'exists', 'monetary', 'uncertain', 'unforeseen', 'confidence', 'poised', 'certainty', 'unknowns', 'waters', 'toll', 'predictability', 'dissatisfied', 'healthier', 'dire', 'unpredictable', 'gratifying', 'fear', 'mid-west', 'precision', 'terrible', 'plaguing', 'accurately', 'relevance', 'faces', 'political', 'prudence', 'escalating', 'navigating', 'macroeconomic', 'firmly', 'turbulent', 'illiquidity', 'confront', 'distinguish', 'monetary_fiscal', 'demands', 'unsettled', 'stagnant', 'eminent', 'unprecedented', 'predictions', 'fluid', 'differentiates', 'onerous', 'latitude', 'ratchet', 'prevalent', 'grateful', 'prevailing', 'fragile', 'feelings', 'hesitant', 'abundance', 'assurance', 'characterization', 'questioning', 'restore', 'l

In [32]:
# idf_scores = {}
# n = len(years_list)

# for object in model_year.years.values():
#     for word in object.uncertainty_wordlist:
#         if word not in idf_scores.keys():
#             count = 0

#             for object in model_year.years.values():
#                 if word in object.flat_corpus_bigrams:
#                     count += 1



#         # print(f'{word}:({n}/{1+count})')
#         idf = math.log(n/(1+count))
#         idf_scores[word] = idf

# print(idf_scores)

idf_scores = {}
n = len(years_list)

for object in model_year.years.values():
    for word in object.uncertainty_wordlist:
        if word not in idf_scores.keys():
            count = 0

            for object in model_year.years.values():
                if word in object.uncertainty_wordlist:
                    count += 1

        print(f'{word}:({n}/{1+count})')
        idf = math.log(n/(1+count))
        idf_scores[word] = idf

# print(idf_scores)



turbulence:(16/5)
obstacles:(16/4)
looming:(16/7)
removes:(16/2)
safety:(16/2)
harsh:(16/4)
keenly:(16/2)
instability:(16/4)
stressful:(16/4)
rebounds:(16/3)
economic:(16/15)
crystal_ball:(16/2)
evolution:(16/2)
turndown:(16/2)
amidst:(16/3)
guarded:(16/5)
governed:(16/2)
prudent:(16/3)
stimulus:(16/4)
disruptions:(16/6)
overriding:(16/2)
exists:(16/8)
monetary:(16/9)
uncertain:(16/17)
unforeseen:(16/8)
confidence:(16/5)
poised:(16/2)
certainty:(16/11)
unknowns:(16/9)
waters:(16/2)
toll:(16/3)
predictability:(16/2)
dissatisfied:(16/2)
healthier:(16/3)
dire:(16/2)
unpredictable:(16/9)
gratifying:(16/2)
fear:(16/4)
mid-west:(16/2)
precision:(16/7)
terrible:(16/2)
plaguing:(16/2)
accurately:(16/3)
relevance:(16/2)
faces:(16/7)
political:(16/11)
prudence:(16/4)
escalating:(16/3)
navigating:(16/4)
macroeconomic:(16/13)
firmly:(16/2)
turbulent:(16/4)
illiquidity:(16/2)
confront:(16/2)
distinguish:(16/2)
monetary_fiscal:(16/2)
demands:(16/2)
unsettled:(16/5)
stagnant:(16/2)
eminent:(16/2)
unp

In [40]:
tfidf_df = empty_df = pd.DataFrame(index=years_list, columns=idf_scores.keys())
print(tfidf_df.head())

     turbulence obstacles looming removes safety harsh keenly instability  \
2009        NaN       NaN     NaN     NaN    NaN   NaN    NaN         NaN   
2010        NaN       NaN     NaN     NaN    NaN   NaN    NaN         NaN   
2011        NaN       NaN     NaN     NaN    NaN   NaN    NaN         NaN   
2012        NaN       NaN     NaN     NaN    NaN   NaN    NaN         NaN   
2013        NaN       NaN     NaN     NaN    NaN   NaN    NaN         NaN   

     stressful rebounds  ... awful arise rearview_mirror communicating  \
2009       NaN      NaN  ...   NaN   NaN             NaN           NaN   
2010       NaN      NaN  ...   NaN   NaN             NaN           NaN   
2011       NaN      NaN  ...   NaN   NaN             NaN           NaN   
2012       NaN      NaN  ...   NaN   NaN             NaN           NaN   
2013       NaN      NaN  ...   NaN   NaN             NaN           NaN   

     well-positioned disruption agnostic cleaner materializing navigated  
2009             

In [34]:
from collections import Counter


In [47]:
combined_uncertainty_wordlist = []

for year in years_list:
    for uncertainty_word in model_year.years[year].uncertainty_wordlist:
        if uncertainty_word not in combined_uncertainty_wordlist:
            combined_uncertainty_wordlist.append(uncertainty_word)

print(len(combined_uncertainty_wordlist))

920


In [48]:
## count total instances of word
for year in years_list:
    word_counts = Counter((model_year.years[year].flat_corpus_bigrams))
    for uncertainty_word in combined_uncertainty_wordlist:
        count = word_counts[uncertainty_word]
        tfidf_df.at[year, uncertainty_word] = count

In [52]:
for years in years_list:
    for word in model_year.years[year].uncertainty_wordlist:
        average_count = df['word'].mean()
        year_count

        if

    # Sort row values in descending order and print top 10 with column names
    print(f"Row {index}:")
    sorted_row = row.sort_values(ascending=False)

    # Print column names and values for the top 10 items
    for col, value in sorted_row.head(20).items():
        print(f"{col}, Value: {value}")

    print()
    print('---')

2009     0
2010     0
2011     0
2012     0
2013     0
2014     4
2015     1
2016     0
2017     1
2018     0
2019     0
2020     0
2021     0
2022    44
2023     8
2024     0
Name: ukraine, dtype: object


In [30]:
for word in tfidf_df.columns:
    for year in years_list:
        idf = idf_scores[word]

        doc_length = len(model_year.years[year].uncertainty_wordlist)
        count = 0

        for checked_word in model_year.years[year].uncertainty_wordlist:
            if word == checked_word:
                count+=1

        tf = count/doc_length
        tf_idf = tf * idf

        tfidf_df.at[year, word] = tf_idf







In [19]:
print(tfidf_df.head())

     turbulence obstacles   looming removes    safety harsh    keenly  \
2009  -0.000001  0.000001  0.000002     0.0 -0.000002   0.0 -0.000001   
2010       -0.0  0.000001  0.000001     0.0 -0.000001   0.0      -0.0   
2011       -0.0       0.0  0.000001     0.0 -0.000001   0.0      -0.0   
2012       -0.0  0.000001  0.000001     0.0      -0.0   0.0      -0.0   
2013       -0.0       0.0  0.000001     0.0      -0.0   0.0      -0.0   

     instability stressful rebounds  ...     awful     arise rearview_mirror  \
2009    0.000016 -0.000001      0.0  ... -0.000001 -0.000003             0.0   
2010    0.000003      -0.0      0.0  ...      -0.0 -0.000002             0.0   
2011    0.000004      -0.0      0.0  ...      -0.0 -0.000002             0.0   
2012         0.0      -0.0      0.0  ...      -0.0 -0.000002        0.000005   
2013    0.000001      -0.0      0.0  ...      -0.0 -0.000001             0.0   

     communicating well-positioned disruption agnostic   cleaner  \
2009        

In [31]:
# def top_10_columns_by_row(row):
#     # Sort values in descending order and get top 10 columns
#     return row.nlargest(10).index.tolist()

# df['top_10_columns'] = tfidf_df.apply(top_10_columns_by_row, axis=1)
# print(df[['top_10_columns']])

for index, row in tfidf_df.iterrows():
    # Sort row values in descending order and print top 10 with column names
    print(f"Row {index}:")
    sorted_row = row.sort_values(ascending=False)

    # Print column names and values for the top 10 items
    for col, value in sorted_row.head(20).items():
        print(f"{col}, Value: {value}")

Row 2009:
turbulence, Value: 0.020794415416798356
assurance, Value: 0.020794415416798356
hesitant, Value: 0.020794415416798356
feelings, Value: 0.020794415416798356
fragile, Value: 0.020794415416798356
prevailing, Value: 0.020794415416798356
grateful, Value: 0.020794415416798356
prevalent, Value: 0.020794415416798356
ratchet, Value: 0.020794415416798356
latitude, Value: 0.020794415416798356
onerous, Value: 0.020794415416798356
differentiates, Value: 0.020794415416798356
fluid, Value: 0.020794415416798356
predictions, Value: 0.020794415416798356
unprecedented, Value: 0.020794415416798356
eminent, Value: 0.020794415416798356
stagnant, Value: 0.020794415416798356
demands, Value: 0.020794415416798356
monetary_fiscal, Value: 0.020794415416798356
distinguish, Value: 0.020794415416798356
Row 2010:
withstand, Value: 0.020794415416798356
uncertainties, Value: 0.020794415416798356
determinations, Value: 0.020794415416798356
lingering, Value: 0.020794415416798356
downturns, Value: 0.0207944154167

In [None]:
print(model_year.years[2010].corpus_bigrams[1:5])
print(len(model_year.years[2010].corpus_bigrams))

In [26]:
scores = {}

for year, model in model_year.years.items():
    score_list = [0] * len(model.uncertainty_wordlist)  #create dummy holder for frequency scores

    for index, word in enumerate(model.uncertainty_wordlist):
        count = 0
        for model in model_year.years.values():
            if word in model.uncertainty_wordlist:
                count += 1
        score_list[index] = count
    scores[year] = score_list


In [None]:
for year, score_list in scores.items():
    print(year)
    for index, score in enumerate(score_list):
        if score < 2:
            print(f'{model_year.years[year].uncertainty_wordlist[index]} : {score}')
    print()
    print()

In [16]:
corpus_for_vectorizor = []

for key, value in model_year.years.items():

    flat_list = [word for sublist in value.corpus_bigrams for word in sublist]
    joined_string = ' '.join(flat_list)
    corpus_for_vectorizor.append(joined_string)

In [None]:
for i in corpus_for_vectorizor:
    print(len(i))

## Vectorizer if needed

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Example corpus: list of documents

# # Initialize TfidfVectorizer
# vectorizer = TfidfVectorizer(
#                                 lowercase=False,
#                                 max_features=1000000,
#                                 max_df=0.8,
#                                 min_df=5,
#                             )

# # Fit the vectorizer on the corpus and transform the corpus into a TF-IDF matrix
# tfidf_matrix = vectorizer.fit_transform(corpus_for_vectorizor)

# # Get the feature names (i.e., the words in the vocabulary)
# feature_names = vectorizer.get_feature_names_out()

# # Convert the TF-IDF matrix into a dense format (if necessary, for easier indexing)
# dense_matrix = tfidf_matrix.todense()

# # Convert dense matrix to a pandas DataFrame for easy lookup (optional but helpful)
# df_tfidf = pd.DataFrame(dense_matrix, columns=feature_names)

# # View the TF-IDF matrix
# print("TF-IDF matrix:\n", df_tfidf)

# # List of words you want to check the TF-IDF score for
# words_to_check = model_year.years[2021].uncertainty_wordlist

# # Loop through each word and print the corresponding TF-IDF score for each document
# for word in words_to_check:
#     if word in feature_names:
#         print(f"\nTF-IDF scores for word '{word}':")
#         print(df_tfidf[word])  # This will give you the TF-IDF score for that word in all documents
#     else:
#         print(f"Word '{word}' not found in the vocabulary")

In [None]:
similar_words = trained_model.wv.most_similar('uncertainty', topn=100)
word_list = [word for word, number in similar_words]

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    print(f"{word}: Similarity = {similarity:.4f}")

## Calc Uncertainty Scores

In [None]:
uncertainty_score_dict = {}

for transcript_id, value in dict_bigrams.items():

    transcript_text = list(itertools.chain.from_iterable(value))


    neg_count = 0
    for word in transcript_text:
        if word in word_list:
            neg_count += 1
            if transcript_id == 3006899:
                print(word)

    neg_score = neg_count/len(transcript_text)
    if transcript_id == 3006899:
        print(f'------------{neg_count}/{len(transcript_text)}={neg_score}')

    uncertainty_score_dict[transcript_id] = neg_score


