In [34]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
import os
from tabulate import tabulate
from gensim.models.phrases import Phrases, Phraser
nltk.download('stopwords')
import pickle
import itertools  #used for flattening lists of lists




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Words that NLTK treats as stop words but that should stay in this corpus.
words_to_remove = ['below', 'haven']
nltk_stop_words = set(stopwords.words('english'))

# Final stop-word list: the NLTK English set minus the words kept above.
stop_words = [word for word in nltk_stop_words if word not in words_to_remove]
print(stop_words)


['if', 'more', 'these', "mightn't", "weren't", 'my', 'you', 'out', 'i', 'just', 'she', 'further', 'there', 'should', 'hadn', 'during', 'under', "shouldn't", 'so', 'other', 'theirs', 'before', 'whom', 'weren', 'up', 'and', "aren't", 'they', 'here', 'same', 'through', "hasn't", 'o', 'needn', 'not', 'are', 't', 'do', 'on', 'off', 'the', 'itself', 'into', 'her', 'when', 'been', 'an', 'being', 'ourselves', 'yourself', 's', 'having', "isn't", 'who', 'don', 'wasn', 'its', 'myself', 'herself', 'were', 'own', 'few', 'hasn', 'won', 'me', "you're", 'that', 'after', 'themselves', "you'd", 'any', 'some', 'is', 'your', 'until', 'didn', 'most', 'y', "couldn't", 'or', 'm', 'his', 're', 'those', 'nor', 'than', "won't", 'himself', 'had', "she's", 'between', 'can', 'it', 'couldn', "needn't", 'but', 'this', "you'll", 'yourselves', 'll', "doesn't", "didn't", "wouldn't", 'hers', 'against', 'with', 'does', 'a', 'what', "shan't", 'd', 'from', 'has', 'too', 'isn', 'did', 'aren', 'ma', 'shouldn', 'while', 'over

In [36]:
def split_text_to_sentences_words(text):
    """Split raw text into sentences, and each sentence into word tokens.

    Args:
        text: The text to tokenize. ``None`` and float NaN (the value pandas
            produces for an empty CSV cell) are treated as "no text".

    Returns:
        A list of sentences, each one a list of tokens as produced by
        ``nltk.word_tokenize``. Missing text yields an empty list.
    """
    # NaN is the only float that compares unequal to itself; checking it this
    # way avoids needing numpy/pandas inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Sentence boundaries first, then tokens within each sentence.
    sentences = nltk.sent_tokenize(text)
    return [nltk.word_tokenize(sentence) for sentence in sentences]

def get_corpus(directory):
    """Load every file in ``directory`` as a CSV and group tokenized text by transcript.

    Each file is read with ``pandas.read_csv`` (UTF-8) and all rows are
    concatenated. The ``COMPONENTTEXT`` column is tokenized with
    ``split_text_to_sentences_words`` and the resulting sentences are collected
    per ``TRANSCRIPTID`` in row order. The first two rows of the combined frame
    are printed as a table for a quick visual check.

    Args:
        directory: Path of the folder holding the CSV files. Sub-directories
            inside it are skipped.

    Returns:
        A dict mapping each TRANSCRIPTID to a list of sentences, where each
        sentence is a list of tokens.

    Raises:
        ValueError: If ``directory`` contains no files to load.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order (and therefore the order of the returned dict) reproducible.
    csv_files = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
    ]
    # Skip sub-directories such as notebook checkpoint folders.
    csv_files = [path for path in csv_files if os.path.isfile(path)]
    if not csv_files:
        raise ValueError(f'No files to load in {directory!r}')

    dfs = [pd.read_csv(path, encoding='utf-8') for path in csv_files]
    large_df = pd.concat(dfs, ignore_index=True)

    large_df['COMPONENTTEXT_SPLIT'] = large_df['COMPONENTTEXT'].apply(split_text_to_sentences_words)
    print(tabulate(large_df.head(2), headers='keys', tablefmt='pretty'))

    print('split text has been applied')

    corpus = {}
    for row in large_df.itertuples():
        # Extend a list owned by `corpus` so the per-row lists stored in the
        # DataFrame are never mutated as a side effect.
        corpus.setdefault(row.TRANSCRIPTID, []).extend(row.COMPONENTTEXT_SPLIT)

    return corpus

def process_the_data(dict, stop_words):
    """Lowercase, strip punctuation from, and stop-word-filter every token.

    Args:
        dict: Mapping of transcript id -> list of sentences, each sentence a
            list of token strings. (The parameter name shadows the builtin; it
            is kept so existing callers keep working.)
        stop_words: Iterable of lowercase words to drop. It is copied into a
            set once, so any iterable (list, set, generator) is accepted.

    Returns:
        A new mapping with the same keys and the same number of sentences per
        transcript; each sentence holds only the surviving, cleaned tokens
        (a sentence may end up empty).

    Side effects:
        Prints token counts before/after processing and the first two
        processed transcripts.
    """
    # Set membership is O(1); the lookup below runs once per token.
    stop_word_set = set(stop_words)
    # Removes stand-alone hyphens and every character that is not a word
    # character, whitespace, or a hyphen -- so intra-word hyphens survive.
    punctuation = re.compile(r'(?<!\w)-(?!\w)|[^\w\s-]')

    num_tokens_before = 0
    num_tokens_after = 0
    processed_dict = {}

    for transcriptid, text in dict.items():
        p_text = []
        for sentence in text:
            p_sentence = []
            for word in sentence:
                num_tokens_before += 1
                p_word = punctuation.sub('', word.lower())
                # Drop tokens that were punctuation only (now empty) and stop words.
                if p_word and p_word not in stop_word_set:
                    p_sentence.append(p_word)
            p_text.append(p_sentence)
            num_tokens_after += len(p_sentence)
        processed_dict[transcriptid] = p_text

    print(f'Number of Tokens before processing: {num_tokens_before:,}')
    print(f'Number of Tokens after processing: {num_tokens_after:,}')
    print('Process Sentence Examples:')

    # Show at most two transcripts as a sanity check.
    for key, value in itertools.islice(processed_dict.items(), 2):
        print(f'{key}: {value}')

    return processed_dict


def create_bigrams(processed_dict, min_count=10, threshold=100):
    """Detect frequent bigrams across all transcripts and apply them to every sentence.

    Args:
        processed_dict: Mapping of transcript id -> list of tokenized sentences.
        min_count: Passed to gensim ``Phrases`` as ``min_count``.
        threshold: Passed to gensim ``Phrases`` as ``threshold`` (with
            ``scoring='default'``).

    Returns:
        A 4-tuple ``(phrases, bigram_phraser, corpus_bigrams, dict_bigrams)``:
        the fitted ``Phrases`` model, the ``Phraser`` built from it, a flat list
        of all phrase-merged sentences, and a mapping of transcript id -> its
        phrase-merged sentences.
    """
    # Flatten every transcript into one list of sentences for phrase detection.
    # NOTE(review): both flat corpora start with a ['start'] sentence; the
    # reason is not evident from this file, so it is preserved as-is.
    processed_corpus = [['start']]
    for value in processed_dict.values():
        processed_corpus.extend(value)

    phrases = Phrases(processed_corpus, min_count=min_count, threshold=threshold, scoring='default')
    bigram_phraser = Phraser(phrases)

    corpus_bigrams = [['start']]
    dict_bigrams = {}

    for key, value in processed_dict.items():
        # Indexing the phraser with a sentence returns that sentence with
        # detected bigrams joined into single tokens.
        new_value = [bigram_phraser[sentence] for sentence in value]
        dict_bigrams[key] = new_value
        corpus_bigrams.extend(new_value)

    print('bigrams created')

    return phrases, bigram_phraser, corpus_bigrams, dict_bigrams



In [37]:

def apply_word2vec(corpus_bigrams, vector_size=150, window=10, min_count=5,
                   workers=10, epochs=10):
    """Train a skip-gram Word2Vec model (negative sampling) on tokenized sentences.

    Args:
        corpus_bigrams: Sentences to train on, each a list of tokens. It must
            support ``len()`` and be iterable more than once, because it is
            passed to both ``build_vocab`` and ``train``.
        vector_size: Number of features in each word vector.
        window: Context window size (in each direction).
        min_count: Words must appear at least this many times to enter the vocabulary.
        workers: Training thread count.
        epochs: Number of training passes over the corpus.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    # NOTE(review): no seed is set and several worker threads are used, so
    # results presumably differ between runs -- confirm if reproducibility matters.
    model = gensim.models.Word2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=1,        # 0: CBOW, 1: Skip-gram
        hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
        negative=5,  # Number of negative samples
    )

    model.build_vocab(
        corpus_bigrams,
        progress_per=20000,  # Tweaks how often progress is reported
    )

    print('Training the model...')

    model.train(
        corpus_bigrams,
        total_examples=len(corpus_bigrams),
        epochs=epochs,
        report_delay=10.0,  # Report progress every 10 seconds
    )

    print(' Modeling Training Done.')
    print('')

    return model

In [38]:
# Load every file in the local 'data' folder and group the tokenized text by transcript id.
corpus_dict = get_corpus('data')

+---+-----------+------------------------+---------------+---------------------------+----------+---------------------+------------------------------------------------------------+------------------------------+--------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [39]:
# Run the pipeline end to end: clean the tokens, merge detected bigrams into
# single tokens, then train Word2Vec on the phrase-merged sentences.
processed_corpus_dict = process_the_data(corpus_dict, stop_words)
print()
phrases, bigram_phraser, corpus_bigrams, dict_bigrams = create_bigrams(processed_corpus_dict)
print()
trained_model = apply_word2vec(corpus_bigrams)

Number of Tokens before processing: 66,087,551
Number of Tokens after processing: 24,808
Process Sentence Examples:
20338: [['good', 'morning', 'ladies', 'gentlemen'], ['welcome', 'amcore', 'financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call'], ['time', 'participants', 'listen', 'mode'], ['later', 'conduct', 'question-and-answer', 'session', 'analysts'], ['please', 'note', 'conference', 'recorded'], ['conference', 'call', 'also', 'webcast', 'accessed', 'wwwamcorecom', 'archived', 'additional', 'four', 'weeks'], ['statements', 'made', 'course', 'conference', 'call', 'stating', 'company', 'management', 'intentions', 'hopes', 'beliefs', 'expectations', 'predictions', 'future', 'considered', 'forward-looking', 'statements'], ['important', 'note', 'company', 'actual', 'results', 'could', 'differ', 'materially', 'projected', 'forward-looking', 'statements'], ['additional', 'information', 'concerning', 'factors', 'could', 'cause', 'actual', 'result', 'differ', 'material

In [40]:
# Peek at the phrase-merged sentences of the first ten transcripts.
for key, value in itertools.islice(dict_bigrams.items(), 10):
    print(f'{key}: {value}')

20338: [['good', 'morning', 'ladies_gentlemen'], ['welcome', 'amcore', 'financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call'], ['time', 'participants', 'listen', 'mode'], ['later_conduct', 'question-and-answer_session', 'analysts'], ['please', 'note', 'conference', 'recorded'], ['conference', 'call', 'also', 'webcast', 'accessed', 'wwwamcorecom', 'archived', 'additional', 'four', 'weeks'], ['statements', 'made', 'course', 'conference', 'call', 'stating', 'company', 'management', 'intentions', 'hopes_beliefs', 'expectations', 'predictions', 'future', 'considered', 'forward-looking_statements'], ['important', 'note', 'company', 'actual', 'results', 'could', 'differ_materially', 'projected', 'forward-looking_statements'], ['additional', 'information_concerning', 'factors', 'could', 'cause_actual', 'result', 'differ_materially', 'forward-looking_statements', 'contained', 'time-to-time', 'company', 'sec_filings', 'within', 'press_release'], ['conducting', 'call', 'toda

In [44]:
# The 100 vocabulary entries closest to 'uncertainty', as (word, similarity) pairs.
similar_words = trained_model.wv.most_similar('uncertainty', topn=100)

# Collect just the words while printing each one with its similarity score.
word_list = []
for word, similarity in similar_words:
    word_list.append(word)
    print(f"{word}: Similarity = {similarity:.4f}")

geopolitical: Similarity = 0.7189
uncertain: Similarity = 0.6741
unknown: Similarity = 0.6477
trepidation: Similarity = 0.6428
certainty: Similarity = 0.6380
unrest: Similarity = 0.6344
unpredictability: Similarity = 0.6303
concerns: Similarity = 0.6234
unknowns: Similarity = 0.6221
paralysis: Similarity = 0.6196
geopolitical_tensions: Similarity = 0.6188
war_ukraine: Similarity = 0.6164
cautious: Similarity = 0.6125
economic: Similarity = 0.6108
gridlock: Similarity = 0.6079
fragile: Similarity = 0.6046
uncertainty_surrounding: Similarity = 0.6029
fiscal_cliff: Similarity = 0.6004
stalling: Similarity = 0.5980
optimism: Similarity = 0.5966
uncertainly: Similarity = 0.5940
macro: Similarity = 0.5939
swirling: Similarity = 0.5924
volatility: Similarity = 0.5906
trade_wars: Similarity = 0.5875
debt_ceiling: Similarity = 0.5833
economy: Similarity = 0.5780
posturing: Similarity = 0.5766
anxiety: Similarity = 0.5759
sluggishness: Similarity = 0.5745
nervousness: Similarity = 0.5739
watchwo

In [42]:
# Score each transcript by the share of its tokens that appear in `word_list`
# (the words most similar to 'uncertainty'). `word_list` must already exist,
# i.e. the most_similar cell has to be run before this one.

# Transcript whose matching words and score are echoed for manual spot-checking.
DEBUG_TRANSCRIPT_ID = 3006899

# Set membership is O(1); the lookup below runs once per token in the corpus.
uncertainty_words = set(word_list)
uncertainty_score_dict = {}

for transcript_id, value in dict_bigrams.items():

    # Flatten the transcript's sentences into a single list of tokens.
    transcript_text = list(itertools.chain.from_iterable(value))

    matches = [word for word in transcript_text if word in uncertainty_words]
    neg_count = len(matches)

    # A transcript with no tokens scores 0.0 instead of raising ZeroDivisionError.
    neg_score = neg_count / len(transcript_text) if transcript_text else 0.0

    if transcript_id == DEBUG_TRANSCRIPT_ID:
        for word in matches:
            print(word)
        print(f'------------{neg_count}/{len(transcript_text)}={neg_score}')

    uncertainty_score_dict[transcript_id] = neg_score




caution
political
political
political
political
political
political
political
political
political
political
macroeconomic
macroeconomic
political
cautious
macro
political
political
political
political
political
cautious
certainty
political
macroeconomic
political
------------26/3848=0.006756756756756757


In [None]:
# `similar_words` is a list of (word, similarity) tuples, not a dict, so it has
# no .keys(); compare against the word in each pair instead.
print(any(word == 'geopolitical' for word, _ in similar_words))

In [45]:
# Show the first ten transcripts that have a non-zero uncertainty score.
positive_scores = (
    (key, value)
    for key, value in uncertainty_score_dict.items()
    if value > 0
)
for key, value in itertools.islice(positive_scores, 10):
    print(f'{key}: {value}')

20338: 0.009300444803881924
33217: 0.004149377593360996
51454: 0.008869179600886918
2476476: 0.014435695538057743
15591: 0.005563282336578581
19660: 0.0013398838767306833
29898: 0.002890869669959046
50262: 0.003806623524933384
15906: 0.0029455081001472753
22399: 0.005157593123209169


In [None]:
# NOTE(review): `all_sentences` is not defined anywhere in the visible notebook;
# this cell only runs if it is left over in the kernel or defined elsewhere.
sentences = all_sentences

flattened_list = [word for sentence in sentences for word in sentence]
print(f'Number of Tokens before processing: {len(flattened_list):,}')
print()

num_tokens = 0
processed_sents = []
for sent in sentences:
    p_sentence = []
    for word in sent:
        text = word.lower()
        if text in stop_words:
            continue
        # Remove stand-alone hyphens and all punctuation except intra-word hyphens.
        text = re.sub(r'(?<!\w)-(?!\w)|[^\w\s-]', '', text)
        # The emptiness check has to come after stripping: punctuation-only
        # tokens only become '' at this point and must not be kept.
        if text == '' or text in stop_words:
            continue
        p_sentence.append(text)
    processed_sents.append(p_sentence)
    num_tokens += len(p_sentence)

print(f'Number of Tokens after processing: {num_tokens:,}')
print('Process Sentence Example:')
for i in processed_sents[:3]:
    print(i)


In [None]:

# Print up to ten raw sentences next to their processed versions. Pairing with
# zip stops at the shorter list, so fewer than ten sentences no longer raises
# IndexError.
for raw_sent, processed_sent in itertools.islice(zip(sentences, processed_sents), 10):
    print(raw_sent)
    print(processed_sent)
    print()

In [14]:
import pickle

# Persist the processed sentences so they can be reloaded without re-running
# the preprocessing; the file is written to the current working directory.
with open('my_list.pkl', 'wb') as pkl_file:
    pickle.dump(processed_sents, pkl_file)

In [22]:
# Hyperparameters for the stand-alone Word2Vec model trained on `processed_sents`.
word2vec_params = {
    'vector_size': 100,  # Number of features in each word vector
    'window': 10,        # Context window size (in each direction); default is 5
    'min_count': 5,      # Words must appear this many times to be in vocab; default is 5
    'workers': 10,       # Training thread count
    'sg': 1,             # 0: CBOW, 1: Skip-gram
    'hs': 0,             # 0: Negative Sampling, 1: Hierarchical Softmax; default is 0
    'negative': 5,       # Number of negative samples; default is 5
}

model = gensim.models.Word2Vec(**word2vec_params)

In [23]:
# Build the model's vocabulary from the processed sentences; this must run
# before the training cell.
model.build_vocab(
    processed_sents,
    progress_per=20000  # Tweaks how often progress is reported

)

In [None]:
# Train the model built in the previous cells on the processed sentences.
print('Training the model...')

model.train(
    processed_sents,
    total_examples=len(processed_sents),  # number of sentences in the corpus
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)

print('  Done.')
print('')