In [9]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
import os
from tabulate import tabulate
from gensim.models.phrases import Phrases, Phraser
nltk.download('stopwords')
import pickle



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
nltk_stop_words = set(stopwords.words('english'))
words_to_remove = ['below', 'haven']
stop_words = list(filter(lambda word: word not in words_to_remove, nltk_stop_words))
print(stop_words)

corpus = brown.sents()


["you've", 'who', "shouldn't", 'been', "haven't", 'your', 'a', 'couldn', 'have', 'other', 'theirs', "won't", 'mightn', "she's", 'were', 'him', 'each', 'i', 'all', 'after', "doesn't", 'myself', 'so', 'them', 'very', 'off', "mightn't", 'be', 't', 'does', 'than', "isn't", 'any', 'now', 'an', 'same', 'between', 'you', 'once', 'their', 'm', 'mustn', 'my', "weren't", 'whom', 'at', "wouldn't", 'such', "hadn't", 'these', 'ain', 'needn', "aren't", 'most', 'over', 're', 'herself', 'which', 'too', 'his', 'both', 'won', 'more', 'ma', 'isn', 'but', 'in', 'above', 'why', 'yours', 'themselves', "that'll", 'against', 'will', 'into', 'just', "you're", 'what', 'had', 'did', 'by', 's', 'they', 'she', 'll', 'until', 'those', 'during', 'if', 'can', 'where', 'no', 'shouldn', 'when', "wasn't", "shan't", 'do', 'down', 'wouldn', 'was', 'as', 'this', 'shan', 'while', 'hasn', 'itself', 'is', "didn't", 'didn', "couldn't", 'has', 'not', 've', 'y', 'aren', "don't", 'am', 'being', 'with', "hasn't", 'only', 'doesn', 

In [3]:
directory = "data"
csv_files = [os.path.join(directory, file) for file in os.listdir(directory)]
# print(csv_files)

dfs = []
for file in csv_files:
    df = pd.read_csv(file, encoding='utf-8')
    dfs.append(df)
large_df = pd.concat(dfs, ignore_index=True)

num_rows = large_df.shape[0]
print(f"Number of rows: {num_rows}")
print(large_df.head(2))

['data\\bank_keydev_transcript_2009.csv', 'data\\bank_keydev_transcript_2010.csv', 'data\\bank_keydev_transcript_2011.csv', 'data\\bank_keydev_transcript_2012.csv', 'data\\bank_keydev_transcript_2013.csv', 'data\\bank_keydev_transcript_2014.csv', 'data\\bank_keydev_transcript_2015.csv', 'data\\bank_keydev_transcript_2016.csv', 'data\\bank_keydev_transcript_2017.csv', 'data\\bank_keydev_transcript_2018.csv', 'data\\bank_keydev_transcript_2019.csv', 'data\\bank_keydev_transcript_2020.csv', 'data\\bank_keydev_transcript_2021.csv', 'data\\bank_keydev_transcript_2022.csv', 'data\\bank_keydev_transcript_2023.csv', 'data\\bank_keydev_transcript_2024.csv']


In [11]:
def split_text_to_sentences_words(text):
    # Split into sentences
    sentences = nltk.sent_tokenize(text)

    # Split each sentence into a list of words
    sentences_words = [nltk.word_tokenize(sentence) for sentence in sentences]

    return sentences_words

def get_corpus(directory):
    csv_files = [os.path.join(directory, file) for file in os.listdir(directory)]
# print(csv_files)

    dfs = []
    for file in csv_files:
        df = pd.read_csv(file, encoding='utf-8')
        dfs.append(df)
    large_df = pd.concat(dfs, ignore_index=True)

    num_rows = large_df.shape[0]
    print(f"Number of rows: {num_rows}")
    print(tabulate(large_df.head(2), headers='keys', tablefmt='pretty'))
    large_df['COMPONENTTEXT_SPLIT'] = large_df['COMPONENTTEXT'].apply(split_text_to_sentences_words)

    all_sentences = []

    for sentences in large_df['COMPONENTTEXT_SPLIT']:
    # Add each sentence (which is a list of words) to the master list
        all_sentences.extend(sentences)

    print(f'length of all_sentences: {len(all_sentences)}')
    return(all_sentences)



In [12]:
def process_the_data(corpus, stop_words):

   #count tokens before processing
   flattened_list = [word for sentence in corpus for word in sentence]
   print(f'Number of Tokens before processing: {len(flattened_list):,}')
   print()

   num_tokens = 0
   processed_sents = []
   for sent in corpus:
      p_sentence = []
      for word in sent:
         text = word.lower()                                  #lowercase the text
         text = re.sub(r'(?<!\w)-(?!\w)|[^\w\s-]', '', text)  #remove punctuation but keep hyphens
         if text in stop_words or len(text) == 0:             #ignore if word has no length (ie was punctuation only) or in stop words
            continue
         p_sentence.append(text)
      processed_sents.append(p_sentence)
      num_tokens += len(p_sentence)

   #count tokens after processing
   print(f'Number of Tokens after processing: {num_tokens:,}')
   print(f'Process Sentence Examples:')
   for i in processed_sents[:3]:
      print(i)

   return processed_sents

def create_bigrams(processed_corpus):
   phrases = Phrases(processed_corpus, min_count=10, threshold=100, scoring='default')
   bigram_phraser = Phraser(phrases)

   corpus_bigrams = [bigram_phraser[sentence] for sentence in processed_corpus]

   print('bigrams created')

   return phrases, bigram_phraser, corpus_bigrams

def apply_word2vec(corpus_bigrams):
   model = gensim.models.Word2Vec (
    vector_size=100,    # Number of features in word vector

    window=10,   # Context window size (in each direction). Default is 5


    min_count=5, # Words must appear this many times to be in vocab.
                 #   Default is 5

    workers=10,  # Training thread count

    sg=1,        # 0: CBOW, 1: Skip-gram.

    hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
                 #   Default is 0, NS

    negative=5   # Nmber of negative samples
                 #   Default is 5
   )

   model.build_vocab(
    corpus_bigrams,
    progress_per=20000  # Tweaks how often progress is reported
   )

   print('Training the model...')

   model.train(
    corpus_bigrams,
    total_examples=len(corpus_bigrams),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
   )

   print(' Modeling Training Done.')
   print('')

   return(model)

In [13]:
corpus = get_corpus('data')

Number of rows: 553450
+---+-----------+------------------------+---------------+---------------------------+----------+---------------------+------------------------------------------------------------+------------------------------+--------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   | COMPANYID |      COMPANYNAME       |    COUNTRY    | SIMPLEINDUSTRYDESCRIPTION | KEYDEVID | KEYDEVEVENTTYPENAME |                          HEADL

In [14]:
processed_corpus = process_the_data(corpus, stop_words)
print()
phrases, bigram_phraser, corpus_bigrams = create_bigrams(processed_corpus)
print()
trained_model = apply_word2vec(corpus_bigrams)

Number of Tokens before processing: 66,087,551

Number of Tokens after processing: 29,573,897
Process Sentence Examples:
['good', 'morning', 'ladies', 'gentlemen']
['welcome', 'amcore', 'financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call']
['time', 'participants', 'listen', 'mode']

bigrams created

Training the model...
 Modeling Training Done.



In [16]:
similar_words = trained_model.wv.most_similar('uncertain', topn=300)

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    if 'uncertain' not in word:
        print(f"{word}: Similarity = {similarity:.4f}")

unclear: Similarity = 0.7379
severity_duration: Similarity = 0.7350
geopolitical_events: Similarity = 0.7257
geopolitical_macroeconomic: Similarity = 0.6985
lack_clarity: Similarity = 0.6974
unsettled: Similarity = 0.6945
stressful_economic: Similarity = 0.6929
cautious: Similarity = 0.6886
remain_cautious: Similarity = 0.6876
certainty: Similarity = 0.6868
duration_severity: Similarity = 0.6853
watchword: Similarity = 0.6850
geopolitical_risks: Similarity = 0.6845
challenging: Similarity = 0.6818
unpredictability: Similarity = 0.6811
geopolitical_tensions: Similarity = 0.6805
unknown: Similarity = 0.6791
depth_duration: Similarity = 0.6777
highly_influenced: Similarity = 0.6751
clarity_certainty: Similarity = 0.6726
fragile: Similarity = 0.6709
length_depth: Similarity = 0.6696
geopolitical_concerns: Similarity = 0.6680
geopolitical_situation: Similarity = 0.6665
lingering_impacts: Similarity = 0.6638
fragile_nature: Similarity = 0.6638
lasting_effects: Similarity = 0.6634
broader_mac

length of all_sentences: 6456550


In [12]:
sentences = all_sentences

flattened_list = [word for sentence in sentences for word in sentence]
print(f'Number of Tokens before processing: {len(flattened_list):,}')
print()

num_tokens = 0
processed_sents = []
for sent in sentences:
   p_sentence = []
   for word in sent:
     text = word.lower()
     if text in stop_words or text == '':
        continue
     text = re.sub(r'(?<!\w)-(?!\w)|[^\w\s-]', '', text)
     p_sentence.append(text)
   processed_sents.append(p_sentence)
   num_tokens += len(p_sentence)

print(f'Number of Tokens after processing: {num_tokens:,}')
print(f'Process Sentence Example:')
for i in processed_sents[:3]:
    print(i)


Number of Tokens before processing: 132,175,102

Number of Tokens after processing: 76,675,022
Process Sentence Example:
['good', 'morning', '', 'ladies', 'gentlemen', '']
['welcome', 'amcore', 'financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call', '']
['time', '', 'participants', 'listen', 'mode', '']


In [16]:

for i in range(10):
    print(sentences[i])
    print(processed_sents[i])
    print()

['Good', 'morning', ',', 'ladies', 'and', 'gentlemen', '.']
['good', 'morning', '', 'ladies', 'gentlemen', '']

['And', 'welcome', 'to', 'the', 'AMCORE', 'Financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call', '.']
['welcome', 'amcore', 'financial', 'first', 'quarter', 'earnings', 'result', 'conference', 'call', '']

['At', 'this', 'time', ',', 'participants', 'are', 'in', 'a', 'listen', 'only', 'mode', '.']
['time', '', 'participants', 'listen', 'mode', '']

['Later', 'we', 'will', 'conduct', 'a', 'question-and-answer', 'session', 'for', 'analysts', 'only', '.']
['later', 'conduct', 'question-and-answer', 'session', 'analysts', '']

['Please', 'note', 'that', 'this', 'conference', 'is', 'being', 'recorded', '.']
['please', 'note', 'conference', 'recorded', '']

['This', 'conference', 'call', 'is', 'also', 'being', 'webcast', 'and', 'can', 'be', 'accessed', 'at', 'www.amcore.com', ',', 'and', 'will', 'be', 'archived', 'for', 'additional', 'four', 'weeks', '.']
['co

In [14]:
import pickle

# Your list to be saved

# Save the list to a file using pickle
with open('my_list.pkl', 'wb') as file:
    pickle.dump(processed_sents, file)

In [22]:
model = gensim.models.Word2Vec (
    vector_size=100,    # Number of features in word vector

    window=10,   # Context window size (in each direction). Default is 5


    min_count=5, # Words must appear this many times to be in vocab.
                 #   Default is 5

    workers=10,  # Training thread count

    sg=1,        # 0: CBOW, 1: Skip-gram.

    hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
                 #   Default is 0, NS

    negative=5   # Nmber of negative samples
                 #   Default is 5
)

In [23]:
model.build_vocab(
    processed_sents,
    progress_per=20000  # Tweaks how often progress is reported

)

In [None]:
print('Training the model...')

model.train(
    processed_sents,
    total_examples=len(processed_sents),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)

print('  Done.')
print('')

In [None]:
similar_words = model.wv.most_similar('uncertain', topn=150)

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    print(f"{word}: Similarity = {similarity:.4f}")

In [27]:
from gensim.models.phrases import Phrases, Phraser

In [28]:
phrases = Phrases(processed_sents, min_count=10, threshold=10, scoring='default')