In [1]:
import zipfile
import pandas as pd
import os
import re
from wordcloud import WordCloud

# Gensim and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# NLP stuff
import contractions
import string
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')
stop_words = stopwords.words('english')
import pandas as pd

from gensim.models import CoherenceModel

import pickle 
from pprint import pprint

import numpy as np
import tqdm

import matplotlib.pyplot as plt
import csv

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Constants
# Project root. Set the SEC_TOPIC_ROOT environment variable to run on another machine
# without editing this cell; the fallback is the original hard-coded Dropbox folder.
root_path = os.environ.get(
    "SEC_TOPIC_ROOT",
    "C:/Users/kalodimj/Dropbox/Academia/Research/SEC Investigations Returns/Topic Analysis",
)
# Combined speeches/statements CSV that is loaded into the `speeches` dataframe.
cleaned_text = root_path + "/Extracted Datasets/" + "sec_speeches_statements_combined.csv"



In [None]:
# Load the combined speeches CSV, keep the three columns used downstream, and order by year
speeches = (
    pd.read_csv(cleaned_text)
    .loc[:, ['file_name', 'text', 'year']]
    .sort_values(by=['year'])
)
print(f'Speeches has {len(speeches)} rows.')
print(speeches.head())


Speeches has 3700 rows.
         file_name                                               text  year
1608  spch152.txt    "mutual funds and the international marketpla...  1997
1601  spch144.txt    oral statement by chairman arthur levitt unit...  1997
1603  spch147.txt    whither puhca?-[1]- barry p. barbash, directo...  1997
1604  spch170.txt    "plain english and the u.s. securities market...  1997
1605  spch149.txt    "preserving municipal market trust --the road...  1997


In [None]:
# Build the metadata table (file name, date, speaker) that defines the sample of texts.
# The source CSVs are "Save As" exports of hand-cleaned xlsx workbooks.

def _read_first_words_meta(relative_path):
    """Read one hand-cleaned metadata CSV (path relative to root_path) and parse its dates."""
    meta = pd.read_csv(root_path + relative_path)
    meta = meta[['file_name', 'date', 'speaker']].copy()
    meta['date'] = pd.to_datetime(meta['date'])
    return meta


meta_pre2011 = _read_first_words_meta("/Extracted Datasets/Pre2011_First_Words_cleaned_20230717.csv")
print(meta_pre2011.head())

meta_post2011 = _read_first_words_meta("/Extracted Datasets/Post2011_First_Words_Speeches_Statements_cleaned_20230717.csv")
print(meta_post2011.head())

# Stack both periods, then keep one row per file
meta_speeches = pd.concat([meta_pre2011, meta_post2011], ignore_index=True)
print(meta_speeches.head())
print(f'Meta has {meta_speeches.shape[0]} rows.')
meta_speeches = meta_speeches.drop_duplicates(subset=['file_name'])
print(f'Meta has {meta_speeches.shape[0]} rows.')

# Keep only texts dated after 1997-10-01, i.e. two years before the beginning of our sample
sample_start = pd.to_datetime('1997-10-01')
is_in_sample = meta_speeches['date'] > sample_start
meta_speeches = meta_speeches[is_in_sample].sort_values(by=['date'])
print(f'Meta has {meta_speeches.shape[0]} rows.')

#meta_speeches.to_csv(root_path + '/xtemp_meta_speeches.csv', index=False)

      file_name       date  speaker
0  spch136.txt  1997-01-30      NaN
1  spch173.txt  1997-08-06      NaN
2  spch174.txt  1997-08-17      NaN
3  spch175.txt  1997-08-18      NaN
4  spch176.txt  1997-09-29      NaN
              file_name                date    speaker
0  022015-spchcdmg.txt  2015-02-20 20:06:29   Unknown 
1  022015-spchckms.txt  2015-02-20 20:17:37   Unknown 
2  022015-spchclaa.txt  2015-02-20 16:53:06   Unknown 
3  022015-spchcmsp.txt  2015-02-20 21:31:54   Unknown 
4   022015-spchraf.txt  2015-02-20 14:48:53   Unknown 
      file_name       date speaker
0  spch136.txt  1997-01-30     NaN
1  spch173.txt  1997-08-06     NaN
2  spch174.txt  1997-08-17     NaN
3  spch175.txt  1997-08-18     NaN
4  spch176.txt  1997-09-29     NaN
Meta has 3700 rows.
Meta has 3700 rows.
Meta has 3665 rows.


In [None]:
# Final sample: speeches joined to their metadata on file_name.
# merge() defaults to an inner join, so only files present in both tables survive.
n_before_merge = speeches.shape[0]
print(f'Speeches before merge has {n_before_merge} rows.')

speeches_sample = speeches.merge(meta_speeches, on='file_name')
n_after_merge = speeches_sample.shape[0]
print(f'speeches_sample after merge has {n_after_merge} rows.')

print(speeches_sample.head())

Speeches before merge has 3700 rows.
speeches_sample after merge has 3665 rows.
      file_name                                               text  year  \
0  spch180.txt    "recent initiatives related to supervisory pr...  1997   
1  spch178.txt    "a declaration of (accounting) independence" ...  1997   
2  spch181.txt    "the risks and rewards of technology" remarks...  1997   
3  spch183.txt    insurance products: the responsibilities of a...  1997   
4  spch184.txt    "a renewed precedent: the securities bar and ...  1997   

        date speaker  
0 1997-10-09     NaN  
1 1997-10-08     NaN  
2 1997-10-13     NaN  
3 1997-10-22     NaN  
4 1997-11-06     NaN  


In [None]:
def preprocess(text_col):
    """Normalise a pandas Series of raw documents for topic modelling.

    Every document goes through these steps, in order: lowercasing, contraction
    expansion, punctuation removal, replacement of every non-letter run with a
    space, English stopword removal (keeping 'not' and 'no'), WordNet
    lemmatization, and removal of tokens shorter than three characters.

    Parameters
    ----------
    text_col : pandas.Series of str
        Raw documents, e.g. ``df['text']``.

    Returns
    -------
    pandas.Series of str
        Cleaned documents as space-separated tokens, indexed like ``text_col``.
    """
    # Shared resources are built once per call instead of once per document or
    # per word (the lemmatizer used to be re-instantiated for every token).
    punctuation = set(string.punctuation)
    stop_set = {sw for sw in nltk.corpus.stopwords.words('english') if sw not in ('not', 'no')}
    lemmatize = WordNetLemmatizer().lemmatize

    def _clean(doc):
        # convert to lowercase (also collapses runs of whitespace)
        doc = ' '.join(w.lower() for w in doc.split())

        # expand contractions, word by word
        # NOTE(review): the processed sample contains tokens such as 'yous' (apparently
        # from "u.s."); this looks like contractions.fix rewriting the slang "u" -> "you".
        # Confirm, and consider contractions.fix(w, slang=False). Left unchanged here
        # because it would alter the fitted topics.
        doc = ' '.join(contractions.fix(w) for w in doc.split())

        # remove punctuation characters
        doc = ''.join(ch for ch in doc if ch not in punctuation)

        # remove numbers and any other non-letter characters
        tokens = re.sub("[^a-zA-Z]+", " ", doc).split()

        # remove stopwords
        tokens = [w for w in tokens if w not in stop_set]

        # lemmatization
        tokens = [lemmatize(w) for w in tokens]

        # remove short words
        return ' '.join(w for w in tokens if len(w) >= 3)

    return text_col.apply(_clean)

# Clean every speech and keep the result next to the raw text
speeches_sample['text_processed'] = speeches_sample['text'].pipe(preprocess)
print(speeches_sample.head())
print(speeches_sample['text_processed'].head())

      file_name                                               text  year  \
0  spch180.txt    "recent initiatives related to supervisory pr...  1997   
1  spch178.txt    "a declaration of (accounting) independence" ...  1997   
2  spch181.txt    "the risks and rewards of technology" remarks...  1997   
3  spch183.txt    insurance products: the responsibilities of a...  1997   
4  spch184.txt    "a renewed precedent: the securities bar and ...  1997   

        date speaker                                     text_processed  
0 1997-10-09     NaN  recent initiative related supervisory practice...  
1 1997-10-08     NaN  declaration accounting independence remark art...  
2 1997-10-13     NaN  risk reward technology remark chairman arthur ...  
3 1997-10-22     NaN  insurance product responsibility growing indus...  
4 1997-11-06     NaN  renewed precedent security bar sec remark chai...  
0    recent initiative related supervisory practice...
1    declaration accounting independence rem

In [None]:
# Optional word cloud of the processed corpus.
# This used to be disabled by wrapping the code in a string literal, which the cell
# then evaluated as its last expression; an explicit flag keeps it readable as code.
SHOW_WORDCLOUD = False

if SHOW_WORDCLOUD:
    # Join the different processed texts together.
    long_string = ','.join(list(speeches_sample['text_processed'].values))

    # Create a WordCloud object
    wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

    # Generate a word cloud
    wordcloud.generate(long_string)

    # Visualize the word cloud. Inside an `if` a bare `wordcloud.to_image()` would not
    # be displayed (only a cell's last top-level expression is), so draw it explicitly.
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Create helper functions

# Tokenise documents into lists of word tokens
def sent_to_words(texts):
    """Lazily yield the `word_tokenize` token list of each document in `texts`."""
    yield from map(word_tokenize, texts)

# One token list per cleaned speech
data = speeches_sample['text_processed'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[0][:30])  # first 30 tokens of the first document

# Phrase detection over the tokenised corpus; a higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    """Run every token list in `texts` through the phrase model `bigram_mod`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

# Store the phrase-merged token lists on the sample frame
speeches_sample['data_words_bigrams'] = make_bigrams(data_words)



def exclude_words(word_list, exclusion_list):
    """Return the words of `word_list` that are not in `exclusion_list`, order preserved.

    Parameters
    ----------
    word_list : iterable of hashable
        Tokens of one document.
    exclusion_list : iterable of hashable
        Words to drop; duplicates are harmless.

    Returns
    -------
    list
        The surviving tokens, in their original order (repeats are kept).
    """
    # Hash lookup: testing membership against a list rescans it for every token.
    excluded = frozenset(exclusion_list)
    return [word for word in word_list if word not in excluded]

# Words excluded based on manual review of the topics; this is an iterative process.
# (No line-continuation backslashes are needed inside brackets; the duplicate 'upon'
# entry has been removed.)
exclusion_list = [
    "january", "february", "march",
    "april", "may", "june",
    "july", "august", "september",
    "october", "november", "december",
    'html', 'available_httpswwwsecgovnewspressrelease',
    'nonyous', 'still', 'mdash',
    'using', 'ass', 'although',
    'pdf', 'really', 'conclusion',
    'conference', 'percent', 'sept',
    'specific', 'upon', 'generally',
    'likely', 'through', 'currently',
    'across', 'indeed',
    'increasingly', 'toward', 'truly',
    'always', 'far', 'effectively',
    'thus', 'ago', 'york',
    'never', 'simply', 'perhaps',
]

# Strip the manually curated exclusion words from every document's token list
speeches_sample['data_words_bigrams'] = [
    exclude_words(tokens, exclusion_list)
    for tokens in speeches_sample['data_words_bigrams']
]

print(speeches_sample.head())

['recent', 'initiative', 'related', 'supervisory', 'practice', 'remark', 'isaac', 'hunt', 'commissioner', 'yous', 'security', 'exchange', 'commission', 'washington', 'nscp', 'national', 'membership', 'meeting', 'national', 'society', 'compliance', 'professional', 'washington', 'october', 'view', 'expressed', 'herein', 'commissioner', 'hunt', 'not']
      file_name                                               text  year  \
0  spch180.txt    "recent initiatives related to supervisory pr...  1997   
1  spch178.txt    "a declaration of (accounting) independence" ...  1997   
2  spch181.txt    "the risks and rewards of technology" remarks...  1997   
3  spch183.txt    insurance products: the responsibilities of a...  1997   
4  spch184.txt    "a renewed precedent: the securities bar and ...  1997   

        date speaker                                     text_processed  \
0 1997-10-09     NaN  recent initiative related supervisory practice...   
1 1997-10-08     NaN  declaration accounti

In [None]:
# Build the gensim dictionary and the bag-of-words corpus used by the models.

# "texts" (one token list per speech) is the text dataset for the rest of the code
texts = speeches_sample['data_words_bigrams']

# Dictionary: a mapping between words and their integer ids
id2word = corpora.Dictionary(texts)
# Vocabulary pruning: drop tokens found in fewer than 15 documents or in more than
# 40% of documents, then keep at most the 80,000 most frequent of the rest.
id2word.filter_extremes(no_below=15, no_above=0.4, keep_n=80000)

# Corpus (term-document frequency): each document in bag-of-words format, i.e. a list
# of (token_id, token_count) tuples, stored as a dataframe column.
speeches_sample['corpus'] = list(map(id2word.doc2bow, texts))


In [None]:
# Hand check that the bag-of-words corpus list was merged into the pandas dataframe
# as intended. I believe it was.
print(speeches_sample)
print(speeches_sample['corpus'])
# Look up the token stored under id 4 in the dictionary
print(id2word[4])

                                              file_name  \
0                                          spch180.txt    
1                                          spch178.txt    
2                                          spch181.txt    
3                                          spch183.txt    
4                                          spch184.txt    
...                                                 ...   
3660                    gensler-remarks-iac-030223.txt    
3661     peirce-statement-electronic-filing-032223.txt    
3662  peirce-statement-enhanced-cybersecurity-031523...   
3663  crenshaw-statement-prohibition-against-conflic...   
3664  lizarraga-statement-prohibiting-conflicts-inte...   

                                                   text  year  \
0      "recent initiatives related to supervisory pr...  1997   
1      "a declaration of (accounting) independence" ...  1997   
2      "the risks and rewards of technology" remarks...  1997   
3      insurance products: the 

In [None]:
# supporting function
# https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train one LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Training corpus (lists of (token_id, token_count) tuples).
    dictionary : gensim.corpora.Dictionary
        Id-to-word mapping used both to train and to score the model.
    k : int
        Number of topics.
    a : float or str
        LDA `alpha` (document-topic prior), e.g. 0.31, 'symmetric', 'asymmetric'.
    b : float or str
        LDA `eta` (topic-word prior), e.g. 0.31 or 'symmetric'.

    Returns
    -------
    float
        c_v coherence of the trained model.
    """
    # random_state is fixed so repeated runs give the same model.
    model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=k,
                                       random_state=100,
                                       chunksize=4000,
                                       passes=20,
                                       alpha=a,
                                       eta=b)

    # Score with the dictionary that was passed in; this previously used the
    # notebook-level `id2word` and silently ignored the `dictionary` argument.
    # `texts` is still the notebook-level list of token lists for ALL documents.
    # NOTE(review): that holds even when `corpus` is a subset - confirm intended.
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model.get_coherence()


# Grid search over the number of topics, alpha and beta (eta) to find the optimal
# parameters. This takes a long time: one model is trained per combination.
# https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

#### based on LDA tuning results 7/17/2023 the optimal number of topics seems to be 9 ####
RUN_GRID_SEARCH = False  # set to True to re-run the search

if RUN_GRID_SEARCH:
    # Topics range
    min_topics = 2
    max_topics = 21
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    # Validation sets: the first 75% of the documents, and the full corpus
    num_of_docs = len(speeches_sample['corpus'])
    corpus_sets = [gensim.utils.ClippedCorpus(speeches_sample['corpus'], int(num_of_docs*0.75)),
                   speeches_sample['corpus']]

    corpus_title = ['75% Corpus', '100% Corpus']

    model_results = {'Validation_Set': [],
                     'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                    }

    results_path = root_path + '/lda_tuning_results_2_20_20230718.csv'
    n_models = len(beta)*len(alpha)*len(topics_range)*len(corpus_title)
    search_completed = False

    try:
        # The context manager closes the progress bar even if a model fails.
        with tqdm.tqdm(total=n_models) as pbar:
            # iterate through validation corpuses
            for corpus_set, title in zip(corpus_sets, corpus_title):
                # iterate through number of topics
                for k in topics_range:
                    # iterate through alpha values
                    for a in alpha:
                        # iterate through beta values
                        for b in beta:
                            # get the coherence score for the given parameters
                            cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                          k=k, a=a, b=b)
                            # Save the model results
                            model_results['Validation_Set'].append(title)
                            model_results['Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Beta'].append(b)
                            model_results['Coherence'].append(cv)

                            pbar.update(1)
        search_completed = True
    finally:
        # Always persist what finished, so an interruption does not lose the whole run.
        # Partial results go to a separate file and never overwrite a complete one.
        n_done = min(len(column) for column in model_results.values())
        if n_done:
            out_path = results_path if search_completed else results_path[:-4] + '_partial.csv'
            finished = {name: column[:n_done] for name, column in model_results.items()}
            pd.DataFrame(finished).to_csv(out_path, index=False)

# Visualize coherence as a function of the number of topics
RUN_COHERENCE_PLOT = False  # set to True to train one model per topic count and plot

if RUN_COHERENCE_PLOT:
    # Deliberately not named compute_coherence_values: re-using that name here would
    # replace the grid-search helper (which has a different signature) once this runs.
    def coherence_by_num_topics(dictionary, corpus, texts, limit, start=5, step=1):
        """Train an LDA model for each topic count in range(start, limit, step).

        Returns
        -------
        (list, list of float)
            The trained models and their c_v coherence scores, in the same order.
        """
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            # random_state fixed for reproducibility, matching the other LDA calls
            model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary,
                                               num_topics=num_topics, random_state=100)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

    # Range of topic counts, defined once and shared by the search and the plot
    limit = 25
    start = 2
    step = 1

    model_list, coherence_values = coherence_by_num_topics(dictionary=id2word, corpus=speeches_sample['corpus'],
                                                           texts=texts, start=start, limit=limit, step=step)

    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # One-element tuple: ("coherence_values") without the comma is a plain string,
    # which legend() would treat as a sequence of single-character labels.
    plt.legend(("coherence_values",), loc='best')
    plt.show()

100%|██████████| 1140/1140 [68:21:42<00:00, 215.88s/it]  


In [None]:
# Number of topics for the final model
num_topics = 9

# Build the final LDA model on the full bag-of-words corpus.
# random_state makes sure the model is reproducible.

lda_model = gensim.models.LdaMulticore(corpus=speeches_sample['corpus'],
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       chunksize=4000,
                                       passes=20,
                                       random_state=100,
                                       alpha='asymmetric',
                                       eta='symmetric')

# Print the keywords of each topic
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (per-document topic weights)
doc_lda = lda_model[speeches_sample['corpus']]

# Coherence (c_v) of the final model. References used while tuning the number of topics:
# https://medium.com/@patrickhk/build-a-lda-model-for-classification-with-gensim-80ca6343c4b9
# https://github.com/fiyero/LDA_gensim/blob/master/LDA%20with%20Gensim_git.ipynb

coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


[(0,
  '0.039*"adviser" + 0.011*"client" + 0.010*"examination" + 0.009*"hedge" + '
  '0.008*"conflict" + 0.008*"customer" + 0.007*"procedure" + '
  '0.007*"brokerdealers" + 0.006*"mutual" + 0.006*"account"'),
 (1,
  '0.013*"structure" + 0.012*"price" + 0.011*"data" + 0.010*"equity" + '
  '0.009*"trade" + 0.007*"competition" + 0.006*"customer" + 0.006*"execution" '
  '+ 0.006*"option" + 0.006*"liquidity"'),
 (2,
  '0.014*"municipal" + 0.009*"bond" + 0.008*"doddfrank" + 0.007*"crisis" + '
  '0.007*"data" + 0.007*"bank" + 0.006*"securitybased_swap" + '
  '0.005*"transparency" + 0.004*"reform" + 0.004*"entity"'),
 (3,
  '0.025*"shareholder" + 0.013*"proxy" + 0.010*"mutual" + 0.007*"money" + '
  '0.006*"voting" + 0.005*"vote" + 0.005*"compensation" + 0.005*"product" + '
  '0.004*"independent" + 0.004*"fee"'),
 (4,
  '0.011*"fraud" + 0.007*"penalty" + 0.007*"violation" + 0.007*"investigation" '
  '+ 0.006*"million" + 0.005*"charge" + 0.005*"conduct" + 0.004*"misconduct" + '
  '0.004*"federal

In [None]:
# Write every topic (with its top 100 words) to a text file, one topic per line
topics_path = root_path + '/topics_9_20230724.txt'
with open(topics_path, 'w') as f:
    for topic in lda_model.print_topics(num_words=100):
        print(topic, file=f)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()

# Make sure the output folder exists: open() does not create missing directories.
ldavis_dir = root_path + '/LDA_results'
os.makedirs(ldavis_dir, exist_ok=True)

LDAvis_data_filepath = ldavis_dir + '/ldavis_prepared_' + str(num_topics)
print(LDAvis_data_filepath)

# Preparing the visualization is a bit time consuming. Set PREPARE_LDAVIS to False
# to reuse the pickle written by a previous run instead of recomputing it.
PREPARE_LDAVIS = True

if PREPARE_LDAVIS:
    LDAvis_prepared = gensimvis.prepare(lda_model, speeches_sample['corpus'], id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # load the pre-prepared pyLDAvis data from disk
    # NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

LDAvis_prepared

C:/Users/kalodimj/Dropbox/Academia/Research/SEC Investigations Returns/Topic Analysis/LDA_results/ldavis_prepared_9


In [None]:
def format_topics_sentences(ldamodel, corpus, doc_id, texts):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel : object
        Trained topic model; ``ldamodel[corpus]`` must yield, per document, a list
        of (topic_id, proportion) pairs, and ``ldamodel.show_topic(topic_id)`` a
        list of (word, weight) pairs.
    corpus : iterable
        Bag-of-words documents passed to ``ldamodel[...]``.
    doc_id : sequence or pandas.Series
        One identifier per document, in corpus order.
    texts : sequence or pandas.Series
        One text per document, in corpus order.

    Returns
    -------
    pandas.DataFrame
        One row per document: the identifier column, 'Dominant_Topic',
        'Perc_Contribution' (rounded to 4 decimals), 'Topic_Keywords', and the
        text column. A document with no topic assignment gets NaN/None in the
        three topic columns so that rows stay aligned with `doc_id` and `texts`.
    """
    keywords_by_topic = {}  # topic id -> keyword string, so show_topic runs once per topic
    records = []

    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # Placeholder row keeps positional alignment with doc_id / texts
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = the pair with the largest proportion (first one wins ties)
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; growing it with pd.concat per row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add the ids in front and the original text at the end. Indexes are reset so the
    # pieces are joined by position even if doc_id/texts carry a non-default index.
    ids = pd.Series(doc_id).reset_index(drop=True)
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([ids, sent_topics_df, contents], axis=1)


# Dominant topic of every speech, with its file name and processed text
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=speeches_sample['corpus'], doc_id=speeches_sample['file_name'], texts=speeches_sample['text_processed'])

# Format: reset_index() adds the row number as the first column, then all six columns are renamed
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'File Name', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
# NOTE: a notebook cell displays only its last expression, so the head(100) result
# is discarded and only tail(100) is rendered.
df_dominant_topic.head(100)
df_dominant_topic.tail(100)

Unnamed: 0,Document_No,File Name,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
3565,3565,lizarraga-statement-amendments-form-npx-110222...,3,0.9929,"shareholder, proxy, mutual, money, voting, vot...",today commission adopts important enhancement ...
3566,3566,gensler-sbcfac-statement-021022.txt,7,0.8831,"small, private, offering, people, world, job, ...",thank carla garrett committee member good appr...
3567,3567,gensler-sec-speaks-090822.txt,6,0.5093,"rating, hedge, credit_rating, product, price, ...",thank good back sec speaks like thank practisi...
3568,3568,lizarraga-statement-clawbacks-102622.txt,3,0.4520,"shareholder, proxy, mutual, money, voting, vot...",today commission adopts rule implement key con...
3569,3569,gensler-sifma-speech-102422.txt,1,0.3574,"structure, price, data, equity, trade, competi...",thank ken bentsen customary like note view not...
...,...,...,...,...,...,...
3660,3660,gensler-remarks-iac-030223.txt,0,0.4233,"adviser, client, examination, hedge, conflict,...",good morning pleased join investor advisory co...
3661,3661,peirce-statement-electronic-filing-032223.txt,2,0.4331,"municipal, bond, doddfrank, crisis, data, bank...",thank chair today proposal represents importan...
3662,3662,peirce-statement-enhanced-cybersecurity-031523...,4,0.4206,"fraud, penalty, violation, investigation, mill...",thank chair gensler one question threat cyberc...
3663,3663,crenshaw-statement-prohibition-against-conflic...,2,0.5660,"municipal, bond, doddfrank, crisis, data, bank...",commission proposing implement section doddfra...


In [None]:
# Export a long-format dataset of topic proportions: one row per (document, topic) pair

def export_doc_proportions(ldamodel, corpus, doc_id, doc_date, out_path=None):
    """Write the topic proportions of every document to CSV and return them.

    Parameters
    ----------
    ldamodel : object
        Trained topic model; ``ldamodel[corpus]`` must yield, per document, an
        iterable of (topic_id, proportion) pairs.
    corpus : iterable
        Bag-of-words documents passed to ``ldamodel[...]``.
    doc_id : sequence or pandas.Series
        File name of each document, in corpus order.
    doc_date : sequence or pandas.Series
        Date of each document, in corpus order.
    out_path : str, optional
        Destination CSV. Defaults to the original location,
        ``root_path + '/topic_proportions_20230724.csv'``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Document', 'FileName', 'Date', 'Topic', 'Proportion'.
    """
    if out_path is None:
        out_path = root_path + '/topic_proportions_20230724.csv'

    # Plain lists give positional access; indexing a Series with [i] is label-based
    # and breaks (or picks the wrong row) when the index is not 0..n-1.
    file_names = list(doc_id)
    dates = list(doc_date)

    # Transform the corpus into the topic space using the model that was passed in
    # (this used to read the notebook-level `lda_model` and ignore `ldamodel`).
    data = [
        {"Document": i, "FileName": file_names[i], "Date": dates[i], "Topic": topic, "Proportion": proportion}
        for i, doc in enumerate(ldamodel[corpus])
        for topic, proportion in doc
    ]

    # Explicit columns so that an empty result still has a header
    df = pd.DataFrame(data, columns=["Document", "FileName", "Date", "Topic", "Proportion"])

    # Write the DataFrame to a CSV file
    df.to_csv(out_path, index=False)

    # Returned so the caller's assignment holds the data rather than None
    return df

# Show the last rows of the sample, then export the per-document topic proportions to CSV
print(speeches_sample.tail())    
doc_topic_output = export_doc_proportions(ldamodel=lda_model, corpus=speeches_sample['corpus'], doc_id=speeches_sample['file_name'], doc_date=speeches_sample['date'])

                                              file_name  \
3660                    gensler-remarks-iac-030223.txt    
3661     peirce-statement-electronic-filing-032223.txt    
3662  peirce-statement-enhanced-cybersecurity-031523...   
3663  crenshaw-statement-prohibition-against-conflic...   
3664  lizarraga-statement-prohibiting-conflicts-inte...   

                                                   text  year  \
3660   Good morning. I am pleased to join the Invest...  2023   
3661   Thank you  Mr. Chair. Todays proposal represe...  2023   
3662   Thank you  Chair Gensler. No one questions th...  2023   
3663   The Commission is proposing to implement Sect...  2023   
3664   While asset-backed securities may not be a ho...  2023   

                    date                               speaker  \
3660 2023-03-02 15:05:00                   Chair Gary Gensler    
3661 2023-03-22 14:04:00        Commissioner Hester M. Peirce    
3662 2023-03-15 14:09:00        Commissioner Hester M. P

In [None]:
# Spot check: the first file name and the bag-of-words vector of the second document
print(speeches_sample['file_name'][0])
# The bag-of-words lists live in speeches_sample['corpus']; no standalone `corpus`
# variable is ever assigned, so `print(corpus[1])` raised NameError on a fresh kernel.
print(speeches_sample['corpus'][1])