In [112]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import spacy
import string
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models, utils
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel 
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from umap import UMAP

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /Users/yeonsoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preprocessing

In [113]:
NLP_finn = pd.read_csv(os.path.join("..", "Project2", "data", "agg_article_info.csv"))

# Creating a corpus with all abstracts
corpus = [NLP_finn['Abst'][i] for i in range(len(NLP_finn['Abst']))]
        
# Cleaning the unnecessary terms and creating a cleaned corpus
corpusn = [i.replace('Research Summary','').replace('Research Abstract','').replace('Research summary','') for i in corpus]

# Modified code of Junki Hong's original code
corpusnn = []
for i in corpusn:
    sentences = sent_tokenize(i)
    for j in sentences:
        if 'Copyright (' in j:
            sentences.remove(j)
    cleanedAbst = ' '.join(sentences)
    corpusnn.append(cleanedAbst)
    
### Removing punctuations
corpus_P = [i.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).replace(' '*4, ' ').replace(' '*3, ' ').replace(' '*2, ' ').strip() for i in corpusnn]

### Removing numbers
# removing numbers: N
corpus_PN = [i.translate(str.maketrans('','',string.digits)) for i in corpus_P]
# lowercasing: L
corpus_PNL = [i.lower() for i in corpus_PN]

### Lemmatizing (M) the bill with spaCy library instead of stemming
# Use the text before tokenizing; a string need to be provided. spacy does both tokening and lemmatizing.
# It takes 4 to 5 minutes and increase nlp.max length to the length set above.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1000000
corpus_PNLM = [[j.lemma_ for j in nlp(i)] for i in corpus_PNL]
abst_wclm = [len(i) for i in corpus_PNLM]
print(len(abst_wclm))

# Removing empty strings from the results of lemmatizing
corpus_PNLMf = [' '.join(i).split() for i in corpus_PNLM]
print(len(corpus_PNLMf))
bill_wclms = [len(i) for i in corpus_PNLMf]
print(len(bill_wclms))

print(corpus[0])
print(corpus_PNLMf[0], end="")

# Stopword removal with gensim Library: W
# This applied to the lemmatized words! Not to the stemmed words.
all_stopwords_add = STOPWORDS.union(set(['x', 'y', 'I', 's', 'e', 'study', 'paper', 'research', 'study', 'literatures', 'article', 'ii', 'iii', 'john',
                                        'wiley', 'sons', 'use', 'examine', 'investigate', 'approach', 'argue', 'effect', 'positive', 'negative', 'result',
                                        'high', 'increase', 'subsequent', 'subsequently', 'r', 'r d', 'specific', 'specifically', 'think', 'test',
                                        'substantially', 'robust', 'second', 'report', 'zero', 'aaa', 'z', 'ab','abd', 'yes', 'year', 'writing', 'd', 'r']))
print(len(all_stopwords_add))
corpus_PNLMW = [[j for j in i if not j in all_stopwords_add] for i in corpus_PNLMf]
corpus_PNLMWs = [' '.join(i) for i in corpus_PNLMW]

# Checking if stopword removal reduced words in each bill
print(len(corpus_PNLMW))
print(corpus_PNLMW[0], end = '')
bill_wc1 = [len(i) for i in corpus_PNLMW]

### latent semantic analysis (LSA) with scikit-learn
# Making document title for each bill
P_names = ['p' + str(i) for i in range(len(corpus_PNLMW))]

# Making document-term frequency matrix
def feed(wordlist): # Use this for feeding preprocessed tokens
    return wordlist

dtm_md = CountVectorizer(tokenizer = feed, min_df = 0.01, max_df = 0.7, ngram_range = (1,3), token_pattern = None, lowercase = False)
dtfm = dtm_md.fit_transform(corpus_PNLMW)
print(type(dtfm), dtfm.shape)

dtfmx = pd.DataFrame(dtfm.toarray(), index= P_names, columns= dtm_md.get_feature_names_out())
print(dtfmx.head())

# DTM to DTL by document
fin_words = dtm_md.inverse_transform(dtfm)
print(len(fin_words[0]), fin_words[0])
# Mapping between unique words and word id (making a dictionary)
pp_dict = Dictionary(fin_words)
# Given a dictionary (pp_dict), make a word frequency table for each document.
wordfreq_doc = [pp_dict.doc2bow(text) for text in fin_words] #corpus_PNLMW]
print(len(wordfreq_doc))
print(wordfreq_doc[0], end = " ")

11231
11231
11231
Many decision makers operate in dynamic environments in which markets, competitors, and technology change regularly. The ability to detect and respond to these regime shifts is critical for economic success. We conduct three experiments to test how effective individuals are at detecting such regime shifts. Specifically, we investigate when individuals are most likely to underreact to change and when they are most likely to overreact to it. We develop a system-neglect hypothesis: Individuals react primarily to the signals they observe and secondarily to the environmental system that produced the signal. The experiments, two involving probability estimation and one involving prediction, reveal a behavioral pattern consistent with our system-neglect hypothesis: Underreaction is most common in unstable environments with precise signals, and overreaction is most common in stable environments with noisy signals. We test this pattern formally in a statistical comparison of t

LSA

In [115]:
n_topics = 50
n_words = 10
topics = ['topic'+ str(i+1) for i in range(n_topics)]
pp_LSA_sklm = TruncatedSVD(n_components= n_topics, n_iter=7, random_state=42)
pp_LSA_dtm = pp_LSA_sklm.fit_transform(dtfm)
pp_topictermMtx = pd.DataFrame(pp_LSA_sklm.components_, index = topics, columns= dtm_md.get_feature_names_out())
idx_nmax_rf = [abs(row).nlargest(n_words).index for index, row in pp_topictermMtx.iterrows()]

LDA

In [116]:
ldamodel = models.ldamodel.LdaModel(corpus=wordfreq_doc, num_topics= n_topics, id2word= pp_dict, passes= 15,
                                    alpha='auto', update_every=1, chunksize=64, random_state=100)
coherencemode1 = CoherenceModel(model= ldamodel, texts=fin_words, dictionary=pp_dict, coherence= 'c_v')
ldaTopics = ldamodel.print_topics(num_topics= n_topics, num_words = n_words)

BERTopic

In [117]:
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, top_n_words=n_words, nr_topics=n_topics)
topics, probs = topic_model.fit_transform(corpus_PNLMWs)

Batches:   0%|          | 0/351 [00:00<?, ?it/s]

2023-04-30 02:23:13,545 - BERTopic - Transformed documents to Embeddings
2023-04-30 02:23:16,833 - BERTopic - Reduced dimensionality
2023-04-30 02:23:22,581 - BERTopic - Clustered reduced embeddings
2023-04-30 02:23:25,679 - BERTopic - Reduced number of topics from 147 to 50


In [118]:
# randomly pick five topics per model
idx_lsa = np.random.choice(n_topics, 5, replace=False)
idx_lda = np.random.choice(n_topics, 5, replace=False)
idx_BERTopic = np.random.choice(n_topics, 5, replace=False)

topics = []
for name in ['LSA', 'LDA', 'BERTopic']:
    for i in range(5):
        topics.append(name+ str(i+1))
topic_lsa = pd.DataFrame(idx_nmax_rf).loc[idx_lsa].T #LSA
topic_lda = pd.DataFrame([re.sub(r'\d+', '', ldaTopics[i][1].replace("*", "").replace('"', '').replace('.', '').replace(' ', '')).split('+')  for i in idx_lda]).replace('\d+', '').T #LDA
topic_BERTopic = pd.concat([pd.DataFrame(topic_model.get_topic(i))[0] for i in idx_BERTopic], axis=1)# BERTopic
df = pd.concat([topic_lsa, topic_lda, topic_BERTopic], axis=1)
df.columns = topics


In [119]:
df

Unnamed: 0,LSA1,LSA2,LSA3,LSA4,LSA5,LDA1,LDA2,LDA3,LDA4,LDA5,BERTopic1,BERTopic2,BERTopic3,BERTopic4,BERTopic5
0,status,technology,strategic,status,ceo,external,shape,interaction,drive,experience,diffusion,flexibility,leadership,family,patch
1,technology,manager,platform,theory,board,supplier,future,appropriate,significantly,question,adoption,flexible,leader,ownership,vendor
2,policy,consumer,strategy,organization,group,component,diversity,configuration,complementary,participate,innovation,plant,abusive,divestiture,security
3,supplier,relationship,business,strategy,risk,failure,basis,raise,especially,win,internet,product,supervision,firm,software
4,new,theory,experience,consumer,new,responsibility,direction,signal,capture,transactioncost,adopter,resource,team,owner,vulnerability
5,experience,contract,relationship,policy,change,buyer,imply,fact,innovative,power,practice,chain,supervisor,performance,liability
6,time,innovation,industry,search,employee,hazard,ignore,world,respect,yield,bandwagon,manufacturing,transformational,non,release
7,buyer,supplier,change,quality,model,criterion,actually,induce,timing,multilevel,network,configuration,subordinate,control,ransom
8,retailer,new,contract,social,director,meet,regulation,reputation,quantity,horizon,iso,budget,follower,agency,attack
9,process,process,quality,time,decision,sufficiently,consequently,game,horizon,difficult,adopt,production,behavior,shareholder,ransomware
