In [1]:
import pickle
import os
import docx
from collections import defaultdict
import gensim
from textblob import Word
import pandas as pd
import random
import numpy as np
from gensim.models import TfidfModel
from textblob import Word
from nltk.tokenize.casual import casual_tokenize
from nltk.corpus import stopwords
import string
import re
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer
from itertools import compress
from lda import guidedlda as glda

# Create useful functions

In [2]:
def clean_data(text):
    text = text[:text.find('References')]
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'[\w\.-]+@[\w\.-]+', ' ',text)
    text = re.sub(r'-\n\s', '',text)
    text = re.sub(r'-\n','',text)
    text = re.sub(r'- \n\n','',text)
    text = re.sub(r'[0-9]+','',text)
    text = re.sub(r'The Review of Financial Studies','',text)
    text = re.sub(r'The Journal of Finance','',text)
    text = re.sub(r'Journal of Financial Economics','',text)
    text = re.sub(r'This content downloaded from \n�������������... on .*?,  .*?  :: UTC������������� \n\nAll use subject to https://about.jstor.org/terms','',text)
    text = re.sub(r'%s(.+?)%s'%('\n\n\n\n\n\n ','\n\n '),'',text)
    text = re.sub(r'%s(.+?)%s'%('\n\n ','\n\n '),'',text)
    text = re.sub("\W",' ',text)
    #text = re.sub(r'[^A-Za-z0-9 -]+','',text)
    return text

In [3]:
def clean_str(text):
    text = re.sub(r'\b\w{{{}}}\b'.format(1), '',text)
    text = re.sub(r'\b\w{{{}}}\b'.format(2), '',text)
    text = re.sub(r'\W',' ',text)
    
    return text

In [4]:
def clean_documents(documents):
    documents = [clean_data(x) for x in documents]
    text=pd.DataFrame({'documents':documents})
    documents_clean = hero.remove_html_tags(text.documents)
    documents_clean = hero.clean(text.documents)
    documents_clean = documents_clean.apply(clean_str)
    documents_clean = hero.remove_whitespace(documents_clean)
    
    return documents_clean

In [10]:
# Create the function that transform the raw documents into document-term matrix (term could be any n-gram by choosing ngram_range)
def Ngram_frequency(documents,threshold,batch_size=100000,ngram_range=(1,2)):
    model = CountVectorizer(tokenizer=casual_tokenize,ngram_range=ngram_range)
    docs = model.fit_transform(raw_documents=documents)
    L = int(np.floor(docs.shape[1]/batch_size))
    bigram_feature_name=model.get_feature_names()
    high_freq_index = np.array(True)
    for i in range(1,L):
        tfidf_matrix = docs[:,((i-1)*batch_size):(i*batch_size)].toarray()
        matrix = tfidf_matrix!=0
        high_freq = np.sum(matrix,axis=0) > threshold
        high_freq_index = np.append(high_freq_index,high_freq)
    tfidf_matrix = docs[:,(L*batch_size):].toarray()
    matrix = tfidf_matrix!= 0
    high_freq = np.sum(matrix,axis=0) > threshold
    high_freq_index = np.append(high_freq_index,high_freq)
    high_freq_index = high_freq_index[1:]
    bigram_matrix = docs[:,high_freq_index].toarray()
    bigram_vocabulary = list(compress(bigram_feature_name, high_freq_index))
    r = re.compile("[a-z]+")
    index = bigram_vocabulary.index(list(filter(r.match, bigram_vocabulary))[0])
    bigram_matrix = bigram_matrix[:,index:]
    bigram_vocabulary = bigram_vocabulary[index:]
    
    return bigram_matrix,bigram_vocabulary

# Load the data

In [6]:
pkl_file_path = 'C:/Users/brave/OneDrive/Desktop/Summer 2021/data.pkl'
with open(pkl_file_path, 'rb') as f:
    docs = pickle.load(f, encoding='bytes')

In [16]:
documents = []
time_slice = []
for i in range(1995,2021):
    num_docs=0
    year=str(i)
    for journal in docs:
        for month in docs[journal][year]:
            for paper in docs[journal][year][month]:
                documents.append(paper)
                num_docs+=1
    time_slice.append(num_docs)

In [17]:
documents_clean = clean_documents(documents)

In [None]:
dtm_matrix,dtm_vocabulary = Ngram_frequency(documents_clean, threshold=np.floor(0.01*len(documents)))

In [12]:
vocab = tuple(dtm_vocabulary)
dictionary = dict(zip(dtm_vocabulary, list(range(len(dtm_vocabulary)))))

In [13]:
seed_topic_list = [['book market', 'book tomarket'],
                   ['earnings ratio','earnings ratios'],
                   ['earnings surprise', 'earnings surprises'],
                   ['capm beta','capm betas','beta','beta asset','beta market','beta risk','beta stock','beta stocks','betas market','betas portfolios','beta coefficient','beta coefficients'],
                   ['accruals','accrual'],
                   ['dividend announcements'],
                   ['active traders','active trading'],
                   ['advertising expenses','advertising expenditures'],
                   ['assets growth'],
                   ['capital expenditure','capital expenditures'],
                   ['cash holding','cash holdings'],
                   ['cash flow','cash flows','cash inflow','cash inflows'],
                   ['concentrated industries','concentration measure','concentration measures','industry concentration','measure concentration'],
                   ['debt issuance','debt issuances','debt issue','debt issued','debt issuers','debt issues','issuance debt'],
                   ['earnings announced','earnings announcement','earnings announcements'],
                   ['earnings forecast','earnings forecasts'],
                   ['eps forecasts'],
                   ['fazzari','fazzari hubbard'],
                   ['gross profit','gross profitability'],
                   ['hml','hml factor','hml factors','hml high','hml mom','hml momentum','hml return'],
                   ['idiosyncratic volatility'],
                   ['idiosyncratic return','idiosyncratic returns'],
                   ['idiosyncratic risk','idiosyncratic risks'],
                   ['intangible','intangible assets'],
                   ['momentum','momentum effect','momentum effects','momentum factor','momentum factors','momentum investing','momentum portfolio','momentum portfolios','momentum profits','momentum return','momentum returns','momentum strategies','momentum strategy','momentum traders','momentum trading'],
                   ['ohlson'],
                   ['operating profit','operating profitability','operating profits'],
                   ['pension fund','pension funds'],
                   ['real estate'],
                   ['seasonalities','seasonality'],
                   ['smb','smb factor','smb high','smb small','smb value','smbt','smbt hmlt'],
                   ['share issuance','share issues'],
                   ['share purchases','share repurchase','share repurchases'],
                   ['short interest'],
                   ['sin'],
                   ['systematic risk','systematic risks','systemic risk'],
                   ['tail risk'],
                   ['tangible','tangibility','tangency portfolio'],
                   ['volatility liquidity','liquidity volatility'],
                   ['volume market']]

In [14]:
model = glda.GuidedLDA(n_topics=200, n_iter=100, random_state=7, refresh=20)

In [15]:
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[dictionary[word]] = t_id

KeyError: 'gross profitability'

In [35]:
model.fit(dtm_matrix, seed_topics=seed_topics, seed_confidence=0.15)

INFO:lda:n_documents: 5862
INFO:lda:vocab_size: 66963
INFO:lda:n_words: 59285614
INFO:lda:n_topics: 200
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -883987351


KeyboardInterrupt: 

In [None]:
topics_200_40=[]
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topics_200_40.append(topic_words.tolist())
    print('Topic {}: {}'.format(i, ', '.join(topic_words)))

In [None]:
pd.DataFrame(topics_200_40).to_csv('C:/Users/brave/OneDrive/Desktop/Comp 755/lda.txt')