In [7]:
import pandas as pd
import numpy as np
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
import os
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to /Users/rich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def cleantxt(txt):
    """
    Clean a raw text string and return the surviving words joined by spaces.

    Steps, in the order they are applied:
    1. replace newline characters with spaces
    2. drop bracketed spans: everything from a '<' or '[' up to the
       nearest following '>' or ']'
    3. convert text to lower-case
    4. replace every character other than a-z and space with a space
    5. remove words of 1 to 3 characters
    6. remove stop-words (NLTK English list plus 'www', 'http', 'utc')

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The remaining words separated by single spaces.
    """
    # English stop words from the nltk library, plus custom additions
    stpw = set(stopwords.words('english'))
    stpw.update(['www', 'http', 'utc'])

    # using regex to clean the text
    txt = re.sub(r"\n", " ", txt)
    # Raw string: the pattern text is unchanged, but "\<" / "\[" in a normal
    # string literal are invalid escape sequences and trigger a warning.
    txt = re.sub(r"[\<\[].*?[\>\]]", " ", txt)
    txt = txt.lower()
    txt = re.sub(r"[^a-z ]", " ", txt)
    # {1,3} means words of up to three characters are dropped
    txt = re.sub(r"\b\w{1,3}\b", " ", txt)
    txt = " ".join([x for x in txt.split() if x not in stpw])
    return txt


In [9]:
# Build a {CIK: cleaned strategy text} dict plus a list of the cleaned texts
corpus_dict = dict()
raw_strategy = []  # cleaned strategy text, one entry per row whose files were found
# Column count of the CSV header; not referenced again in this cell.
ncols = 0
with open("ceo_gender_training.csv") as f:
    ncols = len(f.readline().split(','))
ceo_data = pd.read_csv("ceo_gender_training.csv")
ceo_data.rename(columns={ceo_data.columns[0]: "pid"}, inplace=True)
ceo_data['has_file'] = 0
company_doc = []  # not populated in this cell
for index, row in ceo_data.iterrows():
    # folder_path is backslash-separated: first component QTR, second CIK
    path_parts = row['folder_path'].split("\\")
    QTR = path_parts[0]
    CIK = path_parts[1]
    section_dir = QTR + "/" + CIK + "/"
    try:
        # `with` closes each handle even when the second file is missing
        with open(section_dir + "business-section.txt", "r") as f1:
            business = f1.read()
        with open(section_dir + "risk-factors-section.txt", "r") as f2:
            risk = f2.read()
    except FileNotFoundError:
        # Either section file is absent: leave this row flagged as unusable
        ceo_data.at[index, 'has_file'] = 0
        continue

    strategy = business + risk
    ceo_data.at[index, 'has_file'] = 1
    clean_strategy = cleantxt(strategy)
    # A later row with the same CIK overwrites the dict entry, while the
    # list keeps one entry per matched row.
    corpus_dict[CIK] = clean_strategy
    raw_strategy.append(clean_strategy)

# consider only rows with the file data
# (reset_index() keeps the previous index as an extra 'index' column)
ceo_data2 = ceo_data.loc[ceo_data['has_file'] == 1].reset_index()
print(ceo_data2.shape)

(1030, 39)


In [12]:
# Prepare text for LDA analysis: one-column frame holding the cleaned documents
strategy = pd.DataFrame(data=raw_strategy, columns=['text_processed'])

# NLTK English stop words followed by a few extra tokens to filter out
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'part', 'ii',
]

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list
    is yielded per item, in input order.
    """
    # Per gensim's documentation, deacc=True strips accent marks from tokens.
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def remove_stopwords(texts):
    """Return one token list per document in ``texts`` with stop words removed.

    Each document is converted with ``str()`` and tokenized by
    ``simple_preprocess`` before filtering, so a document that is already a
    token list is stringified and tokenized again.
    The module-level ``stop_words`` collection is read when the function is
    called.
    """
    # Snapshot into a set once: each membership test is then a hash lookup
    # instead of a scan over the whole stop-word list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]


# Cleaned documents as a plain Python list
data = strategy['text_processed'].values.tolist()

# Tokenize every document, then remove stop words
data_words = remove_stopwords(list(sent_to_words(data)))

# First 30 tokens of the first document
print(data_words[0][:30])

['form', 'summary', 'signatures', 'table', 'contents', 'item', 'business', 'general', 'believe', 'world', 'largest', 'provider', 'health', 'care', 'products', 'services', 'primarily', 'office', 'based', 'dental', 'medical', 'practitioners', 'serve', 'million', 'customers', 'worldwide', 'including', 'dental', 'practitioners', 'laboratories']


In [13]:
import gensim.corpora as corpora

# Dictionary built from the tokenized documents
id2word = corpora.Dictionary(data_words)

# The tokenized documents serve as the corpus texts
texts = data_words

# Bag-of-words form: doc2bow applied to each document in turn
corpus = list(map(id2word.doc2bow, texts))

# First 30 entries of the first document's bag-of-words
print(corpus[0][:30])

[(0, 3), (1, 1), (2, 28), (3, 10), (4, 1), (5, 1), (6, 1), (7, 2), (8, 16), (9, 1), (10, 3), (11, 5), (12, 1), (13, 10), (14, 1), (15, 4), (16, 1), (17, 5), (18, 1), (19, 2), (20, 1), (21, 3), (22, 1), (23, 1), (24, 5), (25, 4), (26, 2), (27, 1), (28, 4), (29, 2)]


In [None]:
from pprint import pprint

# number of topics
num_topics = 7

# Fixed seed so the stochastic LDA fit is repeatable from run to run
LDA_RANDOM_STATE = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=LDA_RANDOM_STATE)

# Print the keywords of each topic
pprint(lda_model.print_topics())
# Apply the trained model to the bag-of-words corpus
doc_lda = lda_model[corpus]