# LDA Topic Analysis to classify 10k Section 1A Risk Factors

In [389]:
import pandas as pd
import pickle

df = pd.read_pickle('10kData.pkl')
df['texts'] = df['texts'].astype(str) # assert string type

In [390]:
df.head(1)

Unnamed: 0,id,accessionNo,companyName,companyNameLong,ticker,cik,filedAt,items,formType,periodOfReport,...,linkToFilingDetails,linkToTxt,description,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,linkToXbrl,entities,effectivenessDate,texts
0,adfd38b73247c5c3d782aa318ebfbc1b,0000002488-23-000047,ADVANCED MICRO DEVICES INC,ADVANCED MICRO DEVICES INC (Filer),AMD,2488,2023-02-27T16:20:39-05:00,,10-K,12/31/22,...,https://www.sec.gov/Archives/edgar/data/2488/0...,https://www.sec.gov/Archives/edgar/data/2488/0...,Form 10-K - Annual report [Section 13 and 15(d...,"[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '10', 'description': 'XBRL TAXON...",[],,[{'companyName': 'ADVANCED MICRO DEVICES INC (...,,ITEM 1A. RISK FACTORS \n\nThe risks and uncer...


## Step 1: Pre-Pre Processing
### Trying to filter on paragraphs which represent a risk statement
### Save risk statements into "risks" column

In [393]:
# Limiting to paragraphs 40 char or longer to get rid of the subsection titles and footers etc.
for index, row in df.iterrows():
    texts = row['texts']
    texts = re.sub(r'&#\d+;', '', texts).  # remove the special chars
    text = texts.split('\n')
    filtered_paragraphs = [sentence for sentence in text if len(sentence) > 40]
    filtered_text = '\n'.join(filtered_paragraphs)
    df.loc[index,'risks'] = filtered_text

In [394]:
df.head(1)

Unnamed: 0,id,accessionNo,companyName,companyNameLong,ticker,cik,filedAt,items,formType,periodOfReport,...,linkToTxt,description,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,linkToXbrl,entities,effectivenessDate,texts,risks
0,adfd38b73247c5c3d782aa318ebfbc1b,0000002488-23-000047,ADVANCED MICRO DEVICES INC,ADVANCED MICRO DEVICES INC (Filer),AMD,2488,2023-02-27T16:20:39-05:00,,10-K,12/31/22,...,https://www.sec.gov/Archives/edgar/data/2488/0...,Form 10-K - Annual report [Section 13 and 15(d...,"[{'sequence': '1', 'description': '10-K', 'doc...","[{'sequence': '10', 'description': 'XBRL TAXON...",[],,[{'companyName': 'ADVANCED MICRO DEVICES INC (...,,ITEM 1A. RISK FACTORS \n\nThe risks and uncer...,The risks and uncertainties described below ar...


## Step 2: Pre Processing
### Iterates through each row named "Risks" that was created last step to clean and return noun features

In [395]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions


stopWords = stopwords.words('english')
stopWords.extend(['risk','risks','could','result','business'])

lemmatizer = WordNetLemmatizer()


cleaned_samples = []

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    
    sample_text = row['risks'].split('\n')
    
    cleaned_sample =[]
    
    for p in sample_text:
        p = re.sub(r'&#\d+;', '', p)
        p = contractions.fix(p)
        words = nltk.word_tokenize(p)
        tagged_words = nltk.pos_tag(words)
        nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
        nouns = [w for w in nouns if w not in stopWords]
        nouns = [w for w in nouns if len(w) > 3]
        nouns = [lemmatizer.lemmatize(w) for w in nouns]
        nouns = [w for w in nouns if w.isalpha()]
        nouns = ' '.join(nouns)
        cleaned_sample.append(nouns)
    
    cleaned_samples.append('\n'.join(cleaned_sample))

# Add the cleaned text to the DataFrame as a new column
df['cleaned_risks'] = cleaned_samples

# Display the DataFrame with cleaned text
display(df['cleaned_risks'].head(10))

0    uncertainty one condition result operation add...
1    uncertainty one condition result operation add...
2    uncertainty one condition result operation add...
3    uncertainty one condition result operation add...
4    uncertainty one condition result operation add...
5    uncertainty one condition result operation add...
6    uncertainty one condition result operation add...
7    uncertainty one condition result operation add...
8    ITEM RISK FACTORS uncertainty one condition re...
9    ITEM RISK FACTORS uncertainty one condition re...
Name: cleaned_risks, dtype: object

In [396]:
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [397]:
print(df['risks'][1])

The risks and uncertainties described below are not the only ones we face. If any of the following risks actually occurs, our business, financial condition or results of operations could be materially adversely affected. In addition, you should consider the interrelationship and compounding effects of two or more risks occurring simultaneously. 
The following is a summary of the principal risks that could adversely affect our business, operations and financial results. 
 Intel Corporations dominance of the microprocessor market and its aggressive business practices may limit our ability to compete effectively on a level playing field. 
 Global economic and market uncertainty may adversely impact our business and operating results. 
 The loss of a significant customer may have a material adverse effect on us. 
 The ongoing novel coronavirus (COVID-19) pandemic could materially adversely affect our business, financial condition and results of operations. 
 The markets in which our produc

In [398]:
print(df['cleaned_risks'][1])

uncertainty one condition result operation addition interrelationship effect
following summary operation result
Intel Corporations dominance microprocessor market practice ability playing field
market uncertainty operating result
loss customer effect
novel coronavirus pandemic condition result operation
market product
demand product part market condition industry Fluctuations demand product market decline industry effect result operation
semiconductor industry downturn future
operating result sale pattern
technology property United States patent copyright trade secret trademark measure advantage incur expense
currency exchange rate fluctuation
party product basis quantity technology
equipment material substrate manufacturing process product
Failure manufacturing yield product result
success ability product basis feature performance level value customer industry transition
revenue product product customer product success product
product security vulnerability effect
data loss data breac

# Step 3: Run the LDA Model
### Explode the rows which contain the cleaned risks separated by new line
### Fit the TFIDF on every risk in the corpus
### Fit the LDA model on every risk in the corpus in order to find the best classes

In [399]:
cleaned_sample = df['cleaned_risks'].str.split('\n').explode().tolist()
print(cleaned_sample)

['uncertainty one condition result operation addition interrelationship effect', 'following summary operation result', 'Intel Corporations dominance microprocessor market practice ability playing field', 'market uncertainty operating result', 'semiconductor industry downturn future', 'demand product part market condition industry Fluctuations demand product market decline industry effect result operation', 'loss customer effect', 'novel coronavirus pandemic condition result operation', 'market product', 'operating result sale pattern', 'technology property United States patent copyright trade secret trademark measure advantage incur expense', 'currency exchange rate fluctuation', 'party product basis quantity technology', 'equipment material substrate manufacturing process product', 'Failure manufacturing yield product result', 'success ability product basis feature performance level value customer industry transition', 'revenue product product customer product success product', 'produ

In [400]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=.8,min_df=5,ngram_range = (1,1),stop_words=stopWords)

vector = vectorizer.fit_transform(cleaned_sample)
print(vector.shape)
vocab = vectorizer.vocabulary_
vocab


(14892, 2203)


{'uncertainty': 2085,
 'one': 1387,
 'condition': 392,
 'operation': 1394,
 'addition': 30,
 'interrelationship': 1090,
 'effect': 672,
 'following': 840,
 'summary': 1963,
 'intel': 1067,
 'corporations': 455,
 'dominance': 645,
 'microprocessor': 1286,
 'market': 1247,
 'practice': 1521,
 'ability': 0,
 'playing': 1497,
 'field': 817,
 'operating': 1393,
 'semiconductor': 1815,
 'industry': 1030,
 'downturn': 649,
 'future': 883,
 'demand': 541,
 'product': 1556,
 'part': 1442,
 'fluctuations': 838,
 'decline': 520,
 'loss': 1205,
 'customer': 493,
 'novel': 1358,
 'coronavirus': 453,
 'pandemic': 1436,
 'sale': 1775,
 'pattern': 1454,
 'technology': 1999,
 'property': 1577,
 'united': 2092,
 'states': 1919,
 'patent': 1452,
 'copyright': 451,
 'trade': 2041,
 'secret': 1799,
 'trademark': 2042,
 'measure': 1258,
 'advantage': 43,
 'incur': 1016,
 'expense': 777,
 'currency': 489,
 'exchange': 761,
 'rate': 1619,
 'fluctuation': 837,
 'party': 1449,
 'basis': 177,
 'quantity': 1609,


In [401]:
terms = vectorizer.get_feature_names_out()
terms

array(['ability', 'absence', 'absent', ..., 'york', 'yuan', 'zuken'],
      dtype=object)

In [402]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
lda_model = LDA(n_components=5, random_state=42)
lda_model.fit(vector)
lda_model.components_


array([[63.91029732,  0.21474943,  2.15416857, ...,  0.20001362,
         3.68484997,  1.83598348],
       [34.91051702,  0.62225763,  0.20001708, ...,  0.89440357,
         0.20000745,  0.20021259],
       [33.45776318,  0.20485699,  0.20003043, ...,  0.20001113,
         0.2000069 ,  0.20001133],
       [84.40295831,  1.2965813 ,  0.20187039, ...,  0.20235923,
         0.20001124,  0.20001615],
       [96.68781065,  0.30460276,  0.20000909, ...,  0.20002362,
         0.20000551,  0.20179487]])

In [416]:
categories = {}

for i, comp in enumerate(lda_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
    out = "Topic " + str(i) + ": "
    for t in sorted_terms:
        out = out + t[0] + " "
    categories[i] = out

# Print the dictionary of topic terms
print(categories)

{0: 'Topic 0: acquisition right investment company property currency asset ', 1: 'Topic 1: regulation law operation change policy condition trade ', 2: 'Topic 2: product claim system information customer security damage ', 3: 'Topic 3: indebtedness debt credit cash income rate ability ', 4: 'Topic 4: product customer market revenue demand industry price '}


In [404]:
#!pip install pyldavis   

import pyLDAvis.lda_model

pyLDAvis.enable_notebook()

In [405]:
vis = pyLDAvis.lda_model.prepare(lda_model, vector, vectorizer)
pyLDAvis.display(vis)

# Step 4: Tag the risks with topics
### Loop through each row (10K filing) and assign a topic to each individual risk statement

In [417]:
# Initialize an empty list to store the tags for each row
tags_list = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Split the cleaned risks text into individual documents
    sample = row['cleaned_risks'].split('\n')
    
    # Fit TF-IDF values on each filing
    vector = vectorizer.transform(sample)
    
    # Obtain document-topic distribution from the LDA model
    doc_topic_distribution = lda_model.transform(vector)
    
    # Categorize documents
    document_categories = []
    for doc_topic_dist in doc_topic_distribution:
        dominant_topic = max(enumerate(doc_topic_dist), key=lambda x: x[1])[0]
        category = categories.get(dominant_topic, "Other")
        document_categories.append(category)
    
    # Append the document categories to the tags list
    tags_list.append('\n'.join(document_categories))

# Assign the tags to the DataFrame
df['tags'] = tags_list

# Step 5: Fetch tags for a doc
### Use this code to fetch the tagged risks for a 10k

In [418]:
def fetch_tags(doc_index):
    risks = df.loc[doc_index,'risks'].split('\n')
    tags =  df.loc[doc_index,'tags'].split('\n')

    for i in range(len(risks)):
        print(risks[i],' - ',tags[i],'\n')

In [419]:
fetch_tags(0)

The risks and uncertainties described below are not the only ones we face. If any of the following risks actually occurs, our business, financial condition or results of operations could be materially adversely affected. In addition, you should consider the interrelationship and compounding effects of two or more risks occurring simultaneously.   -  Topic 1: regulation law operation change policy condition trade  

The following is a summary of the principal risks that could adversely affect our business, operations and financial results.   -  Topic 4: product customer market revenue demand industry price  

 Intel Corporations dominance of the microprocessor market and its aggressive business practices may limit our ability to compete effectively on a level playing field.   -  Topic 4: product customer market revenue demand industry price  

 Global economic and market uncertainty may adversely impact our business and operating results.   -  Topic 4: product customer market revenue de