## Generate fake data

In [3]:
import random
from datetime import datetime, timedelta
from faker import Faker
import pandas as pd

fake = Faker()

# Generate a list of 100 company names
company_names = [fake.company() for _ in range(100)]

# Generate a list of 100 filed dates, spanning the past 5 years
start_date = datetime.today() - timedelta(days=1825) # 5 years ago
end_date = datetime.today()
filed_dates = [fake.date_between_dates(date_start=start_date, date_end=end_date) for _ in range(100)]

# Generate random text for each section
sections = ['Section1', 'Section1A', 'Section7']
section_texts = [[fake.paragraphs(nb=2), fake.paragraphs(nb=3), fake.paragraphs(nb=4)] for _ in range(100)]

# Create the DataFrame
df = pd.DataFrame({
    'companyName': company_names,
    'filedAt': filed_dates,
    'Section1': [section[0] for section in section_texts],
    'Section1A': [section[1] for section in section_texts],
    'Section7': [section[2] for section in section_texts]
})

# Print the first 5 rows of the DataFrame
print(df.head())

           companyName     filedAt  \
0        Ross-Gonzalez  2022-12-31   
1       Huynh-Thompson  2019-06-20   
2  Mclaughlin-Phillips  2019-03-19   
3        Pope and Sons  2023-01-03   
4          Parsons Ltd  2019-10-07   

                                            Section1  \
0  [Issue conference drop forget reduce affect fe...   
1  [Fact during network always interview communit...   
2  [Pressure situation land where Congress speak....   
3  [Treat through million fine campaign same. Win...   
4  [Letter lead reason challenge series amount ki...   

                                           Section1A  \
0  [Where prepare think control fly. Suddenly mus...   
1  [Similar success where tough head improve tabl...   
2  [Activity live condition popular court window ...   
3  [Ask lose four national expert thank each. Aga...   
4  [Clearly others over., Nation doctor according...   

                                            Section7  
0  [Interview rise on something. Do sea re

In [4]:
import pandas as pd
import nltk
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



### QUESTION 1: *What are the common topics mentioned in the SEC filings?

In [5]:
# Load the generated dataset (Replace with your dataset loading code if necessary)
# data = pd.read_csv('sample_data.csv')
data = df

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /home/pengfei/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pengfei/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/pengfei/nltk_data...


True

In [6]:
# Prepare stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [7]:
# Prepare dataset for LDA analysis
all_texts = data["Section1"].tolist() + data["Section1A"].tolist() + data["Section7"].tolist()
all_texts = [" ".join(section) for section in all_texts]
processed_texts = [preprocess(text) for text in all_texts]


In [8]:
# Create Gensim dictionary, filter extremes, and convert to a bag-of-words corpus
dictionary = Dictionary(processed_texts)
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

In [9]:
# Parameters for LDA
num_topics = 5
random_state = 100

In [10]:
# Train LDA model
lda = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    random_state=random_state,
    update_every=1,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [11]:
# Print topics
topics = lda.print_topics(num_words=5)

for topic in topics:
    print(topic)

(0, '0.009*"new" + 0.007*"establish" + 0.006*"sell" + 0.006*"decision" + 0.006*"free"')
(1, '0.006*"start" + 0.005*"within" + 0.005*"lose" + 0.004*"kind" + 0.004*"grow"')
(2, '0.006*"structure" + 0.006*"yes" + 0.005*"high" + 0.005*"industry" + 0.005*"available"')
(3, '0.007*"whose" + 0.006*"return" + 0.006*"letter" + 0.006*"evidence" + 0.005*"science"')
(4, '0.006*"mr" + 0.005*"professor" + 0.005*"value" + 0.004*"language" + 0.004*"accept"')


### QUESTION 2: What are the emerging topics or financial themes over the last 5 years?

In [13]:
# Add a "year" column for grouping

data['filedAt'] = data['filedAt'].astype(str)
data['year'] = data['filedAt'].apply(lambda x: x[:4]).astype(int)

In [15]:
data.head()

Unnamed: 0,companyName,filedAt,Section1,Section1A,Section7,year
0,Ross-Gonzalez,2022-12-31,[Issue conference drop forget reduce affect fe...,[Where prepare think control fly. Suddenly mus...,[Interview rise on something. Do sea remember ...,2022
1,Huynh-Thompson,2019-06-20,[Fact during network always interview communit...,[Similar success where tough head improve tabl...,[Fast leg trade able behavior fight. Never air...,2019
2,Mclaughlin-Phillips,2019-03-19,[Pressure situation land where Congress speak....,[Activity live condition popular court window ...,[Method officer rest especially. Investment ab...,2019
3,Pope and Sons,2023-01-03,[Treat through million fine campaign same. Win...,[Ask lose four national expert thank each. Aga...,[Fall attorney type begin audience leave teach...,2023
4,Parsons Ltd,2019-10-07,[Letter lead reason challenge series amount ki...,"[Clearly others over., Nation doctor according...",[Give boy film. Particularly style key more hi...,2019


In [36]:

# LDA analysis for a given year
def analyze_year(year, num_topics=5, random_state=100):
    year_data = data[data['year'] == year]
#     print(len(year_data))
    all_texts = year_data["Section1"].tolist() + year_data["Section1A"].tolist() + year_data["Section7"].tolist()
    all_texts = [" ".join(section) for section in all_texts]
    processed_texts = [preprocess(text) for text in all_texts]
#     print(processed_texts)
    dictionary = Dictionary(processed_texts)
#     dictionary.filter_extremes()
    corpus = [dictionary.doc2bow(text) for text in processed_texts]
#     print(corpus)
    lda = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
        update_every=1,
        passes=10,
        alpha='auto',
        per_word_topics=True
    )

    topics = lda.print_topics(num_words=5)
    return topics



In [20]:
data[data['year'] == 2018]

Unnamed: 0,companyName,filedAt,Section1,Section1A,Section7,year
6,Chapman PLC,2018-10-14,"[Own face but move result general mean., Perso...",[Rather food deal difference explain sing brea...,[Often body method whether democratic little r...,2018
15,Allison LLC,2018-10-23,[Including agent Congress strong large. Large ...,[It memory compare health. Eat bill through im...,[Something four exist red. Sign senior page. P...,2018
26,Moore-Cisneros,2018-11-20,[Then want thing important six require signifi...,[Make per nearly increase. Beat tell by daught...,[Health on clear. Year Congress actually avail...,2018
41,Gonzales-Carpenter,2018-10-20,[Wish draw deep exactly gas environmental. Sou...,[Break even even onto side. World of often sho...,[Law according culture identify same environme...,2018
43,Dominguez Group,2018-11-30,[Several southern buy example mean four. Groun...,[Then garden sometimes series across center po...,[Success whether affect. Mouth able tree debat...,2018
91,"Webb, Kaufman and Wagner",2018-09-30,[True our nation try describe sister. Sense me...,[Help likely bag necessary. Thus table reduce ...,[Spend up choice. Force some message away disc...,2018


In [21]:
unique_years

[2018, 2019, 2020, 2021, 2022, 2023]

In [37]:
# Group documents by year and analyze each year
unique_years = sorted(data['year'].unique())

for year in unique_years:
    print(f"Topics for year {year}:")
    topics = analyze_year(year, num_topics=5, random_state=100)
    for topic in topics:
        print(topic)
    print("\n")

Topics for year 2018:
(0, '0.016*"health" + 0.008*"table" + 0.008*"task" + 0.008*"choice" + 0.008*"upon"')
(1, '0.011*"run" + 0.008*"main" + 0.008*"whether" + 0.008*"way" + 0.008*"professional"')
(2, '0.013*"history" + 0.013*"market" + 0.013*"congress" + 0.013*"large" + 0.007*"check"')
(3, '0.010*"million" + 0.010*"building" + 0.010*"point" + 0.010*"single" + 0.010*"big"')
(4, '0.014*"later" + 0.014*"reduce" + 0.009*"space" + 0.009*"cold" + 0.009*"try"')


Topics for year 2019:
(0, '0.008*"high" + 0.008*"try" + 0.008*"scientist" + 0.008*"drive" + 0.008*"site"')
(1, '0.008*"item" + 0.008*"capital" + 0.007*"central" + 0.006*"significant" + 0.006*"network"')
(2, '0.007*"tough" + 0.006*"book" + 0.006*"view" + 0.006*"indicate" + 0.005*"think"')
(3, '0.006*"exactly" + 0.006*"present" + 0.006*"anything" + 0.006*"week" + 0.006*"letter"')
(4, '0.007*"language" + 0.007*"ten" + 0.006*"time" + 0.006*"western" + 0.006*"writer"')


Topics for year 2020:
(0, '0.009*"kitchen" + 0.009*"within" + 0.009*

### QUESTION 3:  **What are the declining themes?*

In [51]:
def analyze_year(year, num_topics=5, random_state=100):
    year_data = data[data["year"] == year]
    all_texts = year_data["Section1"].tolist() + year_data["Section1A"].tolist() + year_data["Section7"].tolist()
    all_texts = [" ".join(section) for section in all_texts]
    processed_texts = [preprocess(text) for text in all_texts]
#     print(processed_texts)
    dictionary = Dictionary(processed_texts)
#     dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in processed_texts]

    lda = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
        update_every=1,
        passes=10,
        alpha="auto",
        per_word_topics=True,
    )

    topics = lda.print_topics(num_words=5)
    return topics, processed_texts

In [43]:
from collections import defaultdict

In [54]:
def get_word_frequencies(words, processed_texts):
    word_frequencies = defaultdict(int)
    for word in words:
        for text in processed_texts:
            word_frequencies[word] += text.count(word)
    return word_frequencies

unique_years = sorted(data["year"].unique())
all_topics = []
all_processed_texts = []

In [55]:
for year in unique_years:
    topics, processed_texts = analyze_year(year, num_topics=5, random_state=100)
    all_topics.extend(topics)
    all_processed_texts.extend(processed_texts)

In [56]:
all_processed_texts

[['face',
  'move',
  'result',
  'general',
  'mean',
  'person',
  'hundred',
  'study',
  'later',
  'cold',
  'local',
  'trouble',
  'stand',
  'space',
  'set',
  'clear',
  'reflect',
  'certain',
  'try',
  'player',
  'agreement'],
 ['including',
  'agent',
  'congress',
  'strong',
  'large',
  'large',
  'month',
  'wife',
  'fear',
  'know',
  'market',
  'upon',
  'around',
  'politics',
  'boy'],
 ['want',
  'thing',
  'important',
  'six',
  'require',
  'significant',
  'must',
  'building',
  'feeling',
  'loss',
  'model',
  'purpose',
  'professor',
  'indeed',
  'early',
  'color',
  'parent',
  'single',
  'officer',
  'read',
  'like',
  'point',
  'protect',
  'worker',
  'care',
  'true',
  'whether'],
 ['wish',
  'draw',
  'deep',
  'exactly',
  'gas',
  'environmental',
  'southern',
  'appear',
  'million',
  'however',
  'watch',
  'size',
  'prepare',
  'official',
  'new',
  'kitchen',
  'group',
  'million',
  'big',
  'tax',
  'avoid'],
 ['several',
  's

In [63]:
all_topics

[(0,
  '0.016*"health" + 0.008*"table" + 0.008*"task" + 0.008*"choice" + 0.008*"upon"'),
 (1,
  '0.011*"run" + 0.008*"main" + 0.008*"whether" + 0.008*"way" + 0.008*"professional"'),
 (2,
  '0.013*"history" + 0.013*"market" + 0.013*"congress" + 0.013*"large" + 0.007*"check"'),
 (3,
  '0.010*"million" + 0.010*"building" + 0.010*"point" + 0.010*"single" + 0.010*"big"'),
 (4,
  '0.014*"later" + 0.014*"reduce" + 0.009*"space" + 0.009*"cold" + 0.009*"try"'),
 (0,
  '0.008*"high" + 0.008*"try" + 0.008*"scientist" + 0.008*"drive" + 0.008*"site"'),
 (1,
  '0.008*"item" + 0.008*"capital" + 0.007*"central" + 0.006*"significant" + 0.006*"network"'),
 (2,
  '0.007*"tough" + 0.006*"book" + 0.006*"view" + 0.006*"indicate" + 0.005*"think"'),
 (3,
  '0.006*"exactly" + 0.006*"present" + 0.006*"anything" + 0.006*"week" + 0.006*"letter"'),
 (4,
  '0.007*"language" + 0.007*"ten" + 0.006*"time" + 0.006*"western" + 0.006*"writer"'),
 (0,
  '0.009*"kitchen" + 0.009*"within" + 0.009*"heavy" + 0.007*"least" + 0

In [57]:
# Extract topic words
topic_words = set()
for topic in all_topics:
    words = topic[1].split(" + ")
    for word in words:
        topic_words.add(word.split("*")[1][1:-1])

In [58]:

# Get word frequencies over time
word_frequencies = get_word_frequencies(topic_words, all_processed_texts)

In [59]:
# Evaluate declining themes
declining_themes = []
ratios = [(word, word_frequencies[word] / len(all_processed_texts)) for word in topic_words]
ratios.sort(key=lambda x: x[1], reverse=True)


In [61]:
ratios

[('high', 0.09),
 ('mr', 0.08),
 ('within', 0.07666666666666666),
 ('south', 0.07333333333333333),
 ('letter', 0.07333333333333333),
 ('exactly', 0.07),
 ('million', 0.07),
 ('lose', 0.06666666666666667),
 ('smile', 0.06666666666666667),
 ('least', 0.06666666666666667),
 ('table', 0.06333333333333334),
 ('later', 0.06333333333333334),
 ('source', 0.06333333333333334),
 ('consumer', 0.06333333333333334),
 ('reduce', 0.06333333333333334),
 ('free', 0.06333333333333334),
 ('yes', 0.06333333333333334),
 ('professor', 0.06333333333333334),
 ('experience', 0.06),
 ('task', 0.06),
 ('various', 0.06),
 ('travel', 0.06),
 ('run', 0.06),
 ('central', 0.06),
 ('day', 0.06),
 ('grow', 0.06),
 ('boy', 0.06),
 ('network', 0.06),
 ('national', 0.056666666666666664),
 ('carry', 0.056666666666666664),
 ('detail', 0.056666666666666664),
 ('upon', 0.056666666666666664),
 ('five', 0.056666666666666664),
 ('able', 0.056666666666666664),
 ('congress', 0.056666666666666664),
 ('tend', 0.056666666666666664),


In [62]:



for word, ratio in ratios:
    if ratio > 0.05:  # Adjust threshold according to the analysis needs
        declining_themes.append(word)

print("Declining themes:", declining_themes)

Declining themes: ['high', 'mr', 'within', 'south', 'letter', 'exactly', 'million', 'lose', 'smile', 'least', 'table', 'later', 'source', 'consumer', 'reduce', 'free', 'yes', 'professor', 'experience', 'task', 'various', 'travel', 'run', 'central', 'day', 'grow', 'boy', 'network', 'national', 'carry', 'detail', 'upon', 'five', 'able', 'congress', 'tend', 'staff', 'turn', 'modern', 'language', 'growth', 'cell', 'throughout', 'road', 'present', 'start', 'individual', 'foot', 'view', 'benefit', 'approach', 'water', 'brother', 'large', 'tough', 'rule', 'skin', 'house', 'painting']


### question 4  **What is the sentiment over those themes?

In [71]:
from textblob import TextBlob

In [64]:


# Replace with the list of declining themes from question 3
declining_themes = ['high', 'mr', 'within', 'south', 'letter', 'exactly', 'million', 'lose', 'smile']



In [67]:
data['tokens'][0]

['Issue conference drop forget reduce affect feel doctor. Owner head hope expert probably either art stuff.',
 'Director series threat effort technology. Truth manage various long game society.',
 'Where prepare think control fly. Suddenly must because follow too middle. Must pattern consumer administration which who.',
 'Ever change though new. Military east why low. Guy dog positive fall my.',
 'Husband exist thank little break. New career surface blood American. Could couple executive position record experience. Party front phone keep hear material.',
 'Interview rise on something. Do sea remember mention source car no. Wonder media could college admit phone.',
 'Commercial pay late point require child. Camera more picture step fight.',
 'Serious oil much certainly indeed vote security. Total group month offer government. Message near baby skill either can production.',
 'Imagine onto light culture. Standard particularly cup enter eight second.']

In [68]:
# Preprocess the text to obtain the tokens for each section
data['tokens'] = data['Section1'] + data['Section1A'] + data['Section7']
data['tokens'] = data['tokens'].apply(lambda x: preprocess(x[0]))


In [69]:

# Check if a section contains any of the declining themes
data['contains_declining_theme'] = data['tokens'].apply(lambda x: any(theme in x for theme in declining_themes))


In [72]:

# Compute sentiment polarity and subjectivity for each section using TextBlob
data['sentiment_polarity'] = data['tokens'].apply(lambda x: TextBlob(' '.join(x)).sentiment.polarity)
data['sentiment_subjectivity'] = data['tokens'].apply(lambda x: TextBlob(' '.join(x)).sentiment.subjectivity)

In [73]:
# Filter the sections that contain declining themes
declining_theme_sections = data[data['contains_declining_theme']]

In [75]:

# Calculate average sentiment polarity and subjectivity
average_sentiment_polarity = declining_theme_sections['sentiment_polarity'].mean()
average_sentiment_subjectivity = declining_theme_sections['sentiment_subjectivity'].mean()

In [76]:




print(f'Average sentiment polarity for sections containing declining themes: {average_sentiment_polarity:.2f}')
print(f'Average sentiment subjectivity for sections containing declining themes: {average_sentiment_subjectivity:.2f}')

Average sentiment polarity for sections containing declining themes: 0.12
Average sentiment subjectivity for sections containing declining themes: 0.44
