# Imports

In [1]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd

# Load Data

In [15]:
# 1. Load all .txt files and metadata
# Builds three parallel lists, one entry per transcript file:
#   documents[i] - full text of the file
#   companies[i] - name of the sub-directory of base_dir the file was found in
#   quarters[i]  - first two '-'-separated fields of the file name
base_dir = "../data"
documents = []
companies = []
quarters = []

# os.listdir() and glob.glob() return entries in arbitrary, filesystem-dependent
# order. Sorting both keeps the row order of every downstream matrix/DataFrame
# identical across machines and re-runs.
for company in sorted(os.listdir(base_dir)):
    company_path = os.path.join(base_dir, company)
    if not os.path.isdir(company_path):
        continue  # only directories are treated as companies
    for file_path in sorted(glob.glob(f"{company_path}/*.txt")):
        # latin1 maps every byte to a character, so reading never raises a
        # decode error. NOTE(review): confirm the files are not actually UTF-8.
        with open(file_path, "r", encoding="latin1") as f:
            documents.append(f.read())
        companies.append(company)
        # Parse the file name once; a name with no '-' still raises IndexError.
        name_fields = os.path.basename(file_path).split('-')
        quarters.append(name_fields[0] + "-" + name_fields[1])  # e.g., "Q1-2023"

# 2. TF-IDF vectorization over unigrams and bigrams
# Hyperparameters are gathered in one mapping so they can be read and tuned in one place.
tfidf_options = {
    "stop_words": 'english',                 # scikit-learn's built-in English stop-word list
    "ngram_range": (1, 2),                   # unigrams and bigrams
    "max_features": 1000,                    # upper bound on vocabulary size
    "min_df": 2,                             # a term must occur in at least 2 documents
    "max_df": 0.9,                           # ...and in at most 90% of the documents
    "token_pattern": r'(?u)\b[a-zA-Z]+\b',   # purely alphabetic tokens of any length
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the loaded corpus and weight it in one pass:
# rows follow the order of `documents`, columns follow `tfidf_terms`.
tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_terms = vectorizer.get_feature_names_out()

# 3. Reduce the TF-IDF matrix to 50 components with truncated SVD
# (random_state is fixed so the decomposition is repeatable).
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

# 4. Assemble the reduced features together with the per-document metadata.
# Component columns keep their default integer labels; the two metadata
# columns are appended after them, in this order.
df = pd.DataFrame(svd_matrix).assign(company=companies, quarter=quarters)

In [16]:
# Dense, term-labelled view of the TF-IDF matrix: one column per vocabulary
# term, followed by the company/quarter metadata of each document.

# "company" and "quarter" are ordinary words that may themselves end up in the
# vocabulary. Assigning metadata under an existing label would silently
# overwrite that term's TF-IDF scores, so fail loudly instead.
clashing_terms = sorted({"company", "quarter"}.intersection(tfidf_terms))
if clashing_terms:
    raise ValueError(
        f"TF-IDF vocabulary contains metadata column name(s): {clashing_terms}"
    )

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)
tfidf_df["company"] = companies
tfidf_df["quarter"] = quarters

# TODO
We need to decide which terms to filter out of the TF-IDF vocabulary, for example single-letter tokens and speaker names such as "LISA" (the first name of a CEO).

In [18]:
# Print the fitted TF-IDF vocabulary (feature names) for manual inspection.
print(tfidf_terms)

['ability' 'able' 'absolutely' 'accelerate' 'accelerated' 'access'
 'acquisition' 'actions' 'activities' 'activity' 'actually' 'ad' 'add'
 'added' 'addition' 'additional' 'address' 'adjusted' 'adoption' 'ads'
 'advanced' 'advantage' 'advertisers' 'advertising' 'affiliated'
 'affiliated companies' 'afternoon' 'ago' 'ai' 'air' 'air lines'
 'aircraft' 'airline' 'airlines' 'airlines earnings' 'airlines group'
 'amd' 'america' 'american' 'american airlines' 'analyst' 'andrew'
 'announced' 'annual' 'answer' 'applications' 'appreciate' 'approach'
 'approximately' 'approximately billion' 'apps' 'april' 'area' 'areas'
 'ask' 'asset' 'assets' 'associated' 'average' 'azure' 'balance'
 'balance sheet' 'bank' 'banking' 'barnum' 'barnum chief' 'base'
 'basically' 'basis' 'basis points' 'begin' 'beginning' 'benefit'
 'benefits' 'best' 'big' 'bigger' 'billion billion' 'billion year' 'board'
 'bob' 'boeing' 'book' 'bookings' 'brand' 'brett' 'bring' 'bringing'
 'broad' 'bruce' 'build' 'building' 'built'

In [17]:
# Preview the first rows of the term-level TF-IDF frame.
tfidf_df.head()

Unnamed: 0,ability,able,absolutely,accelerate,accelerated,access,acquisition,actions,activities,activity,...,www,www callstreet,www refinitiv,yeah,year s,yes,yield,youtube,company,quarter
0,0.003738,0.018072,0.0,0.0,0.00498,0.005291,0.0,0.200377,0.00629,0.0,...,0.0,0.0,0.0,0.0,0.004696,0.068186,0.00559,0.0,3M,Q1-2023
1,0.003669,0.028387,0.005444,0.004712,0.009777,0.0,0.005623,0.0638,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018441,0.119004,0.0,0.0,3M,Q1-2024
2,0.004187,0.020244,0.0,0.0,0.0,0.005926,0.0,0.115259,0.0,0.012325,...,0.0,0.0,0.0,0.0,0.0,0.076378,0.012524,0.0,3M,Q2-2023
3,0.018205,0.017606,0.0,0.0,0.0,0.0,0.006975,0.01319,0.022979,0.013399,...,0.0,0.0,0.0,0.006202,0.005719,0.018452,0.013615,0.0,3M,Q2-2024
4,0.012224,0.039406,0.0,0.005233,0.005429,0.005768,0.0,0.094467,0.0,0.005998,...,0.0,0.0,0.0,0.0,0.0,0.070207,0.030473,0.0,3M,Q3-2023


In [11]:
# Preview the first rows of the SVD-reduced feature frame.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,company,quarter
0,0.386147,-0.089628,-0.02964,-0.232534,0.078826,-0.164527,0.18076,-0.068727,0.250279,-0.16735,...,-0.039736,-0.058058,-0.038541,-0.033137,0.054544,0.041494,0.027568,-0.010882,3M,Q1-2023
1,0.368425,-0.096676,-0.037689,-0.253953,0.088661,-0.174781,0.154138,-0.063419,0.24532,-0.156168,...,0.020505,0.004807,0.072851,0.02626,-0.03361,0.006023,-0.064832,0.0354,3M,Q1-2024
2,0.383213,-0.088868,-0.034083,-0.251979,0.103883,-0.180724,0.18135,-0.07221,0.244447,-0.171659,...,-0.043107,-0.040514,-0.040197,0.029044,0.022915,0.021052,-0.007279,-0.045466,3M,Q2-2023
3,0.459565,-0.074357,-0.056754,-0.241421,0.020082,-0.178733,0.122672,-0.063598,0.238666,-0.177607,...,0.111287,0.063363,-0.066542,0.037426,0.032765,-0.023974,-0.017382,0.0023,3M,Q2-2024
4,0.371547,-0.086633,-0.037332,-0.246429,0.074185,-0.176502,0.182363,-0.071818,0.257649,-0.174626,...,-0.06054,0.012191,0.005026,-0.030554,-0.037274,0.010453,-0.010907,0.000605,3M,Q3-2023


# Preprocessing