In [5]:
from sec_edgar_downloader import Downloader
import os
import pandas as pd
import re

In [7]:
# Search terms ("Suchbegriffe") whose occurrences are counted in the filing text.
# The list is grouped into three categories: uncategorized, external and internal
# digital vocabulary. Each term later becomes one result key / column.
# NOTE(review): the counting code lower-cases both the terms and the filing text,
# so upper-case acronyms such as "IT", "AI", "OS" or "IP" also match the ordinary
# lower-case words "it", "ai", "os", "ip" — confirm this is intended.
suchbegriffe = [
    # Digital (uncategorized)
    "algorithm", "analytics", "app", "artificial intelligence", "AI", "automate", "automatize", "beta", "version",
    "big data", "bitcoin", "byte", "cashless", "chip", "cloud", "cloud computing", "cloud-management", "code",
    "coding", "computer", "crypto", "cyber", "data call", "data science", "digital", "digitalization",
    "digitization", "digitize", "dot com", "download", "e-learning", "e-mail", "email", "electronic network",
    "geo-fencing", "gigabyte", "inbox", "information technology", "IT", "interface", "internet",
    "internet of things", "IoT", "internet protocol", "IP", "laptop", "metaverse", "mobile", "mobile phone",
    "non-fungible token", "NFT", "online", "open source", "operating system", "OS", "platform", "post",
    "posting", "remote", "robot", "search engine advertising", "SEA", "search engine optimization", "SEO",
    "self-driving", "smartphone", "SMS", "social network", "stream", "streaming", "tablet", "tag", "tagging",
    "techie", "track", "tracking", "user experience", "virtual", "web", "WiFi", "wireless", "world wide web", "WWW",

    # Digital (external)
    "blog", "chatbot", "click", "content filter", "content marketing", "cookie", "crowdsource", "crowdsourcing",
    "e-book", "ecommerce", "e-commerce", "electronic market", "homepage", "landing page",
    "location based targeting", "new media", "paid search", "product review", "recommender system", "retarget",
    "retargeting", "search engine", "sharing economy", "smart home", "social media", "tweet",
    "user generated content", "website",

    # Digital (internal)
    "automatic", "blockchain", "compute", "data", "data scientist", "developer", "global positioning system",
    "GPS", "hardware", "machine learning", "programmer", "programming", "remote work", "server", "software",
    "videoconference", "videoconferencing", "webinar"
]

In [8]:
# 📍 Storage location and contact address
# NOTE(review): the first argument is presumably meant as the download folder, but
# in sec-edgar-downloader 5.x the positional parameters appear to be
# (company_name, email_address) with the download folder as an optional third
# argument — confirm against the installed version. The reader cell below expects
# the files under ./sec-edgar-filings/<CIK>/10-K.
dl = Downloader("sec-edgar-filings", "jaspeb97@zedat.fu-berlin.de")

# Apple CIK
unternehmen = "Apple"
cik = "0000320193"

# Download all 10-K reports filed after 2013-01-01
dl.get("10-K", cik, after="2013-01-01")


12

In [16]:
from typing import Any


class Company:
    """One company from the master list, identified by its name and CIK.

    Attributes:
        name: Company name as passed to the constructor.
        cik: CIK identifier as passed to the constructor.
        years: Distinct years recorded through ``add_year``, in insertion order.
        filings: Filings recorded through ``add_filing``, in insertion order.
    """

    def __init__(self, name: str, cik: str):
        self.name, self.cik = name, cik
        # Both collections start empty and are only filled through the add_* methods.
        self.filings: list = []
        self.years: list = []

    def add_filing(self, filing: Any) -> None:
        """Record ``filing``; no de-duplication is performed."""
        self.filings.append(filing)

    def add_year(self, year: str) -> None:
        """Record ``year`` unless it has already been recorded."""
        if year in self.years:
            return
        self.years.append(year)

    def __repr__(self) -> str:
        """Short summary: name, CIK and the number of recorded filings."""
        return f"Company(name={self.name}, cik={self.cik}, filings={len(self.filings)})"


def _integer_text(raw) -> "str | None":
    """Render a spreadsheet cell that holds a whole number as a plain digit string.

    pandas stores an integer column that contains empty cells as float64, so a
    CIK of 320193 arrives as 320193.0; ``str()`` on that value yields "320193.0",
    which breaks zero-padding. This helper drops the spurious fractional part.

    Args:
        raw: Cell value (int, float, numpy scalar, str or missing).

    Returns:
        The digits as a string, or ``None`` when the cell is empty / NaN.
    """
    if isinstance(raw, str):
        text = raw.strip()
        if not text or text.lower() == "nan":
            return None
        # Numeric text that was exported with a trailing ".0".
        return text[:-2] if text.endswith(".0") else text
    if pd.isna(raw):
        return None
    try:
        return str(int(raw))
    except (TypeError, ValueError):
        # Unexpected cell type: fall back to its plain text form.
        return str(raw).strip()


# The first three spreadsheet rows are skipped; the header is on the fourth row.
df = pd.read_excel("Unternehmensliste Masterarbeit Mai 2025.xlsx", skiprows=3)
print(df.head())

# One Company per distinct name ("conm"); the sheet has one row per company and year.
companies: dict[str, Company] = {}

for index, row in df.iterrows():
    company_name = row["conm"]
    cik_digits = _integer_text(row["cik"])
    # CIKs are zero-padded to 10 digits; None marks a missing CIK.
    cik = cik_digits.zfill(10) if cik_digits is not None else None
    year = _integer_text(row["fyear"])

    company = companies.get(company_name)
    if company is None:
        company = Company(company_name, cik)
        companies[company_name] = company
    elif company.cik is None and cik is not None:
        # An earlier row of this company had no CIK; adopt the first one that shows up.
        company.cik = cik

    # Report every row without a CIK, including a company's first row.
    if cik is None:
        print(f"CIK is NaN for {company_name} in year {year}")

    if year is not None:
        company.add_year(year)

print(f"Found {len(companies)} companies in the list.")

# Keep only companies for which a CIK is known.
companies = {k: v for k, v in companies.items() if v.cik is not None}
print(len(companies))

   Unnamed: 0  gvkey  tic                    conm                       conml  \
0         NaN   1161  AMD  ADVANCED MICRO DEVICES  Advanced Micro Devices Inc   
1         NaN   1161  AMD  ADVANCED MICRO DEVICES  Advanced Micro Devices Inc   
2         NaN   1161  AMD  ADVANCED MICRO DEVICES  Advanced Micro Devices Inc   
3         NaN   1161  AMD  ADVANCED MICRO DEVICES  Advanced Micro Devices Inc   
4         NaN   1161  AMD  ADVANCED MICRO DEVICES  Advanced Micro Devices Inc   

        weburl addzip         city  county state  ...       ppent       emp  \
0  www.amd.com  95054  Santa Clara     NaN    CA  ...  38090000.0  134000.0   
1  www.amd.com  95054  Santa Clara     NaN    CA  ...   7000000.0  111000.0   
2  www.amd.com  95054  Santa Clara     NaN    CA  ...   7260000.0  110930.0   
3  www.amd.com  95054  Santa Clara     NaN    CA  ...   6580000.0  103400.0   
4  www.amd.com  95054  Santa Clara     NaN    CA  ...   3460000.0  106710.0   

      cik ggroup    gind  gsector   gs

In [17]:
import html

# 📁 Path to the downloaded Apple 10-K filings
base_path = "./sec-edgar-filings/0000320193/10-K"
ergebnisse = []
# Apple CIK
unternehmen = "Apple"
cik = "0000320193"
# Accession number of the single filing analysed in this cell.
bericht = "0000320193-21-000105"
# One whole-word pattern per search term, compiled once. Terms are lower-cased
# because the filing text is lower-cased below.
# NOTE(review): this makes acronyms case-insensitive ("IT" also counts "it",
# "OS" counts "os", ...), which inflates their counts — confirm this is intended.
patterns = {
    word: re.compile(rf"\b{re.escape(word.lower())}\b") for word in suchbegriffe
}
# 🔁 Search the report
from concurrent.futures import ProcessPoolExecutor
import time


file_path = os.path.join(base_path, bericht, "full-submission.txt")

# Only reading needs the file handle; all processing happens after it is closed.
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
    content = f.read().lower()

# Remove everything that looks like a markup tag, then decode HTML entities.
text = html.unescape(re.sub(r'<[^>]+>', '', content))

# Filing year taken from the submission header; "Unbekannt" if neither pattern matches.
jahr = "Unbekannt"
if (m := re.search(r"filed as of date:\s*(20\d{2})", content)):
    jahr = m.group(1)
elif (m := re.search(r"filed:\s.*?(20\d{2})", content)):
    jahr = m.group(1)

# --- Method 1: one full scan of the text per search term ---------------------
start_time = time.time()
eintrag_einzeln = {"Unternehmen": unternehmen, "Jahr": jahr, "Bericht": bericht}
for wort, pattern in patterns.items():
    eintrag_einzeln[wort] = len(pattern.findall(text))
first_method_time = time.time() - start_time
print("First method time:", first_method_time, "seconds")
print(eintrag_einzeln)

# --- Method 2: a single scan with one combined pattern -----------------------
# A plain alternation consumes the text and takes the first alternative that
# matches, so "cloud computing" would only ever be counted as "cloud" and "web"
# inside "world wide web" would be skipped. Instead:
#   * the alternation sits inside a look-ahead, so nothing is consumed and every
#     start position is examined;
#   * alternatives are ordered longest first, so the longest term starting at a
#     position is captured;
#   * each captured term is also credited to every shorter term that matches as
#     a whole word at its start (e.g. "cloud computing" -> "cloud").
# This yields the same counts as method 1 as long as no term can overlap itself.
start_time = time.time()
eintrag = {"Unternehmen": unternehmen, "Jahr": jahr, "Bericht": bericht}
begriffe_klein = sorted((word.lower() for word in suchbegriffe), key=len, reverse=True)
combined_pattern = re.compile(
    r"(?=\b(" + "|".join(re.escape(begriff) for begriff in begriffe_klein) + r")\b)"
)
laengste_treffer = {}
for treffer in combined_pattern.findall(text):
    laengste_treffer[treffer] = laengste_treffer.get(treffer, 0) + 1
for wort in suchbegriffe:
    eintrag[wort] = 0
for treffer, anzahl in laengste_treffer.items():
    for wort, pattern in patterns.items():
        if pattern.match(treffer):
            eintrag[wort] += anzahl
second_method_time = time.time() - start_time
print("Second method time:", second_method_time, "seconds")

# Share of the first method's runtime that the second method saves, in percent
# (negative when the second method is slower).
if first_method_time > 0:
    percentage_difference = (first_method_time - second_method_time) / first_method_time * 100
else:
    percentage_difference = 0.0
print(f"Percentage difference: {percentage_difference:.2f}%")

print(eintrag)

# Sanity check: both methods are meant to produce identical counts.
abweichungen = {
    wort: (eintrag_einzeln[wort], eintrag[wort])
    for wort in suchbegriffe
    if eintrag_einzeln[wort] != eintrag[wort]
}
if abweichungen:
    print("Methods disagree (first, second):", abweichungen)

# Keep the per-term counts (method 1 is the reference) so the export is not empty.
ergebnisse.append(eintrag_einzeln)

# 🧾 Convert to a DataFrame and save as Excel
# NOTE(review): this rebinds the notebook-global name `df`, which the
# company-list cell also assigns.
df = pd.DataFrame(ergebnisse)
print(df.head())  # preview
df.to_excel("apple_digital_begriffe_vollständig.xlsx", index=False)

First method time: 5.393694162368774 seconds
{'Unternehmen': 'Apple', 'Jahr': '2021', 'Bericht': '0000320193-21-000105', 'algorithm': 0, 'analytics': 0, 'app': 20, 'artificial intelligence': 0, 'AI': 32, 'automate': 0, 'automatize': 0, 'beta': 0, 'version': 1, 'big data': 0, 'bitcoin': 0, 'byte': 0, 'cashless': 1, 'chip': 3, 'cloud': 9, 'cloud computing': 0, 'cloud-management': 0, 'code': 36, 'coding': 0, 'computer': 15, 'crypto': 0, 'cyber': 0, 'data call': 0, 'data science': 0, 'digital': 33, 'digitalization': 0, 'digitization': 0, 'digitize': 0, 'dot com': 0, 'download': 1, 'e-learning': 0, 'e-mail': 0, 'email': 0, 'electronic network': 0, 'geo-fencing': 0, 'gigabyte': 0, 'inbox': 0, 'information technology': 9, 'IT': 112, 'interface': 0, 'internet': 7, 'internet of things': 0, 'IoT': 0, 'internet protocol': 0, 'IP': 19, 'laptop': 0, 'metaverse': 0, 'mobile': 2, 'mobile phone': 0, 'non-fungible token': 0, 'NFT': 1, 'online': 3, 'open source': 0, 'operating system': 7, 'OS': 22, 'pla