In [33]:
from sec_edgar_downloader import Downloader
import os
import matplotlib.pyplot as plt
import pandas as pd
import requests
import nltk
from nltk.corpus import stopwords
from lxml import etree
import io
import re
from bs4 import BeautifulSoup
import openai
import requests
import json


### Download the complete submission data from the SEC-EDGAR

In [7]:
# Contact e-mail address sent to the SEC with every request
email = "shradhagodse125@gmail.com"

# Requester name sent to the SEC together with the e-mail address.
# In sec-edgar-downloader v5 the FIRST positional argument of Downloader is
# this name (used for the User-Agent header), not a download path.
requester_name = "Personal Research"

# Initialize the downloader. The third argument is the download root; the
# library stores filings in a "sec-edgar-filings" sub-folder of that root,
# i.e. <cwd>/sec-edgar-filings/<ticker>/10-K/<accession number>/.
dl = Downloader(requester_name, email, os.getcwd())

# Company tickers to download: Apple, Microsoft and Alphabet (Google)
tickers = ["AAPL", "MSFT", "GOOGL"]

# Function to download 10-K filings
def download_10k(tickers, start_year=1995, end_year=2023):
    """Download 10-K filings for every ticker, one calendar year per request.

    Uses the notebook-level ``dl`` downloader instance.

    Args:
        tickers: Iterable of ticker symbols to fetch.
        start_year: First calendar year to request (inclusive).
        end_year: Last calendar year to request (inclusive).
    """
    for symbol in tickers:
        for yr in range(start_year, end_year + 1):
            # Each request is limited to a single calendar-year window
            window_start, window_end = f"{yr}-01-01", f"{yr}-12-31"
            dl.get("10-K", symbol, after=window_start, before=window_end)

# Run the download for every ticker with the default 1995-2023 range
# (download_10k issues one dl.get request per ticker per year)
download_10k(tickers)

This created a folder called `sec-edgar-filings` on my local machine and saved the data there. Let us preview the contents of one of Apple's 10-K filings to confirm that the data was downloaded correctly.

In [66]:
def preview_file(file_path, lines=10):
    """Print up to the first ``lines`` lines of the file at ``file_path``.

    Each line is printed with surrounding whitespace stripped. Printing stops
    at end of file, so a file shorter than ``lines`` does not produce trailing
    blank lines.

    Args:
        file_path: Path of the UTF-8 text file to preview.
        lines: Maximum number of lines to print.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file):
            if line_number >= lines:
                break
            print(line.strip())

# Path to a specific 10-K file (accession number 0000320193-17-000070).
# The path is built from the download root only: placing os.getcwd()-based
# components in front of an absolute "D:/..." string makes os.path.join
# discard them on Windows and produce a non-existent path on other platforms.
file_path = os.path.join(
    "D:/sec-edgar-filings", "AAPL", "10-K", "0000320193-17-000070", "full-submission.txt"
)
preview_file(file_path)


<SEC-DOCUMENT>0000320193-17-000070.txt : 20171103
<SEC-HEADER>0000320193-17-000070.hdr.sgml : 20171103
<ACCEPTANCE-DATETIME>20171103080137
ACCESSION NUMBER:		0000320193-17-000070
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		97
CONFORMED PERIOD OF REPORT:	20170930
FILED AS OF DATE:		20171103
DATE AS OF CHANGE:		20171103



To perform further analysis and generate meaningful visualizations, let us merge the data from the three companies and store it in a pandas DataFrame.

### Data Merge

In [3]:
# Root folder holding the downloaded filings; read_and_merge_files walks it recursively
download_dir = "D:/sec-edgar-filings/"

def _filing_year_from_folder(folder_name):
    """Derive the filing year from an accession-number style folder name.

    The second hyphen-separated token is used (e.g. ``17`` in
    ``0000320193-17-000070``).

    Returns:
        An ``int`` year when the token is numeric, otherwise the raw token.

    Raises:
        IndexError: If ``folder_name`` contains no hyphen.
    """
    token = folder_name.split('-')[1]
    if not token.isdigit():
        return token  # not numeric: keep whatever the folder name carried
    value = int(token)
    if len(token) == 4:
        return value  # already a full four-digit year
    # Two-digit years: EDGAR accession numbers begin in 1993, so 93-99 belong
    # to the 1900s and everything below 93 to the 2000s.
    return 1900 + value if value >= 93 else 2000 + value


def read_and_merge_files(directory):
    """Read every file below ``directory`` into a single DataFrame.

    The expected layout is ``<directory>/<company>/<form>/<accession>/<file>``:
    the company is taken from the folder two levels above the filing folder and
    the year from the filing folder's name.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        A DataFrame with the columns ``Company``, ``Filing Year`` and ``Text``,
        one row per successfully read file. The columns are present even when
        no file could be read (the frame is then empty).
    """
    columns = ['Company', 'Filing Year', 'Text']
    records = []
    print(f"Looking for files in {directory}")
    for root, _dirs, files in os.walk(directory):
        if not files:
            print(f"No files found in {root}")
            continue
        for file in files:
            path = os.path.join(root, file)
            print(f"Processing file: {path}")
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    text = f.read()
                # The company name is the folder two levels above the filing folder
                company = os.path.basename(os.path.dirname(os.path.dirname(root)))
                full_year = _filing_year_from_folder(os.path.basename(root))
            except (OSError, ValueError, IndexError) as e:
                # Best effort: report unreadable / undecodable / unexpectedly
                # placed files and keep going with the rest.
                print(f"Failed to read file {file}: {e}")
                continue
            records.append({'Company': company, 'Filing Year': full_year, 'Text': text})
    if not records:
        print("No data to concatenate. Returning empty DataFrame.")
    # Build the frame once from plain records instead of concatenating
    # one single-row DataFrame per file.
    return pd.DataFrame(records, columns=columns)

# Build the combined DataFrame from everything found under download_dir
df = read_and_merge_files(download_dir)

# Report how the merge went
summary_message = (
    "DataFrame is empty."
    if df.empty
    else f"DataFrame created with {len(df)} records."
)
print(summary_message)

Looking for files in D:/sec-edgar-filings/
No files found in D:/sec-edgar-filings/
No files found in D:/sec-edgar-filings/AAPL
No files found in D:/sec-edgar-filings/AAPL\10-K
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-17-000070\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-18-000145\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-19-000119\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-20-000096\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-21-000105\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-22-000108\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-23-000106\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-95-000016\full-submission.txt
Processing file: D:/sec-edgar-filings/AAPL\10-K\0000320193-96-000023\full-submission.txt
Processing file: D:/sec

The company name, filing year, and filing text are extracted into three separate columns.

The folder name 0000320193-18-000145 is an SEC accession number, the unique identifier that the Securities and Exchange Commission (SEC) assigns to each filing. Here is a breakdown of what its components represent:

- Filer ID (CIK): 0000320193 is the Central Index Key (CIK) of the entity that submitted the filing, which is either the company itself or a filing agent acting on its behalf. This particular CIK corresponds to Apple Inc.
- Year: 18 indicates that the filing was submitted in 2018.
- Sequence Number: 000145 is a sequential count that distinguishes this filing from the other filings submitted by the same filer in that year.

### Read the first 5 records of the dataset

In [5]:
df.head()

Unnamed: 0,Company,Filing Year,Text
0,AAPL,2017,<SEC-DOCUMENT>0000320193-17-000070.txt : 20171...
1,AAPL,2018,<SEC-DOCUMENT>0000320193-18-000145.txt : 20181...
2,AAPL,2019,<SEC-DOCUMENT>0000320193-19-000119.txt : 20191...
3,AAPL,2020,<SEC-DOCUMENT>0000320193-20-000096.txt : 20201...
4,AAPL,2021,<SEC-DOCUMENT>0000320193-21-000105.txt : 20211...


### Data Preprocessing

In [6]:
def extract_and_clean_text(xml_content):
    """Parse markup leniently and return its text content as one string.

    Args:
        xml_content: Markup to parse, given as a string.

    Returns:
        The text gathered by ``extract_text_from_xml`` from the parsed root.
        Failures are not raised: a message starting with
        ``"XML Parsing Error:"`` or ``"Unexpected Error:"`` is returned instead.
    """
    try:
        # recover=True lets lxml keep going past malformed markup
        lenient_parser = etree.XMLParser(recover=True)
        document = etree.parse(io.StringIO(xml_content), lenient_parser)
        top = document.getroot()
        return (
            extract_text_from_xml(top)
            if top is not None
            else "XML Parsing Error: No root element found"
        )
    except etree.XMLSyntaxError as syntax_err:
        return f"XML Parsing Error: {str(syntax_err)}"
    except Exception as err:
        return f"Unexpected Error: {str(err)}"

def extract_text_from_xml(element):
    """Collect the text of ``element`` and all of its descendants.

    Text is gathered in document order: an element's own ``text``, then its
    children (recursively), then its ``tail``. Every piece is stripped, empty
    pieces are dropped, and the rest are joined with single spaces.

    The tree is walked with an explicit stack rather than recursion, so very
    deeply nested documents cannot hit Python's recursion limit.

    Args:
        element: Root element to read (any object exposing ``text``, ``tail``
            and iteration over children), or ``None``.

    Returns:
        The joined text, or an empty string when ``element`` is ``None`` or
        holds no text.
    """
    if element is None:
        return ''
    text_parts = []
    # Each entry is (node, subtree_done). A node is visited twice: first to
    # emit its text and schedule its children, then to emit its tail.
    pending = [(element, False)]
    while pending:
        node, subtree_done = pending.pop()
        if subtree_done:
            if node.tail:
                text_parts.append(node.tail.strip())
            continue
        if node.text:
            text_parts.append(node.text.strip())
        pending.append((node, True))
        # Reversed so that children are popped in document order
        pending.extend((child, False) for child in reversed(list(node)))
    return ' '.join(filter(None, text_parts))

# 'df' is the merged DataFrame built earlier; its 'Text' column holds the raw
# markup of each filing.

# Derive a plain-text version of every filing and store it in a new column
df['Cleaned Text'] = df['Text'].map(extract_and_clean_text)

# Show the raw and the cleaned text side by side
print(df[['Text', 'Cleaned Text']])

                                                 Text  \
0   <SEC-DOCUMENT>0000320193-17-000070.txt : 20171...   
1   <SEC-DOCUMENT>0000320193-18-000145.txt : 20181...   
2   <SEC-DOCUMENT>0000320193-19-000119.txt : 20191...   
3   <SEC-DOCUMENT>0000320193-20-000096.txt : 20201...   
4   <SEC-DOCUMENT>0000320193-21-000105.txt : 20211...   
..                                                ...   
59  <SEC-DOCUMENT>0001564590-18-019062.txt : 20180...   
60  <SEC-DOCUMENT>0001564590-19-027952.txt : 20190...   
61  <SEC-DOCUMENT>0001564590-20-034944.txt : 20200...   
62  <SEC-DOCUMENT>0001564590-21-039151.txt : 20210...   
63  <SEC-DOCUMENT>0001564590-22-026876.txt : 20220...   

                                         Cleaned Text  
0   0000320193-17-000070.txt : 20171103 0000320193...  
1   0000320193-18-000145.txt : 20181105 0000320193...  
2   0000320193-19-000119.txt : 20191031 0000320193...  
3   0000320193-20-000096.txt : 20201030 0000320193...  
4   0000320193-21-000105.txt : 2021

### Write the cleaned data to a local directory

In [8]:
# Folder that receives one plain-text file per DataFrame row
output_dir = 'output_texts'
os.makedirs(output_dir, exist_ok=True)

# Write each row's cleaned text to text_<row index>.txt
for index, cleaned_value in df['Cleaned Text'].items():
    file_path = os.path.join(output_dir, f"text_{index}.txt")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(cleaned_value))

In [22]:
def clean_financial_text(input_file):
    """Load a UTF-8 text file and return a cleaned-up copy of its contents.

    The clean-up removes HTML tags, rewrites ticked Yes/No checkbox pairs as
    ``:Yes.`` / ``:No.``, collapses blank lines, drops bracketed spans together
    with their contents, and squeezes runs of spaces.

    Args:
        input_file: Path of the file to read.

    Returns:
        The cleaned text.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        clean_text = file.read()

    # Applied in order; each entry is (pattern, replacement)
    substitutions = (
        ('<.*?>', ''),                        # HTML tags
        (r'Yes ☒ No ☐', ':Yes.'),             # "Yes" box ticked
        (r'Yes ☐ No ☒', ':No.'),              # "No" box ticked
        (r'\n\s*\n', '\n'),                   # blank lines -> single newline
        (r'\[.*?\]|\{.*?}|\(.*?\)', ''),      # brackets and what they enclose
    )
    for pattern, replacement in substitutions:
        clean_text = re.sub(pattern, replacement, clean_text)

    # Trim both ends, then squeeze repeated spaces
    return re.sub(' +', ' ', clean_text.strip())

# Source folder with the text dumps and destination folder for the cleaned copies
input_dir = 'D:/output_texts'
output_dir = 'D:/cleaned_output_texts'

# Make sure the destination exists before writing into it
os.makedirs(output_dir, exist_ok=True)

# Clean every .txt file and save it under the same name in output_dir
for filename in os.listdir(input_dir):
    file_path = os.path.join(input_dir, filename)
    if not filename.endswith('.txt'):
        continue  # skip anything that is not a text dump
    cleaned_text = clean_financial_text(file_path)
    output_file_path = os.path.join(output_dir, filename)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)
    print(f"Cleaned {filename} and saved to {output_file_path}")

Cleaned text_0.txt and saved to D:/cleaned_output_texts\text_0.txt
Cleaned text_1.txt and saved to D:/cleaned_output_texts\text_1.txt
Cleaned text_10.txt and saved to D:/cleaned_output_texts\text_10.txt
Cleaned text_11.txt and saved to D:/cleaned_output_texts\text_11.txt
Cleaned text_12.txt and saved to D:/cleaned_output_texts\text_12.txt
Cleaned text_13.txt and saved to D:/cleaned_output_texts\text_13.txt
Cleaned text_14.txt and saved to D:/cleaned_output_texts\text_14.txt
Cleaned text_15.txt and saved to D:/cleaned_output_texts\text_15.txt
Cleaned text_16.txt and saved to D:/cleaned_output_texts\text_16.txt
Cleaned text_17.txt and saved to D:/cleaned_output_texts\text_17.txt
Cleaned text_18.txt and saved to D:/cleaned_output_texts\text_18.txt
Cleaned text_19.txt and saved to D:/cleaned_output_texts\text_19.txt
Cleaned text_2.txt and saved to D:/cleaned_output_texts\text_2.txt
Cleaned text_20.txt and saved to D:/cleaned_output_texts\text_20.txt
Cleaned text_21.txt and saved to D:/clea

In [23]:
def clean_financial_text(text_data):
    """Return a cleaned-up copy of ``text_data``.

    The clean-up removes HTML tags, rewrites ticked Yes/No checkbox pairs as
    ``:Yes.`` / ``:No.``, collapses blank lines, drops bracketed spans together
    with their contents, and squeezes runs of spaces.

    Args:
        text_data: The text to clean.

    Returns:
        The cleaned text.
    """
    # Applied in order; each entry is (pattern, replacement)
    ordered_rules = [
        ('<.*?>', ''),                        # HTML tags
        (r'Yes ☒ No ☐', ':Yes.'),             # "Yes" box ticked
        (r'Yes ☐ No ☒', ':No.'),              # "No" box ticked
        (r'\n\s*\n', '\n'),                   # blank lines -> single newline
        (r'\[.*?\]|\{.*?}|\(.*?\)', ''),      # brackets and what they enclose
    ]
    result = text_data
    for pattern, replacement in ordered_rules:
        result = re.sub(pattern, replacement, result)
    # Trim both ends, then squeeze repeated spaces
    trimmed = result.strip()
    return re.sub(' +', ' ', trimmed)

# df is the DataFrame built in the earlier cells.
# Apply the cleaning function to the 'Cleaned Text' column and store the result in 'Updated Text'
df['Updated Text'] = df['Cleaned Text'].apply(clean_financial_text)

In [32]:
df.head()

Unnamed: 0,Company,Filing Year,Text,Cleaned Text,Updated Text
0,AAPL,2017,<SEC-DOCUMENT>0000320193-17-000070.txt : 20171...,0000320193-17-000070.txt : 20171103 0000320193...,0000320193-17-000070.txt : 20171103 0000320193...
1,AAPL,2018,<SEC-DOCUMENT>0000320193-18-000145.txt : 20181...,0000320193-18-000145.txt : 20181105 0000320193...,0000320193-18-000145.txt : 20181105 0000320193...
2,AAPL,2019,<SEC-DOCUMENT>0000320193-19-000119.txt : 20191...,0000320193-19-000119.txt : 20191031 0000320193...,0000320193-19-000119.txt : 20191031 0000320193...
3,AAPL,2020,<SEC-DOCUMENT>0000320193-20-000096.txt : 20201...,0000320193-20-000096.txt : 20201030 0000320193...,0000320193-20-000096.txt : 20201030 0000320193...
4,AAPL,2021,<SEC-DOCUMENT>0000320193-21-000105.txt : 20211...,0000320193-21-000105.txt : 20211029 0000320193...,0000320193-21-000105.txt : 20211029 0000320193...


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company       64 non-null     object
 1   Filing Year   64 non-null     int64 
 2   Text          64 non-null     object
 3   Cleaned Text  64 non-null     object
 4   Updated Text  64 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [40]:
# Show the row(s) whose Company is "AAPL" and whose Filing Year is 2020
print(df[(df['Company'] == "AAPL") & (df['Filing Year'] == 2020)])

  Company  Filing Year                                               Text  \
3    AAPL         2020  <SEC-DOCUMENT>0000320193-20-000096.txt : 20201...   

                                        Cleaned Text  \
3  0000320193-20-000096.txt : 20201030 0000320193...   

                                        Updated Text  
3  0000320193-20-000096.txt : 20201030 0000320193...  


In [41]:
# Hugging Face Inference API endpoint of the question-answering model
API_URL = "https://api-inference.huggingface.co/models/consciousAI/question-answering-roberta-base-s-v2"

# Read the access token from the environment instead of hard-coding it:
# a token stored in the notebook is exposed to everyone who can read the file,
# and any token that was previously committed here should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face access token.")
headers = {"Authorization": f"Bearer {hf_token}"}

# Specify the target company and year
target_company = "AAPL"
target_year = 2020

# Filter the DataFrame to get the relevant text
filtered_df = df[(df['Company'] == target_company) & (df['Filing Year'] == target_year)]

if not filtered_df.empty:
    # If several rows match, only the first one is used
    text_data = filtered_df.iloc[0]['Updated Text']
    print(f"Processing data for {target_company} from {target_year}:")

    # Define questions
    questions = [
        "What was the total revenue of Apple in the fiscal year 2020?",
        "What are the key competitive factors for Apple according to the report?",
        "What risk factors does Apple face according to their 10-K report?"
    ]

    # NOTE(review): the whole filing is sent as context. Extractive QA models
    # typically have a limited input window, so sending only the relevant
    # section may give better answers; confirm against the model card.
    for question in questions:
        payload = json.dumps({
            "inputs": {
                "question": question,
                "context": text_data
            }
        })
        try:
            # The timeout keeps the cell from hanging forever on a stalled request
            response = requests.post(API_URL, headers=headers, data=payload, timeout=60)
            response_data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure or a non-JSON body (e.g. an HTML error page)
            print(f"Request failed for question {question!r}: {exc}")
            continue

        # Only a dict payload can carry an 'answer' key; lists and strings are
        # reported as unexpected responses.
        if isinstance(response_data, dict) and 'answer' in response_data:
            print(f"Question: {question}\nAnswer: {response_data['answer']}\n")
        else:
            # Print error or entire response if 'answer' key is not present
            print("Error or unexpected response:", response_data)
else:
    print(f"No data found for {target_company} in {target_year}.")

Processing data for AAPL from 2020:
Question: What was the total revenue of Apple in the fiscal year 2020?
Answer: $14.3 billion

Question: What are the key competitive factors for Apple according to the report?
Answer: increased competition

Question: What risk factors does Apple face according to their 10-K report?
Answer: risks and uncertainties



For the SEC document, we can focus on the sections individually, like the business summary, financial data, or risk factors.

#### Analysis Goals
Determine what insights or information you are interested in extracting. For example:

- Sentiment Analysis: Assess the sentiment of the text regarding Microsoft's market outlook or financial health.
- Entity Recognition: Extract names of companies, places, or dates mentioned in the text.
- Summarization: Get a concise summary of the entire document or specific sections.
- Keyword Extraction: Identify key terms that frequently appear in the document to understand focus areas.




This analysis can help stakeholders understand key aspects of the company's health and strategy without needing to parse through the entire document manually. For investors or analysts, this can save time and provide enhanced decision-making tools based on the extracted data trends.

This approach allows you to leverage the power of modern NLP tools to efficiently process and analyze large volumes of text data, turning unstructured text into actionable insights.

### Using Hugging Face's API for Sentiment Analysis