Installing the necessary libraries

In [None]:
!pip install sec-edgar-downloader

Downloading the 10-K files

In [2]:
from sec_edgar_downloader import Downloader
import os

def download_10k_filings(
    ticker_company_info,
    start_year=1995,
    end_year=2023,
    download_folder="C:/Users/shivp/OneDrive/Desktop/GTech2/final_extraction",
):
    """Download the 10-K filings of every ticker in ``ticker_company_info``.

    Args:
        ticker_company_info: Mapping of ticker symbol to a
            ``(company_name, email_address)`` tuple. Both values are passed
            to ``Downloader`` when the downloader for that ticker is built.
        start_year: First year requested; sent to the downloader as
            ``after="<start_year>-01-01"``.
        end_year: Last year requested; sent to the downloader as
            ``before="<end_year + 1>-01-01"``.
        download_folder: Folder handed to ``Downloader`` as its third
            argument. The default is the location this notebook has always
            used, so existing calls behave as before.

    Returns:
        None. Progress and per-ticker errors are reported with ``print``.
    """
    for ticker, (company_name, email_address) in ticker_company_info.items():
        # No per-ticker directory is created here: os.makedirs(ticker) would
        # create it relative to the current working directory, not inside
        # download_folder, and nothing in this function writes to it.
        try:
            # Built inside the try block so that a failure while setting up
            # one ticker is reported and the remaining tickers still run.
            downloader = Downloader(company_name, email_address, download_folder)

            # Download the 10-K filings for the specified range of years
            print(f"Downloading 10-K filings for {ticker} for the years {start_year}-{end_year}...")
            downloader.get(
                "10-K",
                ticker,
                after=f"{start_year}-01-01",
                before=f"{end_year + 1}-01-01",
                download_details=False,
            )
        except Exception as e:
            # Best-effort batch: report the problem and move on to the next ticker.
            print(f"Error downloading {ticker} filings: {e}")

# Example usage: every ticker is registered with the same contact e-mail.
contact_email = "shivp036@gmail.com"
company_names = {
    "AAPL": "Apple Inc.",
    "MSFT": "Microsoft Corporation",
    "GOOGL": "Alphabet Inc.",
}
# Pair each company name with the contact e-mail, keyed by ticker.
ticker_company_info = {
    ticker: (company_name, contact_email)
    for ticker, company_name in company_names.items()
}

download_10k_filings(ticker_company_info)


Downloading 10-K filings for AAPL for the years 1995-2023...
Downloading 10-K filings for MSFT for the years 1995-2023...
Downloading 10-K filings for GOOGL for the years 1995-2023...


Cleaning the data files

In [8]:
import html
import os
import re

# Function to remove HTML tags and unnecessary characters from a string
def clean_text(text):
    """Strip markup and punctuation from ``text`` and normalise whitespace.

    Steps, in order:
      1. Replace every ``<...>`` tag with a space.
      2. Decode HTML character references (``&amp;``, ``&nbsp;``, ``&#160;``).
      3. Drop every character that is not a word character, whitespace or ``.``.
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw string, possibly containing HTML/SGML markup.

    Returns:
        The cleaned string; ``""`` when nothing but markup/punctuation remains.
    """
    # A tag becomes a space (not ''), so text from adjacent elements such as
    # "<td>Revenue</td><td>100</td>" does not fuse into "Revenue100".
    # [^<>]* instead of .*? lets a tag span line breaks, while a stray "<"
    # cannot swallow text up to some distant ">".
    cleaned = re.sub(r'<[^<>]*>', ' ', text)

    # Decode entities before punctuation is removed; otherwise "&nbsp;" and
    # "&#160;" would survive as the literal tokens "nbsp" and "160".
    cleaned = html.unescape(cleaned)

    # Remove unnecessary characters (anything but word chars, whitespace, '.')
    cleaned = re.sub(r'[^\w\s.]', '', cleaned)

    # Collapse extra whitespace and strip leading/trailing spaces
    return re.sub(r'\s+', ' ', cleaned).strip()

# Function to read text from a file, clean it, and write to another file
def process_text_file(input_file, output_file):
    """Read ``input_file``, clean its text and write the result to ``output_file``.

    Args:
        input_file: Path of the file to read; decoded as UTF-8.
        output_file: Path of the file to write; created or overwritten, UTF-8.

    Returns:
        None.

    Raises:
        OSError: If either file cannot be opened.
    """
    # errors='replace': a byte sequence that is not valid UTF-8 is turned into
    # U+FFFD instead of raising UnicodeDecodeError, so a single odd file does
    # not abort a whole batch run.
    with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
        text = f.read()

    clean_text_content = clean_text(text)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(clean_text_content)

# Function to process all text files in a directory
def process_files_in_directory(directory, prefix="cleaned_"):
    """Clean every ``.txt`` file found under ``directory`` (recursively).

    For each matching file a sibling named ``prefix + <original name>`` is
    written in the same folder via ``process_text_file``.

    Args:
        directory: Root folder to walk.
        prefix: Text prepended to the original file name to form the output
            name. Files whose name already starts with it are skipped.

    Returns:
        None.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".txt"):  # Only text files are cleaned
                continue
            if prefix and file.startswith(prefix):
                # Output of an earlier run: cleaning it again would pile up
                # "cleaned_cleaned_..." copies every time the cell is re-run.
                continue
            input_file = os.path.join(root, file)
            output_file = os.path.join(root, prefix + file)  # New filename for the cleaned text
            process_text_file(input_file, output_file)

# Example usage: walk the folder below and write a "cleaned_" copy next to
# every .txt file found in it or in any of its subfolders.
# The path is the download folder given to Downloader in the download cell plus
# a "sec-edgar-filings" subfolder -- presumably where the downloader stores its
# output (TODO confirm); keep the two in sync if the download location changes.
directory = 'C:/Users/shivp/OneDrive/Desktop/GTech2/final_extraction/sec-edgar-filings/'  # Directory containing the files
process_files_in_directory(directory)
