<a href="https://colab.research.google.com/github/sharmaanj200/FS-Innovation-Lab/blob/main/GIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process of Extracting the CIK values of a company

In [1]:
pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.1-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.1 sec-edgar-downloader-5.0.2


In [2]:
!pip install yfinance



In [3]:
from sec_edgar_downloader import Downloader
import yfinance as yf
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import pprint
import sys

In [4]:
# create request header
headers = {'User-Agent': "sharmaanj200@gmail.com"}

# get all companies data
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers
    )

In [5]:
# parse CIK // without leading zeros
directCik = companyTickers.json()['0']['cik_str']

# dictionary to dataframe
companyData = pd.DataFrame.from_dict(companyTickers.json(),
                                     orient='index')

# add leading zeros to CIK
companyData['cik_str'] = companyData['cik_str'].astype(
                           str).str.zfill(10)

# Taking in the company ticker as input and downloading the 10K filings from sec-edgar database

In [6]:
ticker = input('Enter the company ticker you want to analyze : ')

Enter the company ticker you want to analyze : NKE


In [7]:
ticker_cik = (companyData[companyData['ticker'] == ticker]['cik_str'].values)[0]

In [8]:
name = yf.Ticker(ticker)
company_name = name.info['longName']

#generated a dummy address for the downloader parameter using company's name
company_email_address = f"xyz@{company_name.replace(' ', '').replace(',', '').replace('.', '')}.com"

dl = Downloader(company_name, company_email_address)

dl.get("10-K", ticker, after="1995-01-01", before="2023-12-31")

27

# Parsing the text content from the HTML syntax based files (Cleaning)

In [9]:
base_dir = f'/content/sec-edgar-filings/{ticker}/10-K'

# checking with a sample example
year = "2001"
year = year[-2:]
for folder in os.listdir(base_dir):
    middle_part = folder.split('-')[1]
    if year == middle_part:
        print(f"Year {year} found in folder {folder}")

Year 01 found in folder 0001095811-01-503828


Parsing XML/XBRL

In [10]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''

    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [11]:
master_filings_dict = {}

for year in range(2001, 2024):
    year = str(year)
    year = year[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year == middle_part:
            master_filings_dict[folder] = {}
            master_filings_dict[folder]['sec_header_content'] = {}
            master_filings_dict[folder]['filing_documents'] = None

In [12]:
for year in range(2001, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short == middle_part:
            path = os.path.join(base_dir, folder)
            text_file_path = os.path.join(path, 'full-submission.txt')
            with open(text_file_path, 'r') as f:
                content = f.read()
            soup = BeautifulSoup(content, 'lxml')

            master_document_dict = {}

            for filing_document in soup.find_all('document'):

                document_id = filing_document.type.find(string=True, recursive=False).strip()
                document_sequence = filing_document.sequence.find(string=True, recursive=False).strip()
                document_filename = filing_document.filename.find(string=True, recursive=False).strip()

                if filing_document.description:
                    document_description = filing_document.description.find(string=True, recursive=False).strip()
                else:
                    document_description = ""

                master_document_dict[document_id] = {}
                master_document_dict[document_id]['document_sequence'] = document_sequence
                master_document_dict[document_id]['document_filename'] = document_filename
                master_document_dict[document_id]['document_description'] = document_description
                master_document_dict[document_id]['document_code'] = filing_document.extract()

                filing_doc_text = filing_document.find('text').extract()

                all_thematic_breaks = filing_doc_text.find_all('hr', {'width':'100%'})
                all_page_numbers = []
                for thematic_break in all_thematic_breaks:
                    prev_sibling = thematic_break.find_previous_sibling()
                    if prev_sibling:
                        page_number = prev_sibling.get_text(strip=True)
                        all_page_numbers.append(page_number)

                all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]

                filing_doc_string = str(filing_doc_text)

                if len(all_thematic_breaks) > 0:
                    regex_delimiter_pattern = '|'.join(map(re.escape, all_thematic_breaks))
                    split_filing_string = re.split(regex_delimiter_pattern, filing_doc_string)
                    master_document_dict[document_id]['pages_code'] = split_filing_string

                elif len(all_thematic_breaks) == 0:
                    split_filing_string = all_thematic_breaks
                    master_document_dict[document_id]['pages_code'] = [filing_doc_string]

                master_filings_dict[folder]['filing_documents'] = master_document_dict

In [13]:
import unicodedata

for year in range(2001, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short == middle_part:
            # path = os.path.join(base_dir, folder)
            # text_file_path = os.path.join(path, 'full-submission.txt')
            filing_documents = master_filings_dict[folder]['filing_documents']

            for document_id in filing_documents:

                document_pages = filing_documents[document_id]['pages_code']
                pages_length = len(filing_documents[document_id]['pages_code'])
                repaired_pages = {}
                normalized_text = {}

                for index, page in enumerate(document_pages):
                    page_soup = BeautifulSoup(page,'html5')
                    page_text = page_soup.html.body.get_text(' ',strip = True)
                    page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))
                    page_text_norm = page_text_norm.replace('  ', ' ').replace('\n',' ')
                    page_number = index + 1
                    normalized_text[page_number] = page_text_norm
                    repaired_pages[page_number] = page_soup

            filing_documents[document_id]['pages_normalized_text'] = normalized_text

            filing_documents[document_id]['pages_code'] = repaired_pages

            gen_page_numbers = list(repaired_pages.keys())

            filing_documents[document_id]['pages_numbers_generated'] = gen_page_numbers

In [14]:
import json

for year in range(2001, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short == middle_part:
            path = os.path.join(base_dir, folder)
            text_file_path = os.path.join(path, 'full-submission.txt')
            with open(text_file_path, 'w', encoding='utf-8', errors='ignore') as f:
                filing_documents = master_filings_dict[folder]['filing_documents']

                # Convert BeautifulSoup Tag objects to text
                filing_documents_text = {}
                for document_id, document_info in filing_documents.items():
                    document_text = str(document_info)
                    filing_documents_text[document_id] = document_text

                # Convert the dictionary to a JSON string
                json_string = json.dumps(filing_documents_text)
                f.write(json_string)

HTML parser

In [15]:
from bs4 import BeautifulSoup
import os

for year in range(1995, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short == middle_part:
            path = os.path.join(base_dir, folder)
            text_file_path = os.path.join(path, 'full-submission.txt')

            with open(text_file_path, 'r') as f:
                html_content = f.read()

            soup = BeautifulSoup(html_content, 'html.parser')

            plain_text = soup.get_text(separator=' ', strip=True)

            with open(text_file_path, 'w', encoding='utf-8', errors='ignore') as f:
                f.write(plain_text)

# Combining files of all years into a single text file for better analysis (Merging)

In [27]:
def merge_files(output_file):
    with open(output_file, 'w') as outfile:
        for year in range(1995, 2024):
            year_str = str(year)
            year_short = year_str[-2:]
            for folder in os.listdir(base_dir):
                middle_part = folder.split('-')[1]
                if year_short == middle_part:
                    path = os.path.join(base_dir, folder)
                    filename = os.path.join(path, 'full-submission.txt')
                    with open(filename, 'r') as infile:
                        outfile.write(f"=== Year {year} ===\n\n")
                        outfile.write(infile.read())
                        outfile.write("\n\n=== End of Year ===\n\n")

output_file = "merged_data.txt"

merge_files(output_file)

print("Data merged successfully!")

Data merged successfully!


# LLM Inference API from hugging face is used

In [1]:
!pip install transformers



In [1]:
from transformers import LongformerConfig, LongformerModel

In [4]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [6]:
with open('/content/merged_data.txt', 'r') as f:
    content = f.read()

In [None]:
summary = summarizer(content, max_length=500, min_length=100, do_sample=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (7145814 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
summary[0]['summary_text']