<a href="https://colab.research.google.com/github/sharmaanj200/FS-Innovation-Lab/blob/main/GIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process of Extracting the CIK values of a company

In [None]:
pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.1-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.1 sec-edgar-downloader-5.0.2


In [None]:
!pip install yfinance



In [None]:
from sec_edgar_downloader import Downloader
import yfinance as yf
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import pprint
import sys

In [None]:
# create request header
# (the address is sent as the User-Agent so the server can identify the client)
headers = {'User-Agent': "sharmaanj200@gmail.com"}

# get all companies data
companyTickers = requests.get(
    "https://www.sec.gov/files/company_tickers.json",
    headers=headers,
    timeout=30,  # do not let the cell hang forever on a stalled connection
    )

# Stop here on an HTTP error (e.g. 403/429) instead of failing later with a
# confusing JSON decoding error when the body is parsed.
companyTickers.raise_for_status()

In [None]:
# Decode the response body once and reuse the parsed mapping below.
tickers_payload = companyTickers.json()

# CIK of the first entry, as delivered (no leading zeros)
directCik = tickers_payload['0']['cik_str']

# One row per entry of the mapping, with cik_str rewritten as a
# zero-padded, 10-character string.
companyData = pd.DataFrame.from_dict(
    tickers_payload, orient='index'
).assign(
    cik_str=lambda frame: frame['cik_str'].astype(str).str.zfill(10)
)

# Taking the company ticker as input and downloading its 10-K filings from the SEC EDGAR database

In [None]:
# Normalise the entry: the lookup against companyData['ticker'] below is an
# exact string comparison, so stray whitespace or lower-case input would
# otherwise never match.
ticker = input('Enter the company ticker you want to analyze : ').strip().upper()

Enter the company ticker you want to analyze : TSLA


In [None]:
# Look up the zero-padded CIK for the requested ticker. The comparison is done
# on a trimmed, upper-cased copy so that input such as " tsla" still matches.
matching_ciks = companyData.loc[
    companyData['ticker'] == ticker.strip().upper(), 'cik_str'
].values

# An unknown ticker used to surface as a bare IndexError on `[0]`.
if len(matching_ciks) == 0:
    raise ValueError(
        f"Ticker {ticker!r} was not found in the SEC company ticker list"
    )

ticker_cik = matching_ciks[0]

In [None]:
# Resolve the company's long name through yfinance.
name = yf.Ticker(ticker)
company_name = name.info['longName']

# Build a dummy address for the Downloader from the company name with its
# spaces, commas and periods removed.
address_domain = company_name.translate(str.maketrans('', '', ' ,.'))
company_email_address = f"xyz@{address_domain}.com"

dl = Downloader(company_name, company_email_address)

# Kept as the last expression of the cell so its return value is displayed
# (presumably the number of filings downloaded - verify against the library).
dl.get("10-K", ticker, after="1995-01-01", before="2023-12-31")

8

# Parsing the text content out of the HTML-based filing files (Cleaning)

In [None]:
# Directory holding one sub-folder per downloaded 10-K of this ticker.
base_dir = f'/content/sec-edgar-filings/{ticker}/10-K'

# Sanity check with one sample year: the middle segment of each folder name
# (between the hyphens) is compared against the two-digit year.
year = "2001"[-2:]
for folder in os.listdir(base_dir):
    if folder.split('-')[1] == year:
        print(f"Year {year} found in folder {folder}")

Parsing XML/XBRL

In [None]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters (U+0080 to U+009F) in the Unicode
        string ``restore_string`` by the characters at the corresponding
        code points in Windows-1252, where possible.

        Code points that have no Windows-1252 character are removed.
        Returns the repaired string; all other characters are unchanged.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''

    # The C1 block ends at U+009F. Stopping at U+0099 left U+009A-U+009F
    # (for example the code points Windows-1252 uses for 'œ' and 'Ÿ')
    # unrepaired.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [None]:
# One entry per downloaded filing folder, keyed by folder name. Each entry
# starts with an empty header dict and no parsed documents yet.
master_filings_dict = {}

for full_year in range(1995, 2024):
    # Folder names carry a two-digit year as their middle segment.
    year = str(full_year)[-2:]

    for folder in os.listdir(base_dir):
        if folder.split('-')[1] == year:
            master_filings_dict[folder] = {
                'sec_header_content': {},
                'filing_documents': None,
            }

In [None]:
def _direct_text(tag):
    """Return the stripped text directly inside ``tag``, or '' if it is absent.

    Only the tag's own string is read (``recursive=False``), not the text of
    nested tags. A missing tag or a tag without a direct string yields ''.
    """
    if tag is None:
        return ""
    value = tag.find(string=True, recursive=False)
    return value.strip() if value else ""


# Parse every downloaded submission into its <document> parts and split the
# text of each part into pages at full-width <hr> breaks.
for year in range(1995, 2024):
    year_short = str(year)[-2:]

    for folder in os.listdir(base_dir):
        # Folder names carry the two-digit year as their middle segment.
        # Entries without that segment are skipped instead of raising
        # IndexError.
        name_parts = folder.split('-')
        if len(name_parts) < 2 or name_parts[1] != year_short:
            continue

        text_file_path = os.path.join(base_dir, folder, 'full-submission.txt')
        # Explicit encoding, matching how later cells write this file;
        # undecodable bytes are dropped rather than aborting the whole run.
        with open(text_file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        soup = BeautifulSoup(content, 'lxml')

        master_document_dict = {}

        for filing_document in soup.find_all('document'):

            # Header fields. Any of them may be missing from a document, so
            # all are read through the tolerant helper (previously only the
            # description was guarded).
            document_id = _direct_text(filing_document.type)
            document_info = {
                'document_sequence': _direct_text(filing_document.sequence),
                'document_filename': _direct_text(filing_document.filename),
                'document_description': _direct_text(filing_document.description),
            }
            # NOTE(review): entries are keyed by document type, so a later
            # document of the same type replaces an earlier one - confirm
            # this is intended.
            master_document_dict[document_id] = document_info

            # extract() detaches the node from the soup and returns it.
            document_info['document_code'] = filing_document.extract()

            text_tag = filing_document.find('text')
            if text_tag is None:
                # Nothing to paginate for a document without a <text> part.
                document_info['pages_code'] = []
                continue
            filing_doc_text = text_tag.extract()

            # Page boundaries are the full-width horizontal rules. Identical
            # markup is de-duplicated (order kept) so the split pattern does
            # not repeat the same alternative for every break.
            all_thematic_breaks = list(dict.fromkeys(
                str(thematic_break)
                for thematic_break in filing_doc_text.find_all('hr', {'width': '100%'})
            ))

            filing_doc_string = str(filing_doc_text)

            if all_thematic_breaks:
                regex_delimiter_pattern = '|'.join(map(re.escape, all_thematic_breaks))
                document_info['pages_code'] = re.split(regex_delimiter_pattern, filing_doc_string)
            else:
                # No breaks found: the whole text is a single page.
                document_info['pages_code'] = [filing_doc_string]

        # Stored once per filing, after all of its documents are parsed
        # (an empty dict when the submission contains no <document>).
        master_filings_dict[folder]['filing_documents'] = master_document_dict

In [None]:
import unicodedata

# Re-parse every page of every document, keep the repaired page markup and a
# whitespace-normalised plain-text version of it.
for year in range(1995, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short != middle_part:
            continue

        filing_documents = master_filings_dict[folder]['filing_documents']
        if not filing_documents:
            # Filing without parsed documents: nothing to normalise.
            continue

        for document_id, document_info in filing_documents.items():

            document_pages = document_info['pages_code']
            if isinstance(document_pages, dict):
                # Already converted by an earlier run of this cell; running
                # it again must not try to parse the page numbers.
                continue

            repaired_pages = {}
            normalized_text = {}

            # Pages are numbered from 1 in their original order.
            for page_number, page in enumerate(document_pages, start=1):
                page_soup = BeautifulSoup(page,'html5')
                page_text = page_soup.html.body.get_text(' ',strip = True)
                page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))
                page_text_norm = page_text_norm.replace('  ', ' ').replace('\n',' ')
                normalized_text[page_number] = page_text_norm
                repaired_pages[page_number] = page_soup

            # Stored inside the per-document loop. These assignments used to
            # sit one level out, so only the last document of each filing
            # received its results.
            document_info['pages_normalized_text'] = normalized_text
            document_info['pages_code'] = repaired_pages
            document_info['pages_numbers_generated'] = list(repaired_pages.keys())

In [None]:
import json

# Replace each filing's full-submission.txt with a JSON dump of its parsed
# documents (one string per document id).
for year in range(1995, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short != middle_part:
            continue

        filing_documents = master_filings_dict[folder]['filing_documents']

        # Convert each document's info dict (including any BeautifulSoup
        # objects it holds) to its string form.
        filing_documents_text = {
            document_id: str(document_info)
            for document_id, document_info in filing_documents.items()
        }

        # Convert the dictionary to a JSON string
        json_string = json.dumps(filing_documents_text)

        # The file is opened only once the payload exists: opening in 'w'
        # mode truncates it, so a failure above must not destroy the filing.
        text_file_path = os.path.join(base_dir, folder, 'full-submission.txt')
        with open(text_file_path, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(json_string)

HTML parser

In [None]:
from bs4 import BeautifulSoup
import os

# Strip the markup from every filing file and overwrite it with plain text.
for year in range(1995, 2024):
    year_str = str(year)
    year_short = year_str[-2:]

    for folder in os.listdir(base_dir):
        middle_part = folder.split('-')[1]
        if year_short != middle_part:
            continue

        text_file_path = os.path.join(base_dir, folder, 'full-submission.txt')

        # Read with the same encoding the file is written with below.
        with open(text_file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # CSS rules are not document text.
        for style_tag in soup.find_all('style'):
            style_tag.decompose()

        # Only top-level tags are visited: get_text() already includes the
        # text of all nested tags, so visiting every tag repeated each
        # passage once per enclosing element. The lines are joined once at
        # the end instead of growing a string inside the loop.
        text_lines = [
            tag.get_text(separator=' ', strip=True) + '\n'
            for tag in soup.find_all(recursive=False)
        ]
        extracted_text = ''.join(text_lines)

        with open(text_file_path, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(extracted_text)

# Combining files of all years into a single text file for better analysis (Merging)

In [None]:
def merge_files(output_file, filings_dir=None, start_year=1995, end_year=2023):
    """Concatenate the yearly filing texts into a single file.

    Every sub-folder of the filings directory whose name has a two-digit
    year as its middle hyphen-separated segment contributes its
    ``full-submission.txt``, wrapped in ``=== Year YYYY ===`` and
    ``=== End of Year ===`` markers. Filings are written in ascending year
    order; folders of the same year are written in sorted name order.

    Args:
        output_file: Path of the merged text file to create or overwrite.
        filings_dir: Directory containing one folder per filing. Defaults to
            the notebook-level ``base_dir``.
        start_year: First four-digit year to include.
        end_year: Last four-digit year to include (inclusive).

    Raises:
        OSError: If the directory, a filing's ``full-submission.txt`` or the
            output file cannot be opened.
    """
    source_dir = base_dir if filings_dir is None else filings_dir

    # Listed once and sorted so the merge order does not depend on the order
    # in which the file system happens to return entries.
    folders = sorted(os.listdir(source_dir))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for year in range(start_year, end_year + 1):
            year_short = str(year)[-2:]
            for folder in folders:
                # Entries without a middle segment (for example checkpoint
                # directories) are skipped instead of raising IndexError.
                name_parts = folder.split('-')
                if len(name_parts) < 2 or name_parts[1] != year_short:
                    continue

                filename = os.path.join(source_dir, folder, 'full-submission.txt')
                # Same encoding the cleaning cells use when writing the file.
                with open(filename, 'r', encoding='utf-8', errors='ignore') as infile:
                    filing_text = infile.read()

                outfile.write(f"=== Year {year} ===\n\n")
                outfile.write(filing_text)
                outfile.write("\n\n=== End of Year ===\n\n")

# Destination of the merged text, relative to the current working directory.
output_file = "merged_data.txt"

# Combine every cleaned filing into that single file, then confirm.
merge_files(output_file=output_file)
print("Data merged successfully!")

Data merged successfully!


# Summarizing with a Hugging Face Transformers model (facebook/bart-large-cnn) loaded locally

In [None]:
# Install the transformers package from Hugging Face:
!pip install transformers
!pip install torch
!pip install tensorflow

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [None]:
# Load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained(
    'facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained(
    'facebook/bart-large-cnn')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def summarize(text, maxSummarylength=10000):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: The text to summarize. Input beyond the model's position limit
            is truncated by the tokenizer.
        maxSummarylength: Requested upper bound on the summary length in
            tokens. Floats are accepted and truncated to an int. The value is
            capped at the model's position limit.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # The model has a fixed number of learned positions (1024 for
    # facebook/bart-large-cnn). Feeding or generating longer sequences than
    # that indexes past the position embeddings, so both lengths are capped.
    position_limit = getattr(model.config, "max_position_embeddings", 1024)
    summary_cap = max(1, min(int(maxSummarylength), position_limit))

    # Encode the text and summarize
    # NOTE(review): the "summarize: " prefix is kept from the original code;
    # confirm this checkpoint actually benefits from a task prefix.
    inputs = tokenizer.encode("summarize: " +
                              text,
                              return_tensors="pt",
                              max_length=position_limit, truncation=True)
    summary_ids = model.generate(inputs, max_length=summary_cap,
                                 # the minimum stays at one fifth of the maximum
                                 min_length=int(summary_cap / 5),
                                 length_penalty=10.0,
                                 num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
def split_text_into_pieces(text,
                           max_tokens=50000,
                           overlapPercent=10):
    """Split ``text`` into overlapping pieces of at most ``max_tokens`` tokens.

    Args:
        text: The text to split, tokenized with the notebook-level
            ``tokenizer``.
        max_tokens: Maximum number of tokens per piece.
        overlapPercent: Share of ``max_tokens`` (in percent) that each piece
            repeats from the end of the previous one.

    Returns:
        A list of decoded text pieces in document order (empty for text that
        yields no tokens).

    Raises:
        ValueError: If the two settings do not give a usable window step,
            i.e. ``max_tokens`` is not positive or the overlap is negative
            or covers the whole window.
    """
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Calculate the overlap in tokens
    overlap_tokens = int(max_tokens * overlapPercent / 100)

    # Distance between the starts of two consecutive pieces. A step of zero
    # made range() fail with an unrelated message, a negative step silently
    # produced no pieces at all, and a step larger than the window would
    # skip tokens.
    step = max_tokens - overlap_tokens
    if not 0 < step <= max_tokens:
        raise ValueError(
            "max_tokens must be positive and overlapPercent must be "
            "at least 0 and below 100"
        )

    # Split the tokens into chunks of size
    # max_tokens with overlap
    pieces = [tokens[start:start + max_tokens]
              for start in range(0, len(tokens), step)]

    # Convert the token pieces back into text
    text_pieces = [tokenizer.decode(
        tokenizer.convert_tokens_to_ids(piece),
        skip_special_tokens=True) for piece in pieces]

    return text_pieces

In [None]:
def recursive_summarize(text, max_length=20000, recursionLevel=0):
    """Summarize ``text`` of any length by summarizing it piece by piece.

    The text is split into roughly equal pieces, each piece is summarized,
    and the joined summaries are summarized again (recursively) until they
    fit into one piece.

    Args:
        text: The text to summarize.
        max_length: Requested maximum number of tokens per piece. It is
            capped below the tokenizer's ``model_max_length`` so that a piece
            still fits once ``summarize`` adds its prefix and special tokens.
        recursionLevel: Depth of the calling level; 0 for the initial call.

    Returns:
        The final summary string. Text that yields no tokens is returned
        unchanged.
    """
    import math  # local import: only this function needs it

    recursionLevel=recursionLevel+1
    print("######### Recursion level: ",
          recursionLevel,"\n\n######### ")
    tokens = tokenizer.tokenize(text)
    if not tokens:
        # Nothing to summarize (this used to end in a ZeroDivisionError).
        return text

    # Tokens kept free for what summarize() adds around each piece.
    reserved_tokens = 16
    model_limit = getattr(tokenizer, "model_max_length", max_length)
    piece_budget = max(1, min(int(max_length), model_limit - reserved_tokens))

    # Spread the tokens evenly over the smallest number of pieces that fit
    # the budget, using integer-safe ceilings instead of a float round trip.
    expectedCountOfChunks = math.ceil(len(tokens) / piece_budget)
    piece_size = math.ceil(len(tokens) / expectedCountOfChunks)

    # Break the text into pieces of piece_size tokens
    pieces = split_text_into_pieces(text, max_tokens=piece_size)

    # Summarize each piece down to at most two thirds of its size. The
    # length is passed as an int; a float used to be passed here.
    piece_summary_length = max(1, piece_size * 2 // 3)
    summaries=[]
    for piece in pieces:
        summary = summarize(piece, maxSummarylength=piece_summary_length)
        print(summary)
        summaries.append(summary)

    concatenated_summary = ' '.join(summaries)

    summary_tokens = tokenizer.tokenize(concatenated_summary)

    # Recurse only while the joined summaries are too long AND actually got
    # shorter; without the second check a summarizer that does not shrink
    # its input would recurse without end.
    if piece_budget < len(summary_tokens) < len(tokens):
        print("############# GOING RECURSIVE ##############")
        return recursive_summarize(concatenated_summary,
                                   max_length=piece_budget,
                                   recursionLevel=recursionLevel)

    # Several summaries are condensed once more into the final one; a single
    # summary is already final.
    final_summary=concatenated_summary
    if len(pieces)>1:
        final_summary = summarize(concatenated_summary,
                                  maxSummarylength=piece_budget)
    return final_summary

In [None]:
# Read back the file written by merge_files(output_file). The same path
# variable is used instead of a second, hard-coded absolute path that only
# matched while the working directory was /content.
with open(output_file, 'r') as f:
    content = f.read()

# Summarize the merged filings and show the result.
final_summary = recursive_summarize(content)
print("\n%%%%%%%%%%%%%%%%%%%%%\n")
print("Final summary:", final_summary)