In [1]:
"""
Task #2 – Gen AI
Dataset
( Year: 2018-2020
( Filing type: 10K
( Sections: All
( Company: Choose 1.
( Choose 5 data attributes to extract from a single year.
Steps
( Convert documents to chunks
( Convert chunks to embeddings
( Create a query
( Create a prompt to extract data from chunks from a specific year.
( Create a validation dataset (5 true values from chunks).
( Demonstrate that your LLM can retrieve the correct chunks from your
embedding object for the correct year
"""


'\nTask #2 – Gen AI\nDataset\n( Year: 2018-2020\n( Filing type: 10K\n( Sections: All\n( Company: Choose 1.\n( Choose 5 data attributes to extract from a single year.\nSteps\n( Convert documents to chunks\n( Covert chunks to embeddings\n( Create a query\n( Create a prompt to extract data from chunks from a specific year.\n( Create a validation dataset (5 true values from chunks).\n( Demonstrate that your LLM can retrieve the correct chunks from your\nembedding object for the correct year\n'

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import datasets

# Fetch the train split of each yearly EDGAR-CORPUS configuration (2018-2020).
# trust_remote_code=True allows the dataset repository's own loading script to run.
dataset_2018, dataset_2019, dataset_2020 = (
    datasets.load_dataset(
        "eloukas/edgar-corpus",
        f"year_{filing_year}",
        split="train",
        trust_remote_code=True,
    )
    for filing_year in (2018, 2019, 2020)
)

# Stack the three yearly datasets into a single dataset
dataset = datasets.concatenate_datasets([dataset_2018, dataset_2019, dataset_2020])

# Inspect the first record to see which fields are available
print(dataset[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.




In [2]:
# Build (or reuse) the Spark session with explicit memory and shuffle settings
spark_settings = {
    "spark.driver.memory": "6g",
    "spark.executor.memory": "5g",
    "spark.executor.memoryOverhead": "1g",
    "spark.sql.shuffle.partitions": "50",
    "spark.driver.maxResultSize": "2g",
}

session_builder = SparkSession.builder.appName("pysparksession")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


# PySpark schema: three identifier columns followed by one column per filing
# section. Every column is a nullable string.
_identifier_columns = ["filename", "cik", "year"]
_section_suffixes = [
    "1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
    "8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15",
]
_section_columns = [f"section_{suffix}" for suffix in _section_suffixes]

schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in _identifier_columns + _section_columns
])

# Function to load dataset in batches and merge them into one DataFrame
def load_dataset_in_batches(dataset, batch_size=1000):
    """Convert a dataset into a single PySpark DataFrame, one batch at a time.

    Each batch of rows is selected from ``dataset``, turned into a list of
    dicts, converted to a DataFrame with the module-level ``schema`` through
    the module-level ``spark`` session, and unioned onto the running result.

    Args:
        dataset: Object supporting ``len()`` and ``.select(range)`` whose
            records can be passed to ``dict()``.
        batch_size: Number of rows converted per batch. Must be positive.

    Returns:
        A DataFrame holding every row of ``dataset``. An empty dataset yields
        an empty DataFrame with ``schema`` rather than ``None``, so callers
        can always chain DataFrame methods on the result.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step makes range() empty, which previously returned None silently.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(dataset)
    if total_rows == 0:
        # Nothing to convert: still return a DataFrame so .show()/.filter() keep working.
        return spark.createDataFrame([], schema=schema)

    df_final = None
    for start in range(0, total_rows, batch_size):
        stop = min(start + batch_size, total_rows)
        batch = dataset.select(range(start, stop))
        batch_records = [dict(record) for record in batch]  # list of plain dicts
        batch_df = spark.createDataFrame(batch_records, schema=schema)

        # First batch seeds the result; later batches are appended by union
        df_final = batch_df if df_final is None else df_final.union(batch_df)

    return df_final

# Convert the merged dataset to a PySpark DataFrame, 1000 rows per batch
df_final = load_dataset_in_batches(dataset, batch_size=1000)

# Preview the first five rows
df_final.show(n=5)


+----------------+-------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        filename|    cik|year|           section_1|          section_1A|          section_1B|           section_2|           section_3|           section_4|           section_5|           section_6|           section_7|          section_7A|           section_8|           section_9|          section_9A|          section_9B|          section_10|          section_11|          section_12|          section_13|          section_14|          section_15|
+----------------+-------+----+--------------------+--------------------+--------------------+--

In [11]:
# Pick one company (by CIK), one filing year, and the five section columns to embed
company_cik = '1566373'
year = "2018"
columns_to_extract = ["section_1", "section_1A", "section_1B", "section_2", "section_3"]

# Narrow the corpus step by step: company -> year -> chosen section columns
df_company = df_final.where(df_final["cik"] == company_cik)
df_year = df_company.where(df_company["year"] == year)
df_selected = df_year.select(*columns_to_extract)


In [12]:
# Sanity check: confirm the company/year filter actually matched some rows
matching_rows = df_year.count()
print(f"Rows matching cik={company_cik} and year={year}: {matching_rows}")


Rows matching cik=1566373 and year=2018: 1


In [13]:
# Preview up to five rows of the selected section columns
df_selected.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           section_1|          section_1A|          section_1B|           section_2|           section_3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Item 1. Business\...|Item 1A. Risk Fac...|Item 1B. Unresolv...|Item 2. Propertie...|Item 3. Legal Pro...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [15]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def chunk_text(text, chunk_size=512):
    """Split text into chunks of whole sentences of at most ``chunk_size`` words.

    Sentences come from ``sent_tokenize`` and words are counted by whitespace
    splitting. Sentences are never split, so a single sentence longer than
    ``chunk_size`` words becomes a chunk of its own that exceeds the budget.

    Args:
        text: Text to split. ``None`` or an empty string yields no chunks.
        chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings, each made of consecutive sentences joined by
        a single space. No chunk is ever the empty string.
    """
    # Missing or empty text has nothing to tokenize.
    if not text:
        return []

    chunks, current_chunk, current_length = [], [], 0

    for sentence in sent_tokenize(text):
        sentence_length = len(sentence.split())
        # Flush only a non-empty buffer: without this guard, an oversized
        # first sentence would emit an empty-string chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Chunk every selected section of every matching row into one flat list.
# collect() brings the rows back as a local list.
text_chunks = [
    piece
    for filing_row in df_selected.collect()
    for section_name in columns_to_extract
    for piece in chunk_text(filing_row[section_name])
]

print(f"Total Chunks: {len(text_chunks)}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Total Chunks: 110


In [16]:
#Convert chunks to embeddings
from sentence_transformers import SentenceTransformer
import numpy as np

# Pre-trained sentence-embedding model, loaded by name.
# NOTE(review): the all-MiniLM-L6-v2 model card reports that input beyond 256
# word pieces is truncated, while chunk_text defaults to 512-word chunks —
# confirm the chunk size so that whole chunks are embedded.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# Embedding helper built on the module-level model
def create_embeddings_with_sentence_transformer(chunks):
    """Encode ``chunks`` with the module-level ``model``.

    Args:
        chunks: The texts to embed, passed straight to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_numpy=True``.
    """
    return model.encode(chunks, convert_to_numpy=True)

# Embed every text chunk
embeddings = create_embeddings_with_sentence_transformer(text_chunks)

# Report how many embeddings were produced and the shape of the result
embedding_count = len(embeddings)
print(f"Generated {embedding_count} embeddings.")
embedding_shape = embeddings.shape
print(f"Embedding shape: {embedding_shape}")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated 110 embeddings.
Embedding shape: (110, 384)


In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Free-text question to look up among the embedded chunks
query = "What information is available about section 3 in 2018?"

# Embed the query with the same model that embedded the chunks
query_embedding = model.encode([query], convert_to_numpy=True)

# Cosine similarity between the query and every chunk embedding
similarities = cosine_similarity(query_embedding, embeddings)

# Indices of the top_n highest-scoring chunks, best first
top_n = 5
ascending_order = similarities[0].argsort()
top_n_indices = ascending_order[-top_n:][::-1]

# Look up the text of those chunks
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for rank, passage in enumerate(relevant_chunks, start=1):
    print(f"Chunk {rank}: {passage}")


Top 5 Relevant Chunks for the Query: 'What information is available about section 3 in 2018?'
Chunk 1: Item 1B. Unresolved Staff Comments. Not applicable. Item 2.
Chunk 2: Item 3. Legal Proceedings. From time to time, we may become involved in legal proceedings arising in the ordinary course of our business. We are not presently a party to any material litigation. Item 4.
Chunk 3: These changes include the Budget Control Act of 2011, which, among other things, led to aggregate reductions to Medicare payments to providers of up to 2% per fiscal year that started in 2013 and, due to subsequent legislation, will continue until 2025, and the American Taxpayer Relief Act of 2012, which, among other things, reduced Medicare payments to several providers and increased the statute of limitations period for the government to recover overpayments to providers from three to five years. Some of the provisions of the PPACA have yet to be implemented, and there have been legal and political challeng

In [43]:
# Validation query 1: a phrase taken from chunks that were retrieved manually
query = "compatibility of inarigivir with Viread"
query_embedding = model.encode([query], convert_to_numpy=True)
similarities = cosine_similarity(query_embedding, embeddings)

# Keep only the single best-scoring chunk
top_n = 1
top_n_indices = similarities[0].argsort()[-top_n:][::-1]
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for position, index in enumerate(top_n_indices):
    print(f"Chunk Number: {index} - {relevant_chunks[position]}")


Top 1 Relevant Chunks for the Query: 'compatibility of inarigivir with Viread'
Chunk Number: 3 - We have conducted early development work that indicates compatibility of inarigivir with Viread in the same formulation. We have conducted additional formulation development work for SB 9225 and successfully manufactured initial quantities of SB 9225. The introduction of SB 9225, if approved, could potentially result in enhanced patient compliance and allow for a more favorable safety profile. Subject to the results of our Phase 2 inarigivir trials, we could be in a position to initiate a Phase 3 program for SB 9225 in the United States, Europe and Asia in 2020. Inarigivir Strategic Collaborations. The treatment of chronic HBV is complex and heterogenous, and we believe that any curative treatment will require a combinatorial approach. Although our initial approach has been to evaluate inarigivir in combination with NUCs (see “Gilead Collaboration” above), we have explored pre-clinical stud

In [44]:
# Validation query 2: the two mechanisms by which inarigivir inhibits viral replication
query = "which chunk describes about inhibit viral replication through two mechanisms:(i) a direct acting anti-viral (DAA) mechanism where inarigivir acts a non-nucleotide reverse transcriptase inhibitor (NNRTI)"
query_embedding = model.encode([query], convert_to_numpy=True)
similarities = cosine_similarity(query_embedding, embeddings)

# Keep only the single best-scoring chunk
top_n = 1
top_n_indices = similarities[0].argsort()[-top_n:][::-1]
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for position, index in enumerate(top_n_indices):
    print(f"Chunk Number: {index} - {relevant_chunks[position]}")


Top 1 Relevant Chunks for the Query: 'which chunk describes about inhibit viral replication through two mechanisms:(i) a direct acting anti-viral (DAA) mechanism where inarigivir acts a non-nucleotide reverse transcriptase inhibitor (NNRTI)'
Chunk Number: 5 - Inarigivir and our other novel compounds are designed to inhibit viral replication through two mechanisms:(i) a direct acting anti-viral (DAA) mechanism where inarigivir acts a non-nucleotide reverse transcriptase inhibitor (NNRTI) to inhibit the interaction of pre-genomic RNA, or pgRNA, and HBV DNA polymerase within the replication complex; and (ii) binding to RIG-I with activation within the hepatocyte of the innate immune response through production of natural immunomodulatory cytokines, including type I and II interferons. Viruses have evolved mechanisms to block the protective effects of interferon production. Our compounds are designed to restore interferon production in infected cells. In the case of chronic HBV, we believe

In [45]:
# Validation query 3: functional cure rate when combining PEG-IFN-a and Viread
query = "which chunk describes about a study functional cure rate, when combining PEG-IFN-a and Viread over a 48-week treatment period, is less than 10%"
query_embedding = model.encode([query], convert_to_numpy=True)
similarities = cosine_similarity(query_embedding, embeddings)

# Keep only the single best-scoring chunk
top_n = 1
top_n_indices = similarities[0].argsort()[-top_n:][::-1]
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for position, index in enumerate(top_n_indices):
    print(f"Chunk Number: {index} - {relevant_chunks[position]}")


Top 1 Relevant Chunks for the Query: 'which chunk describes about a study functional cure rate, when combining PEG-IFN-a and Viread over a 48-week treatment period, is less than 10%'
Chunk Number: 7 - These direct-acting oral antiviral agents are potent suppressors of HBV DNA, but generally only suppress the virus during treatment without providing significant loss or clearance of HBsAg, and patients taking these antiviral agents require potentially life-long treatment. Experts and regulators have suggested that a “functional cure,” characterized by sustained loss of HBsAg with or without hepatitis B surface antibody conversion, should be the goal of any new curative HBV therapy. Studies have indicated that the current functional cure rate, when combining PEG-IFN-a and Viread over a 48-week treatment period, is less than 10%. According to an article published in the Journal of Hepatology in August 2017, experts believe that for any future treatment to significantly increase the current

In [46]:
# Validation query 4: Vemlidy and Gilead patient cohorts
query = "which chunk describes about Vemlidy and Gilead as cohorts for patients"

query_embedding = model.encode([query], convert_to_numpy=True)
similarities = cosine_similarity(query_embedding, embeddings)

# Keep only the single best-scoring chunk
top_n = 1
top_n_indices = similarities[0].argsort()[-top_n:][::-1]
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for position, index in enumerate(top_n_indices):
    print(f"Chunk Number: {index} - {relevant_chunks[position]}")


Top 1 Relevant Chunks for the Query: 'which chunk describes about Vemlidy and Gilead as cohorts for patients'
Chunk Number: 10 - All patients in the first two cohorts will also receive Vemlidy as a monotherapy for 36 weeks. In a third cohort, Gilead is evaluating the administration of inarigivir 100mg in chronic HBV patients whose viral load is suppressed and who are already being treated with a NUC. CATALYST 1 and CATALYST 2 Trials. We plan to initiate in the first half of 2019 two major Phase 2 global trials examining the administration of inarigivir 400mg in different patient populations and under different dosing regimens. We anticipate that the first global trial, the CATALYST 1 trial, will be conducted in Asia and the U.S. and will evaluate in up to 60 treatment-naïve HBeAg positive and negative patients. Under this response-guided trial, patients will be randomly assigned to one of the following three arms: (i) 20 patients will receive inarigivir 400mg monotherapy daily for 12 w

In [47]:
# Validation query 5: protocols/amendments and the sequential phases of human clinical trials
# NOTE(review): the recorded output for this query is chunk 50, whose visible
# (truncated) text is about interim/"top-line" data rather than trial phases —
# verify that the intended chunk is actually being retrieved.
query = "which chunk describes about protocols and amendments and Human clinical trials are typically conducted in four sequential phases, which may overlap or be combined"
query_embedding = model.encode([query], convert_to_numpy=True)
similarities = cosine_similarity(query_embedding, embeddings)

# Keep only the single best-scoring chunk
top_n = 1
top_n_indices = similarities[0].argsort()[-top_n:][::-1]
relevant_chunks = [text_chunks[chunk_index] for chunk_index in top_n_indices]

print(f"Top {top_n} Relevant Chunks for the Query: '{query}'")
for position, index in enumerate(top_n_indices):
    print(f"Chunk Number: {index} - {relevant_chunks[position]}")


Top 1 Relevant Chunks for the Query: 'which chunk describes about protocols and amendments and Human clinical trials are typically conducted in four sequential phases, which may overlap or be combined'
Chunk Number: 50 - Interim, "top-line," and preliminary data from our clinical trials that we announce or publish from time to time may change as more patient data become available or as additional analyses are conducted, and the data are subject to audit and verification procedures that could result in material changes in the final data. From time to time, we may publicly disclose interim or “top-line” from our clinical studies, which are based on a preliminary analysis of then-available efficacy, tolerability, pharmacokinetic and safety data. The results and related findings and conclusions we may draw from this top-line data are subject to change following a more comprehensive review of the data related to the particular study or trial. Interim data from clinical trials that we may co