### AsyncIO Import

In [1]:
import nest_asyncio

nest_asyncio.apply()

### Load the OpenAI API Key

In [2]:
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

### Now Let us Load the Data

In [None]:
!mkdir data
!curl https://www.tripwire.com/state-of-security/digital-hygiene-healthcare-where-cybersecurity-matter-life-and-death -o data/cybersecurity_1.html

In [None]:
# Save the second Tripwire article (IoT security) as data/cybersecurity_2.html.
!curl https://www.tripwire.com/state-of-security/invisible-shield-exploring-silent-guardians-iot-security -o data/cybersecurity_2.html

In [4]:
# Save the third Tripwire article (healthcare sector threats) as data/cybersecurity_3.html.
!curl https://www.tripwire.com/state-of-security/guarding-health-errol-weiss-protecting-healthcare-sector-cyber-threats -o data/cybersecurity_3.html

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 70118    0 70118    0     0   920k      0 --:--:-- --:--:-- --:--:--  925k


In [5]:
# Save the fourth Tripwire article (security operations centers) as data/cybersecurity_4.html.
!curl https://www.tripwire.com/state-of-security/what-is-a-security-operations-center-soc -o data/cybersecurity_4.html

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 65989    0 65989    0     0   722k      0 --:--:-- --:--:-- --:--:--  724k


In [6]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

# Load every *.html file under data/ into `docs`, using BSHTMLLoader for each file.
path = "data/"
text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)
docs = text_loader.load()

Next, we chunk the documents.

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for chunks of up to 750 units with a 20-unit overlap;
# length is measured with the built-in len, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

Create a Training Set

In [8]:
# Split the loaded pages into chunks; every later split (train/val/test) is carved from this list.
training_documents = text_splitter.split_documents(docs)

In [9]:
# Number of chunks produced by the splitter.
len(training_documents)

71

In [10]:
# Inspect the first chunk (metadata and page_content).
training_documents[0]

Document(metadata={'source': 'data/cybersecurity_1.html', 'title': 'Digital Hygiene in Healthcare: Where Cybersecurity Is a Matter of Life and Death | Tripwire'}, page_content='Digital Hygiene in Healthcare: Where Cybersecurity Is a Matter of Life and Death | Tripwire\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    Skip to main content\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nEN\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nEN\n\n\n\n\nSecondary Navigation\n\n\nCustomer Portal\n\n\nPartner Portal\n\n\nGET A DEMO\n\n\n\n\n\n\n\n\n\n\n\n\n    Products\n    Toggle Dropdown\n\n\n\nTripwire Enterprise\n\n\nTripwire ExpertOps\n\n\nTripwire IP360\n\n\nTripwire LogCenter\n\n\nView all products\n\n\n\n\n\n\n\n    Solutions\n    Toggle Dropdown\n\n\n\nSecurity Configuration Management\n\n\nFile Integrity and Change Monitoring\n\n\nVulnerability Management\n\n\nCloud\n\n\nCompliance\n\n\nIndustries\n\n\nView all solutions\n\n\n\n\n\nServices\n\n\n\n\n    Resources\n    Toggle Dropdown\n\n

Next, we associate each of our chunks with a unique identifier.

In [11]:
import uuid

# Give every chunk a unique string id in metadata["id"].
# id_set tracks the ids already handed out so a (very unlikely) collision is retried.
id_set = set()

for document in training_documents:
  # `doc_id` rather than `id`, so the built-in id() is not shadowed.
  doc_id = str(uuid.uuid4())
  while doc_id in id_set:
    # The retry must also yield a str: ids are later used as JSON keys and values.
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

Next, we'll use naive Python slicing to create training, validation, and test sets to prepare our data for the next step.

In [12]:
# Boundaries for consecutive 80% / 10% / 10% slices (truncated to whole chunks)
total_length = len(training_documents)
train_end, val_end = (int(total_length * fraction) for fraction in (0.8, 0.9))

# Slice in order: training first, then validation, then whatever remains as test
training_split_documents = training_documents[:train_end]
val_split_documents = training_documents[train_end:val_end]
test_split_documents = training_documents[val_end:]

# Report each split's size and its share of all chunks
for label, split in (
    ("Training", training_split_documents),
    ("Validation", val_split_documents),
    ("Test", test_split_documents),
):
    print(f"{label}: {len(split)} documents ({len(split)/total_length:.1%})")

Training: 56 documents (78.9%)
Validation: 7 documents (9.9%)
Test: 8 documents (11.3%)


### Construct a Fine Tuning Dataset

In [13]:
from langchain_openai import ChatOpenAI

# Chat model used to generate questions from each chunk; temperature is set to 0.
qa_chat_model = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0
)

Next, we create a question-generation prompt.

In [14]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {n_questions} (how many questions to ask for)
# and {context} (the chunk text). It requests a numbered list, one question per line.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be generated in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

Let us test this with an LCEL Chain

In [15]:
# Pipe the prompt template into the chat model; invoked with {"context", "n_questions"}.
question_generation_chain = qa_prompt_template | qa_chat_model

In [16]:
import tqdm
import asyncio

"""
Sample Usage of TQDM:

for i in tqdm.tqdm(range(10)):
  time.sleep(1)
"""
async def process_document(document, n_questions):
    """Generate questions for one chunk and link each question to that chunk.

    The prompt asks the model for a numbered list ("1. ...", "2. ..."), so
    only lines that start with such a number are treated as questions. Blank
    lines and any unnumbered preamble are skipped, and the question text is
    kept whole even when it contains a colon.

    Args:
        document: Chunk exposing ``page_content`` and ``metadata["id"]``.
        n_questions: Number of questions requested from the model.

    Returns:
        A ``(doc_questions, doc_documents)`` tuple. ``doc_questions`` maps a
        fresh question id to the question text; ``doc_documents`` maps the
        same id to a one-element list holding the chunk id.
    """
    import re  # local import keeps this cell self-contained

    generated_questions = await question_generation_chain.ainvoke(
        {"context": document.page_content, "n_questions": n_questions}
    )

    doc_questions = {}
    doc_documents = {}

    for line in generated_questions.content.split("\n"):
        # Match "<number>." or "<number>)" followed by non-empty question text.
        numbered = re.match(r"\s*\d+\s*[.)]\s*(\S.*)", line)
        if numbered is None:
            continue
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = numbered.group(1).strip()
        doc_documents[question_id] = [document.metadata["id"]]

    return doc_questions, doc_documents

async def create_questions(documents, n_questions):
    """Generate questions for every chunk and merge the per-chunk results.

    One ``process_document`` coroutine is created per chunk; results are
    merged as each finishes, with a tqdm progress bar tracking completion.

    Args:
        documents: Chunks to generate questions for.
        n_questions: Number of questions requested per chunk.

    Returns:
        A ``(questions, relevant_docs)`` tuple of merged dictionaries, both
        keyed by question id.
    """
    pending = [process_document(doc, n_questions) for doc in documents]

    questions, relevant_docs = {}, {}

    progress = tqdm.tqdm(
        asyncio.as_completed(pending),
        total=len(documents),
        desc="Processing Documents and Generating Questions",
    )
    for finished in progress:
        new_questions, new_mapping = await finished
        questions.update(new_questions)
        relevant_docs.update(new_mapping)

    return questions, relevant_docs

In [17]:
# Generate 2 questions per training chunk, plus the question-id -> chunk-id mapping.
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing Documents and Generating Questions: 100%|██████████| 56/56 [00:03<00:00, 18.02it/s]


In [18]:
# Generate 2 questions per validation chunk, plus the question-id -> chunk-id mapping.
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing Documents and Generating Questions: 100%|██████████| 7/7 [00:02<00:00,  2.74it/s]


In [19]:
# Generate 2 questions per test chunk, plus the question-id -> chunk-id mapping.
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing Documents and Generating Questions: 100%|██████████| 8/8 [00:01<00:00,  4.49it/s]


### Reformat and Save the Dataset as JSON

In [20]:
import json

# Corpus lookup for the training split: chunk id -> chunk text
training_corpus = {}
for train_item in training_split_documents:
    training_corpus[train_item.metadata["id"]] = train_item.page_content

# Bundle questions, question -> chunk links, and the corpus into one record
train_dataset = dict(
    questions=training_questions,
    relevant_contexts=training_relevant_contexts,
    corpus=training_corpus,
)

# Persist the record as a single JSON document
with open("training_dataset.jsonl", "w") as f:
  f.write(json.dumps(train_dataset))

In [21]:
# Corpus lookup for the validation split: chunk id -> chunk text
val_corpus = {}
for val_item in val_split_documents:
    val_corpus[val_item.metadata["id"]] = val_item.page_content

# Bundle questions, question -> chunk links, and the corpus into one record
val_dataset = dict(
    questions=val_questions,
    relevant_contexts=val_relevant_contexts,
    corpus=val_corpus,
)

# Persist the record as a single JSON document
with open("val_dataset.jsonl", "w") as f:
  f.write(json.dumps(val_dataset))

In [22]:
# Corpus lookup for the test split: chunk id -> chunk text.
# Named test_corpus (not train_corpus) so it cannot be mistaken for the training corpus.
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

# Bundle questions, question -> chunk links, and the corpus into one record.
test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

# Persist the record as a single JSON document.
with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

### Login to Hugging Face before Fine Tuning the Model.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

### Fine-Tuning

In [23]:
from sentence_transformers import SentenceTransformer

# Base embedding model to fine-tune, loaded by its model id.
model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [24]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

BATCH_SIZE = 10

# Training-split views: chunk id -> text, question id -> question,
# question id -> list of relevant chunk ids.
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# Move data into expected format for training: one (question, chunk text) pair per question.
examples = []
for query_id, query in queries.items():
    # Each question is linked to exactly one chunk, stored as a one-element list.
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)


# Create a PyTorch DataLoader.
# shuffle=True: questions generated from the same chunk are adjacent in `queries`,
# so an unshuffled loader puts them in the same batch every epoch. A loss that uses
# other in-batch passages as negatives would then see the duplicate chunk as a negative.
loader = DataLoader(
    examples, batch_size=BATCH_SIZE, shuffle=True)

### Loss Function

In [26]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes passed to MatryoshkaLoss as matryoshka_dims.
# NOTE(review): the largest size listed is 768 — confirm this is intended
# relative to the base model's native embedding dimension.
matryoshka_dimensions = [768, 512, 256, 128, 64]
# MultipleNegativesRankingLoss is the inner loss that MatryoshkaLoss wraps.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

### Evaluator

In [27]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Rebind corpus / queries / relevant_docs to the validation split.
# These names held the training split when the training examples were built.
corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

# Retrieval evaluator built from the validation questions, chunks, and question -> chunk links.
evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [28]:
# Number of training epochs; also used to size the warmup schedule.
EPOCHS = 15

#### Login to Weights and Biases for the Training plots

In [30]:
import wandb
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/pratikmurali/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpratik79[0m ([33mpratik79-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Fine-Tune the Model!

In [2]:
%pip install 'accelerate>=0.26.0'

/Users/pratikmurali/code/aiml-workspaces/AIE6/Midterm_Challenge/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Warm up over the first 10% of all optimisation steps (batches per epoch x epochs)
total_steps = len(loader) * EPOCHS
warmup_steps = int(total_steps * 0.1)

# Fine-tune, evaluating every 50 steps and saving to finetuned_arctic_ft
model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=50,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
)

NameError: name 'loader' is not defined