In [None]:
# Strategy 1: Prompt engineering

! pip install -U langchain-community

import os
import json
import time
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from langchain.chat_models import ChatOpenAI

# Download the NLTK "punkt" resource (presumably required by word_tokenize, imported above).
nltk.download("punkt")

# Set OpenAI API Key
# NOTE(review): "API_key" is a placeholder value; supply a real key before running
# and avoid committing it to version control.
os.environ["OPENAI_API_KEY"] = "API_key"

def _clean_text(value):
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def load_mixed_qa_dataset(filepath):
    """
    Loads a mixed QA dataset and flattens it into a list of QA samples.

    The JSON file is expected to hold a list of records in one of two shapes:
        - context-based: {"context": ..., "questions": [{"question": ..., "answer": {"text": ...}}, ...]}
        - open QA:       {"Question": ..., "Answer": ...}

    Missing or null fields are treated as empty text, and an "answer" given
    as a plain string is accepted as well as the {"text": ...} form.

    Args:
        filepath: Path to the JSON file.

    Returns:
        - A list of dicts with keys 'context', 'question' and 'answer'.
          'context' is the stripped context string for context-based records
          and None for open-QA records. Pairs whose question or answer is
          empty are skipped. An empty list is returned when the file cannot
          be read, parsed, or does not contain a list.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Deliberately best-effort: any load failure is reported and yields no samples.
        print(f"Error loading file {filepath}: {e}")
        return []

    if not isinstance(data, list):
        print(f"Error loading file {filepath}: expected a JSON list of records, got {type(data).__name__}")
        return []

    merged_qa_context = []
    for item in data:
        if not isinstance(item, dict):
            continue  # Ignore malformed records instead of crashing on .get

        if "questions" in item:
            context = _clean_text(item.get("context"))  # "" when missing or null
            for qa_pair in item["questions"] or []:
                if not isinstance(qa_pair, dict):
                    continue
                question = _clean_text(qa_pair.get("question"))
                answer = qa_pair.get("answer")
                # Keep only the answer text; tolerate a bare string answer.
                answer_text = _clean_text(answer.get("text") if isinstance(answer, dict) else answer)

                if question and answer_text:
                    merged_qa_context.append({
                        "context": context,
                        "question": question,
                        "answer": answer_text,
                    })
        else:
            question = _clean_text(item.get("Question"))
            answer_text = _clean_text(item.get("Answer"))

            if question and answer_text:
                merged_qa_context.append({
                    "context": None,  # No context available
                    "question": question,
                    "answer": answer_text,
                })

    print(f"Successfully processed {len(merged_qa_context)} samples from {filepath}")
    return merged_qa_context

# Placeholder path: replace "file_path" with the location of the mixed QA JSON file.
mixed_qa_path = "file_path"
# Flattened list of QA samples (an empty list if the file could not be loaded).
merged_qa_context = load_mixed_qa_dataset(mixed_qa_path)

# Prompt engineering format
def evaluate_gpt_on_dataset(dataset, save_path="file_path"):
    """
    Queries GPT-4o with a two-shot prompt for every sample in the dataset.

    Samples with a non-empty 'context' get the construction-specification
    (extractive) prompt; all others get the construction-safety (open QA)
    prompt. Samples whose ground-truth answer is empty are skipped.

    Args:
        dataset: Iterable of dicts with 'question', 'answer' and optionally
            'context'. 'answer' may be a string or a dict holding 'text'.
        save_path: Kept for interface compatibility.
            NOTE(review): nothing is written to this path by this function;
            confirm whether the results should be persisted here.

    Returns:
        A list with one dict per evaluated sample, holding 'question',
        'context', 'ground_truth' and 'generated_answer'.
    """
    gpt4_model = ChatOpenAI(model="gpt-4o", temperature=0.0)
    results = []

    for sample in dataset:
        question = sample["question"].strip()
        input_context = (sample.get("context") or "").strip()

        # Unwrap a {"text": ...} answer BEFORE stripping; calling .strip() on
        # the dict itself would raise AttributeError.
        ground_truth = sample["answer"]
        if isinstance(ground_truth, dict):
            ground_truth = ground_truth.get("text", "")
        ground_truth = (ground_truth or "").strip()

        if not ground_truth:
            continue

        is_context_based = bool(input_context)

        if is_context_based:
            query = (
                "You are an expert in answering questions from construction specifications.\n"
                "Below are some examples of QA pairs from construction specifications:\n"
                "Example 1:\n"
                "Context: The Contractor shall arrange surfacing operations so that the placing of materials will be accomplished during daylight hours. However, when necessary to complete the project within the time specified, or to avoid peak periods of public traffic, work may be undertaken during the hours of darkness, provided the Contractor furnishes and operates adequate lighting.\n"
                "Question: Who is responsible for arranging surfacing operations?\n"
                "Answer: The Contractor\n"
                "Example 2:\n"
                "Context: No specific unit of measurement shall apply to the lump sum item of shoring or extra excavation Class A. Shoring or extra excavation Class B will be measured by the square foot as follows: The area for payment will be one vertical plane measured along the centerline of the trench, including Structures. Measurement will be made from the existing ground line to the bottom of the excavation and for the length of the Work actually performed. If the Contract includes a pay item for grading to remove materials, the upper limit for measurement will be the neat lines of the grading section shown in the Plans. The bottom elevation for measurement will be the bottom of the excavation as shown in the Plans or as otherwise established by the Engineer.\n"
                "Question: Who establishes the bottom elevation for measurement in the excavation plans?\n"
                "Answer: the Engineer\n"
                "Now, answer the following question based on the given context and the answer should be word, phrase, and sentence from the input context. Just output the final answer directly.:\n"
                f"Context: {input_context}\n"
                f"Question: {question}\n"
                "Answer:"
            )
        else:
            query = (
                "You are an expert at answering construction safety guideline questions.\n"
                "Below are some examples of QA pairs for construction safety:\n"
                "Example 1:\n"
                "Question: What safety precautions should I take when transporting portable lighting fixtures?\n"
                "Answer: Portable lighting fixtures must be equipped with protective netting, insulated, and grounded.\n"
                "Example 2:\n"
                "Question: What factors should be considered when planning equipment use?\n"
                "Answer: Consider equipment characteristics, work content, usage methods, surrounding environment, and transport routes to ensure safe operations.\n"
                "Now, answer the following question concisely within 30 words. Just output the final answer directly.:\n"
                f"Question: {question}\n"
                "Just output the final answer directly. Answer:"
            )
        generated_answer = gpt4_model.predict(query).strip()

        # Keep every prediction next to its reference so callers can score it.
        results.append({
            "question": question,
            "context": input_context,
            "ground_truth": ground_truth,
            "generated_answer": generated_answer,
        })

    return results

In [None]:
# Strategy 2: Hybrid RAG

! pip install rank-bm25 chromadb transformers datasets torch evaluate sentence-transformers rouge-score nltk
! pip install -U langchain-community

import os
import json
import time
import numpy as np
import nltk
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from langchain.chat_models import ChatOpenAI

# Download the NLTK "punkt" resource (presumably required by word_tokenize, imported above).
nltk.download("punkt")

# Set OpenAI API Key
# NOTE(review): "API_key" is a placeholder value; supply a real key before running
# and avoid committing it to version control.
os.environ["OPENAI_API_KEY"] = "API_key"

# Chunk documents
import os
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk_documents(directory_path, chunk_size=512, overlap=50):
    """
    Reads every ``*.txt`` file in a directory and splits the contents into chunks.

    Args:
        directory_path (str): Directory that holds the text files.
        chunk_size (int): Forwarded as ``chunk_size`` to RecursiveCharacterTextSplitter.
        overlap (int): Forwarded as ``chunk_overlap`` to RecursiveCharacterTextSplitter.
            NOTE(review): no length_function is passed, so the splitter
            presumably measures both values in characters rather than
            tokens; confirm against the LangChain documentation.

    Returns:
        The chunk objects produced by ``split_documents`` (callers read their
        text through ``page_content``), not plain strings.
    """
    # Gather the raw documents from all matching files.
    documents = DirectoryLoader(
        directory_path,
        glob="*.txt",
        loader_cls=TextLoader,
    ).load()

    # Cut the documents into overlapping pieces.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    pieces = splitter.split_documents(documents)

    print(f"Loaded {len(documents)} documents, split into {len(pieces)} chunks")
    return pieces

# Placeholder: point this at the directory that holds the source *.txt documents.
directory_path = "file_path"
# Chunk with the default chunk_size/overlap of load_and_chunk_documents.
document_chunks = load_and_chunk_documents(directory_path)

# Indexing system
def create_hybrid_index(chunks):
    """
    Creates BM25 (sparse) and FAISS (dense) indexes over the same document chunks.

    Args:
        chunks: Non-empty sequence of chunk objects exposing ``page_content``.

    Returns:
        Tuple ``(bm25, tokenized_chunks, chunks, faiss_index, embedder)``:
        the BM25 index, the lower-cased token lists it was built from, the
        chunks passed in, the FAISS L2 index, and the sentence embedder used
        to fill it.

    Raises:
        ValueError: If ``chunks`` is empty (neither index can be built).
    """
    # Imported locally because the cell's import block does not bring these
    # two names into scope.
    import faiss
    from sentence_transformers import SentenceTransformer

    if not chunks:
        raise ValueError("create_hybrid_index requires at least one chunk")

    texts = [chunk.page_content for chunk in chunks]

    # Tokenize chunks for BM25
    tokenized_chunks = [word_tokenize(text.lower()) for text in texts]
    bm25 = BM25Okapi(tokenized_chunks)

    # Convert documents into dense embeddings for FAISS
    embedder = SentenceTransformer("all-mpnet-base-v2")
    doc_embeddings = embedder.encode(texts, convert_to_numpy=True)
    # FAISS expects a contiguous float32 matrix.
    doc_embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)

    # Build FAISS index
    dim = doc_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(doc_embeddings)

    print("BM25 & FAISS indexes created successfully!")
    return bm25, tokenized_chunks, chunks, faiss_index, embedder
# Build both indexes once at module level; document_chunks is rebound to the chunks returned by the function.
bm25, tokenized_chunks, document_chunks, faiss_index, embedder = create_hybrid_index(document_chunks)
# NOTE(review): the message says "tested", but no check of the indexes is run here.
print("BM25 & FAISS indexes successfully created and tested")

# Retrieval system
def retrieve_relevant_docs_hybrid(query, bm25, tokenized_chunks, original_chunks, faiss_index, embedder, top_k=3, bm25_weight=0.4, faiss_weight=0.6):
    """
    Retrieves relevant documents using a hybrid BM25 + FAISS approach.

    BM25 scores every chunk. FAISS contributes ``exp(-distance)`` for its
    nearest neighbours only, and every other chunk gets a dense score of 0.
    Both score vectors are min-max normalized and combined as
    ``bm25_weight * bm25 + faiss_weight * faiss``.

    Args:
        query: Query text.
        bm25: BM25 index exposing ``get_scores(tokens)``.
        tokenized_chunks: Unused here; kept for interface compatibility.
        original_chunks: Chunks in the same order as both indexes.
        faiss_index: Dense index exposing ``search(embeddings, k)``.
        embedder: Model exposing ``encode(list_of_text, convert_to_numpy=True)``.
        top_k: Maximum number of chunks to return.
        bm25_weight: Weight of the normalized BM25 score.
        faiss_weight: Weight of the normalized dense score.

    Returns:
        Up to ``top_k`` chunks ordered by descending hybrid score; an empty
        list when there are no chunks or ``top_k`` is not positive.
    """
    num_chunks = len(original_chunks)
    if num_chunks == 0 or top_k <= 0:
        return []
    # Never ask for more neighbours than exist; FAISS would pad the result.
    k = min(top_k, num_chunks)

    # BM25 Retrieval
    query_tokens = word_tokenize(query.lower())
    bm25_scores = np.array(bm25.get_scores(query_tokens))

    # FAISS Dense Retrieval
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    D, I = faiss_index.search(query_embedding, k)

    # Convert FAISS scores to full-size array
    faiss_scores = np.zeros(num_chunks)
    for idx, score in zip(I[0], np.exp(-D[0])):
        # FAISS marks missing results with -1; as a NumPy index that would
        # silently overwrite the LAST chunk's score.
        if 0 <= idx < num_chunks:
            faiss_scores[idx] = score

    # Normalize Scores (Avoid Division by Zero)
    if np.max(bm25_scores) > 0:
        bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores) + 1e-8)
    if np.max(faiss_scores) > 0:
        faiss_scores = (faiss_scores - np.min(faiss_scores)) / (np.max(faiss_scores) - np.min(faiss_scores) + 1e-8)

    # Compute Hybrid Score
    hybrid_scores = bm25_weight * bm25_scores + faiss_weight * faiss_scores

    # Get Top-K Documents
    top_indices = np.argsort(hybrid_scores)[::-1][:k]
    retrieved_docs = [original_chunks[i] for i in top_indices]

    print(f"Retrieved {len(retrieved_docs)} documents using Hybrid BM25+FAISS")
    for i, doc in enumerate(retrieved_docs):
        print(f"Document {i+1}:\n{doc.page_content[:300]}...\n")

    return retrieved_docs

# Generation mechanism
def generate_answer_with_hybrid(question, input_context="", use_context=False, bm25=None, tokenized_chunks=None, document_chunks=None, faiss_index=None, embedder=None):
    """
    Answers a question with GPT-4o, grounded on chunks fetched by hybrid retrieval.

    When ``use_context`` is true the retrieval query is the input context
    followed by the question, and the prompt asks for an answer taken from
    the input context. Otherwise only the question is used for retrieval and
    the prompt holds just the retrieved context and the question.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    if use_context:
        retrieval_query = f"{input_context} {question}"
    else:
        retrieval_query = question

    hits = retrieve_relevant_docs_hybrid(
        retrieval_query,
        bm25,
        tokenized_chunks,
        document_chunks,
        faiss_index,
        embedder,
        top_k=3,
    )
    if hits:
        retrieved_context = " ".join(hit.page_content for hit in hits)
    else:
        retrieved_context = "No relevant context found"

    # Assemble the prompt piece by piece, then join once.
    prompt_parts = []
    if use_context:
        prompt_parts.append(f"Input Context: {input_context}\n")
        if retrieved_context:
            prompt_parts.append(f"Retrieved Context: {retrieved_context}\n")
        prompt_parts.append(
            f"Question: {question}\nThe answer should be words, phrases, or sentences from the input context. Answer:"
        )
    else:
        prompt_parts.append(f"Retrieved Context: {retrieved_context}\n")
        prompt_parts.append(f"Question: {question}\nAnswer:")
    query = "".join(prompt_parts)

    print("\n Final Query to GPT-4o:")
    print(query)

    llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
    return llm.predict(query).strip()

In [None]:
# Strategy 3: Task-specific fine-tuning

# Notebook magics: start from /content (the path suggests Google Colab; confirm),
# delete any previous checkout, shallow-clone LLaMA-Factory and install it in
# editable mode with the torch and bitsandbytes extras.
%cd /content/
%rm -rf LLaMA-Factory
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install -e .[torch,bitsandbytes]

import json

%cd /content/LLaMA-Factory/

# Values substituted for the {{name}} and {{author}} placeholders in data/identity.json.
NAME = "Llama-3"
AUTHOR = "LLaMA Factory"

with open("data/identity.json", "r", encoding="utf-8") as f:
  dataset = json.load(f)

# Fill both placeholders in the "output" field of every sample.
for sample in dataset:
  sample["output"] = sample["output"].replace("{{"+ "name" + "}}", NAME).replace("{{"+ "author" + "}}", AUTHOR)

# Overwrite the file in place with the patched samples.
with open("data/identity.json", "w", encoding="utf-8") as f:
  json.dump(dataset, f, indent=2, ensure_ascii=False)


import json

# Arguments for a LoRA supervised fine-tuning ("sft") run, later dumped to JSON.
# NOTE(review): every entry with nothing after "=" is an unfilled placeholder;
# this cell is not valid Python until values are supplied. "file_path" is a
# placeholder for the output directory as well.
args = dict(
  stage="sft",
  do_train=True,
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit",
  dataset="identity, instruction_data_finetuning_llama",
  template="llama3",
  finetuning_type="lora",
  lora_target="all",
  output_dir="file_path",
  per_device_train_batch_size=,
  gradient_accumulation_steps=,
  lr_scheduler_type=,
  logging_steps=,
  warmup_ratio=,
  save_steps=,
  learning_rate=,
  num_train_epochs=,
  max_samples=,
  max_grad_norm=,
  loraplus_lr_ratio=,
  fp16=True,
  report_to="none",
)

# Write the training arguments to disk; the context manager guarantees the
# file is flushed and closed even if serialization fails.
with open("train_dataset.json", "w", encoding="utf-8") as f:
    json.dump(args, f, indent=2)

In [None]:
# Strategy 4: Pretraining-and-fine-tuning

! pip install transformers datasets accelerate

# Pretraining on construction-specific corpus
# NOTE(review): "fine_path" looks like a placeholder (possibly a typo of
# "file_path"); replace it with the real corpus file paths.
file_paths = ["fine_path"]
corpus = []
# Read each file whole; one list entry per file.
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        corpus.append(file.read())
full_corpus = "\n".join(corpus)
# Preview the first 500 characters of the concatenated corpus.
print(f"Sample Text: {full_corpus[:500]}")

from datasets import Dataset
# NOTE(review): `dataset` is assigned again further down from corpus_dir
# (Dataset.from_list), which replaces this value.
dataset = Dataset.from_dict({"text": corpus})
import os
import random
from datasets import Dataset
from transformers import T5Tokenizer
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# NOTE(review): "t5-small/medium/large" appears to be a placeholder listing
# candidate sizes rather than a single model id; pick one before running.
tokenizer = T5Tokenizer.from_pretrained("t5-small/medium/large")

# Placeholder: directory containing the pretraining *.txt corpus.
corpus_dir = "file_path"

def load_text_files(directory):
    """
    Collect the non-empty contents of every ``.txt`` file directly inside a directory.

    Prints how many ``.txt`` files were found. Each file is read as UTF-8 and
    stripped; files that are empty after stripping are left out.

    Returns:
        A list of ``{"text": <stripped file contents>}`` dicts.
    """
    txt_paths = []
    for name in os.listdir(directory):
        if name.endswith(".txt"):
            txt_paths.append(os.path.join(directory, name))
    print(f"Found {len(txt_paths)} files for processing.")

    records = []
    for path in txt_paths:
        with open(path, "r", encoding="utf-8") as handle:
            content = handle.read().strip()
        if not content:
            continue  # Skip files with no usable text
        records.append({"text": content})

    return records

# One record per non-empty .txt file in corpus_dir.
text_data = load_text_files(corpus_dir)
# Rebinds `dataset` to a dataset built from those records.
dataset = Dataset.from_list(text_data)

def mask_spans(text, mask_ratio=0.15):
    """
    Build a T5-style denoising pair by masking random whitespace-separated tokens.

    About ``mask_ratio`` of the tokens (at least one) are picked at random.
    Each run of adjacent picked tokens is replaced in the input by a single
    sentinel ``<extra_id_N>``, numbered in order of appearance. The target
    lists every sentinel followed by the tokens it hides and ends with one
    closing sentinel, e.g.::

        input : "The <extra_id_0> walks in <extra_id_1> park"
        target: "<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"

    Args:
        text: Source text; it is split on whitespace.
        mask_ratio: Fraction of tokens to mask.

    Returns:
        Tuple ``(masked_input, masked_output)`` of strings. When every token
        would be masked (including empty or single-token text) the input is
        just ``<extra_id_0>`` and the target wraps the whole text.
    """
    tokens = text.split()
    num_tokens = len(tokens)
    num_masked = max(1, int(mask_ratio * num_tokens))

    if num_masked >= num_tokens:
        # Everything is hidden behind a single sentinel.
        return "<extra_id_0>", " ".join(["<extra_id_0>", *tokens, "<extra_id_1>"])

    chosen = np.random.choice(num_tokens, num_masked, replace=False)
    masked_positions = {int(position) for position in chosen}

    input_parts = []
    target_parts = []
    next_sentinel = 0
    inside_span = False
    # Walk left to right so sentinel numbers follow the order of appearance.
    for position, token in enumerate(tokens):
        if position in masked_positions:
            if not inside_span:
                sentinel = f"<extra_id_{next_sentinel}>"
                next_sentinel += 1
                input_parts.append(sentinel)
                target_parts.append(sentinel)
            target_parts.append(token)
            inside_span = True
        else:
            input_parts.append(token)
            inside_span = False

    # Closing sentinel marks the end of the target sequence.
    target_parts.append(f"<extra_id_{next_sentinel}>")

    return " ".join(input_parts), " ".join(target_parts)

def format_autoregressive(text):
    """Return an (input, target) pair in which both elements are the stripped text."""
    cleaned = text.strip()
    return cleaned, cleaned

def preprocess_data_batch(batch):
    """
    Turn a batch of raw texts into (input, target) training pairs.

    For each text a coin flip (``random.random() < 0.5``) picks between the
    masked pair from ``mask_spans`` and the identity pair from
    ``format_autoregressive``.

    Returns:
        Dict with parallel lists under 'input_text' and 'output_text'.
    """
    # The condition is evaluated first, then only the chosen formatter runs.
    pairs = [
        mask_spans(text) if random.random() < 0.5 else format_autoregressive(text)
        for text in batch["text"]
    ]

    return {
        "input_text": [source for source, _ in pairs],
        "output_text": [target for _, target in pairs],
    }

# Add 'input_text' / 'output_text' columns, processing batches in 4 worker processes.
dataset = dataset.map(preprocess_data_batch, batched=True, num_proc=4)

def tokenize_function(batch):
    """
    Tokenize a batch of (input_text, output_text) pairs for seq2seq training.

    Inputs and targets are both padded/truncated to 512 tokens. Padding
    positions in the labels are replaced with -100 so the loss ignores them
    instead of training the model to predict pad tokens.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels'.
    """
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(batch["output_text"], padding="max_length", truncation=True, max_length=512)

    pad_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4)  # Parallel tokenization

# Hold out 10% of the examples; the "test" part of the split is used as the validation set.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the model BEFORE building the collator: the collator receives it via
# ``model=`` and referencing it earlier raises NameError.
model = T5ForConditionalGeneration.from_pretrained("t5-small/medium/large")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Pretraining configuration.
# NOTE(review): every entry with nothing after "=" is an unfilled placeholder;
# this cell is not valid Python until values are supplied. "file_path" is a
# placeholder for the output and logging directories.
# NOTE(review): T5 is reported to be numerically unstable under fp16; confirm
# whether fp16=True is safe for the chosen checkpoint.
training_args = TrainingArguments(
    output_dir="file_path",
    per_device_train_batch_size=,
    num_train_epochs=,
    save_steps=,
    save_total_limit=,
    logging_dir="file_path",
    logging_steps=,
    learning_rate=,
    weight_decay=,
    warmup_steps=,
    fp16=True
)
# Train on the 90% split and evaluate on the held-out 10%.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Fine-tuning on construction QA dataset

import json
import torch
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer


# Load the QA training and testing sets from JSON files in the working directory.
with open('training_dataset.json', 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open('testing_dataset.json', 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

def convert_to_hf_dataset(data):
    """
    Convert mixed QA records into a Hugging Face Dataset of prompt/answer pairs.

    Two record shapes are handled:
        - context-based: {"context": ..., "questions": [{"question": ..., "answer": {"text": ...}}, ...]}
          -> prompt "question: <q> context: <context>"
        - open QA: {"Question": ..., "Answer": ...}
          -> prompt "question: <q>"

    Records of any other shape are ignored. Pairs with a missing or empty
    question or answer are skipped, matching load_mixed_qa_dataset, instead
    of raising KeyError or producing empty training targets.

    Args:
        data: Iterable of record dicts.

    Returns:
        Dataset with string columns 'question' (the prompt) and 'answer'.
    """
    questions = []
    answers = []

    for entry in data:
        if "context" in entry:
            context = entry["context"]
            for qa in entry.get("questions") or []:
                question = qa.get("question")
                answer = qa.get("answer")
                # Accept both {"text": ...} and a bare string answer.
                if isinstance(answer, dict):
                    answer = answer.get("text")
                if not question or not answer:
                    continue
                questions.append(f"question: {question} context: {context}")
                answers.append(answer)
        elif "Question" in entry and "Answer" in entry:
            question = entry["Question"]
            answer = entry["Answer"]
            if not question or not answer:
                continue
            questions.append(f"question: {question}")  # No context
            answers.append(answer)

    return Dataset.from_dict({"question": questions, "answer": answers})

# Rebinds train_dataset (previously the pretraining split) to the QA training set.
train_dataset = convert_to_hf_dataset(train_data)
test_dataset = convert_to_hf_dataset(test_data)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# Load the tokenizer from 'construction_specialized_pretrained_T5'
# (presumably where the pretrained checkpoint above was saved; confirm the path).
tokenizer = T5Tokenizer.from_pretrained('construction_specialized_pretrained_T5')

def preprocess_function(examples):
    """
    Tokenize a batch of QA prompts and answers for seq2seq fine-tuning.

    Prompts are padded/truncated to 512 tokens and answers to 128. Padding
    positions in the labels are replaced with -100 so the loss ignores them
    instead of training the model to predict pad tokens.

    Returns:
        The tokenized prompts with an added 'labels' entry.
    """
    inputs = examples["question"]
    targets = examples["answer"]

    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits and drop the raw text columns.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

model = T5ForConditionalGeneration.from_pretrained('construction_specialized_pretrained_T5')

# Fine-tuning configuration: evaluate and save once per epoch, then reload the
# checkpoint with the lowest eval_loss at the end of training.
# NOTE(review): every entry with nothing after "=" is an unfilled placeholder;
# this cell is not valid Python until values are supplied.
training_args = TrainingArguments(
    output_dir="output_path",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=,
    learning_rate=,
    per_device_train_batch_size=,
    per_device_eval_batch_size=,
    num_train_epochs=,
    weight_decay=,
    save_total_limit=,
    gradient_accumulation_steps=,
    eval_accumulation_steps=,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# NOTE(review): the "test" split is the eval_dataset, so it also drives
# best-checkpoint selection; confirm whether a separate validation split is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

# Metrics of the final (best) model on the "test" split.
results = trainer.evaluate()
