In [1]:
# Installs
!pip install transformers datasets keras-nlp keras>=3 tensorflow-text huggingface-hub peft langchain_community chromadb sentence-transformers


In [2]:
# Libraries
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
from huggingface_hub import login
from google.colab import files

In [3]:
# Login to Hugging Face
login(token='sssshhh')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Set up Keras parameters recommended by Google
os.environ["KERAS_BACKEND"] = "jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

In [5]:
uploaded = files.upload()

Saving ESSA qna_csv.csv to ESSA qna_csv (1).csv


In [6]:
# Load your data into a DataFrame
df = pd.read_csv('ESSA qna_csv.csv')

In [7]:
# Split the dataset into training and validation sets
train_data, val_data = train_test_split(df.to_dict(orient='records'), test_size=0.2, random_state=42)


In [8]:
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data[idx]['Question']
        answer = self.data[idx]['Answer']
        input_text = f"Question: {question}\nAnswer: {answer}"
        inputs = self.tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        return inputs.input_ids.squeeze(0), inputs.input_ids.squeeze(0)

In [9]:
# Load model
def load_model_and_tokenizer(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation='eager')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [10]:
# Load model and tokenizer
model_name = 'google/gemma-2-2b'
model, tokenizer = load_model_and_tokenizer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Create DataLoaders
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [12]:
# Initialize prompt display
def display_chat(prompt, response):
    print("Prompt:")
    print(prompt)
    print("\nResponse:")
    print(response)

In [13]:
# Generate set-up for model response
def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    completion = model.generate(input_ids, max_new_tokens=100, temperature=0.7)
    response = tokenizer.decode(completion[0], skip_special_tokens=True)
    return response.replace(prompt, "")

In [None]:
# Initial Prompt Example
prompt = "What is ESSA?"
response = generate_response(prompt)
display_chat(prompt, response)



Prompt:
What is ESSA?

Response:


The Every Student Succeeds Act (ESSA) is a new federal law that replaces the No Child Left Behind Act (NCLB). ESSA is a reauthorization of the Elementary and Secondary Education Act (ESEA), which was first passed in 1965.

ESSA is a reauthorization of the Elementary and Secondary Education Act (ESEA), which was first passed in 1965.

ESSA is a reauthorization of the Elementary and Secondary Education Act (ESEA


In [14]:
#Training and measurements

import torch
from torch import nn

def train_model(model, train_dataloader, val_dataloader, optimizer, epochs=3):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            input_ids, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Calculate average training loss
        print(f"Epoch: {epoch + 1}, Training Loss: {total_loss / len(train_dataloader)}")

        # Validation step
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids, labels = batch
                outputs = model(input_ids, labels=labels)
                val_loss = outputs.loss
                total_val_loss += val_loss.item()

        # Calculate average validation loss
        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Epoch: {epoch + 1}, Validation Loss: {avg_val_loss}")


In [15]:
# Define optimizer (example)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# Train the model
train_model(model, train_dataloader, val_dataloader, optimizer, epochs=3)

Epoch: 1, Training Loss: 7.239478299021721
Epoch: 1, Validation Loss: 6.296952451978411
Epoch: 2, Training Loss: 5.123479753732681
Epoch: 2, Validation Loss: 5.451409226372128
Epoch: 3, Training Loss: 4.073324343562126
Epoch: 3, Validation Loss: 5.144006002516973


In [16]:
# Save model
def save_model_and_tokenizer(model, tokenizer, path='./models/gemma_train1'):
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

save_model_and_tokenizer(model, tokenizer)

In [19]:
# Initialize prompt display
def display_chat(prompt, response):
    print("Prompt:")
    print(prompt)
    print("\nResponse:")
    print(response)

# Generate set-up for model response
def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    completion = model.generate(
        input_ids,
        max_new_tokens=200,  # Increase if necessary
        temperature=0.3,     # Adjust to introduce variability
        top_k=50,            # Optional: control diversity
        top_p=0.9            # Optional: control diversity
    )
    response = tokenizer.decode(completion[0], skip_special_tokens=True)
    return response.replace(prompt, "")

# Second Prompt Example
prompt = "What is ESSA?"
response = generate_response(prompt)
display_chat(prompt, response)



Prompt:
What is ESSA?

Response:

Answer: Title I schools in the Title I schools in the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


In [20]:
# Load the fine-tuned model for inference
def load_finetuned_model_and_tokenizer(model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

model_path = './models/gemma_train1'
model, tokenizer = load_finetuned_model_and_tokenizer(model_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
from peft import get_peft_model, LoraConfig

In [22]:
# Reload dataset
df = pd.read_csv('ESSA qna_csv.csv')
dataset = CustomDataset(df.to_dict(orient='records'), tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [23]:
# Set up LoRA configuration
lora_config = LoraConfig(r=4, lora_alpha=16, lora_dropout=0.1, bias="none")
model = get_peft_model(model, lora_config)

# Define an optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

In [24]:
# Training loop with validation
def train_model(model, dataloader, optimizer, epochs=2):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            input_ids, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch: {epoch + 1}, Loss: {total_loss / len(dataloader)}")

In [25]:
# Train the model
train_model(model, dataloader, optimizer, epochs=3)

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Epoch: 1, Loss: 3.387894919603178
Epoch: 2, Loss: 3.1342530970526212
Epoch: 3, Loss: 2.968976615679146


In [26]:
# Save model
def save_model_and_tokenizer(model, tokenizer, path='./models/gemma_LoRAfinetuned1'):
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

save_model_and_tokenizer(model, tokenizer)

In [27]:
  # Initialize prompt display
def display_chat(prompt, response):
    print("Prompt:")
    print(prompt)
    print("\nResponse:")
    print(response)

# Generate set-up for model response
def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    completion = model.generate(input_ids, max_new_tokens=100, temperature=0.0)
    response = tokenizer.decode(completion[0], skip_special_tokens=True)
    return response.replace(prompt, "")

# Third Prompt Example
prompt = "What is ESSA?"
response = generate_response(prompt)
display_chat(prompt, response)



Prompt:
What is ESSA?

Response:

Answer: ESSA requires states to ensure accountability and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency,


In [28]:
# Initialize prompt display
def display_chat(prompt, response):
    print("Prompt:")
    print(prompt)
    print("\nResponse:")
    print(response)

# Generate set-up for model response
def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    completion = model.generate(
        input_ids,
        max_new_tokens=200,  # Increase if necessary
        temperature=0.7,     # Adjust to introduce variability
        top_k=50,            # Optional: control diversity
        top_p=0.9            # Optional: control diversity
    )
    response = tokenizer.decode(completion[0], skip_special_tokens=True)
    return response.replace(prompt, "")

# Fourth Prompt Example
prompt = "What is ESSA?"
response = generate_response(prompt)
display_chat(prompt, response)



Prompt:
What is ESSA?

Response:

Answer: ESSA requires states to ensure accountability and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational ag

In [29]:
# RAG Implementation
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

In [30]:
uploaded = files.upload()

Saving ESSA RAG file_10.21.docx to ESSA RAG file_10.21.docx


In [35]:
class DocumentWithText:
    def __init__(self, content, metadata=None):
        self.page_content = content
        self.metadata = metadata if metadata is not None else {}

In [36]:
!pip install python-docx
from docx import Document

# Load and split context documents
def load_and_split_documents(file_path):
    # Load the Word document
    doc = Document(file_path)
    documents = [DocumentWithText(paragraph.text) for paragraph in doc.paragraphs if paragraph.text]

    # Here you can choose how to split the text
    text_splitter = CharacterTextSplitter(chunk_size=9000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    return texts

# Load the Word document
texts = load_and_split_documents("ESSA RAG file_10.21.docx")



In [37]:
# Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = Chroma.from_documents(texts, embeddings)
retriever = db.as_retriever()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [38]:
from langchain.prompts import PromptTemplate
# Create a prompt template for RAG
template = """Use the following information to answer the question:
{context}
Question: {question}
Answer:"""
prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])


In [39]:
from transformers import pipeline, GenerationConfig
# Create the RetrievalQA chain
llm = HuggingFacePipeline(pipeline=pipeline("text2text-generation", model=model, tokenizer=tokenizer,
                                           generation_config=GenerationConfig(max_new_tokens=256))) # Added generation_config
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

The model 'PeftModel' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
  llm = HuggingFacePipeline(pipeline=pipeline("text2text-generat

In [40]:
# Example RAG query
query = "What is ESSA?"
result = qa_chain({"query": query})

# Print the entire result for debugging
print(result)

# Access the 'result' key directly
retrieved_text = result['result'] if 'result' in result else "No context available."

# Create the prompt using the retrieved text
prompt = f"Use the following information to answer the question:\n{retrieved_text}\nQuestion: {query}\nAnswer:"

# Generate the response using the prompt
response = generate_response(prompt)

# Display the question and answer
print("Question:", query)
print("Answer:", response)

  result = qa_chain({"query": query})


{'query': 'What is ESSA?', 'result': "Use the following information to answer the question:\nLanguage Instruction for English learners:  An LEA using ESSA  funds to provide a language instruction educational program, not later than 30 days after the beginning of the school year, inform parents of an English learner identified for participation or participating in such a program.  For a child who has not been identified as an English learner prior to the beginning of the school year but is identified as an English learner during such school year, an LEA must notify the child's parents during the first two weeks of the child being placed in a language instruction educational program.\n\nESSA covers the following grants below. Our K-12 schools are probably most familiar with the Consolidation Application grants for Title I, II, and IV; EL grant or Title III. Other ESSA grants that are a little less common include RLIS, 21st CCLC, N&D, Migrant, and McKinney Vento.:\n\nParental participatio



Question: What is ESSA?
Answer:  The federal government remains to ensure accountability systems. The state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency. The federal government government remains committed to ensure accountability and the state educational agency. The federal government remains committed to ensure accountability and the state educational agency. The federal government remains committed to ensure accountability and the state educational agency. The federal gover

In [41]:
# RAG query 2
query = "What are State responsibilites for developing academic standards?"
result = qa_chain({"query": query})

# Print the entire result for debugging
print(result)

# Access the 'result' key directly
retrieved_text = result['result'] if 'result' in result else "No context available."

# Create the prompt using the retrieved text
prompt = f"Use the following information to answer the question:\n{retrieved_text}\nQuestion: {query}\nAnswer:"

# Generate the response using the prompt
response = generate_response(prompt)

# Display the question and answer
print("Question:", query)
print("Answer:", response)

{'query': 'What are State responsibilites for developing academic standards?', 'result': 'Use the following information to answer the question:\n§\u2009200.1 State responsibilities for developing challenging academic standards.\n\n(a)\xa0Academic standards in general.\xa0 A State must adopt challenging academic content standards and aligned academic achievement standards that will be used by the State, its local educational agencies (LEAs), and its schools to carry out this subpart. These academic standards must be the same state academic content standards and aligned academic achievement standards that the State applies to all public schools and public school students in the State, including the public schools and public school students served under this subpart\n\nEach State, in consultation with its LEAs, must implement a system of high-quality, yearly student academic assessments that include, at a minimum, academic assessments in mathematics, reading/language arts, and science.\n\



Question: What are State responsibilites for developing academic standards?
Answer:  The federal government remains committed to ensure accountability and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency, and the state educational agency. The federal government remains committed to ensure accountability systems. The state educational agency, and the state educational agency, and the state educational agen