In [None]:
!pip install rouge-score datasets transformers huggingface_hub torch evaluate
import os
import pandas as pd
import torch
from datasets import Dataset
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from huggingface_hub import login

# 🔹 Disable Weights & Biases (WANDB)
os.environ["WANDB_DISABLED"] = "true"

# 🔹 Login to Hugging Face
# SECURITY: an access token must never be hard-coded in a notebook. The token
# that used to be written here has to be treated as leaked and revoked at
# https://huggingface.co/settings/tokens. The token is now read from the
# HF_TOKEN environment variable; login is skipped when it is not set
# (presumably fine here because only public Hub resources are read — confirm).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# 🔹 Load dataset
df = pd.read_parquet("hf://datasets/AI4Chem/ChemData700K/data/train-00000-of-00001.parquet")
df = df.head(10000)  # Use 10k records

# 🔹 Remove any nulls (if present)
df = df.dropna(subset=["instruction", "input", "output"])

# 🔹 Combine instruction and input into one input string
def combine_fields(example):
    """Build the model prompt and the target text for one dataset row.

    Args:
        example: Mapping-like row (e.g. a pandas Series or a dict) with the
            keys "instruction", "input" and "output".

    Returns:
        dict with "input_text" ("question: <instruction> <input>") and
        "target_text" (the row's "output", unchanged).
    """
    # Join only the non-empty parts so that an empty `input` does not leave a
    # dangling trailing space in the prompt.
    parts = (str(example["instruction"]).strip(), str(example["input"]).strip())
    prompt = " ".join(part for part in parts if part)
    return {
        "input_text": f"question: {prompt}",
        "target_text": example["output"],
    }

# Build the "input_text" / "target_text" columns row by row.
combined_data = df.apply(combine_fields, axis=1, result_type="expand")
# The frame's index may be non-contiguous after head()/dropna(); without
# preserve_index=False the pandas index can be materialised as an extra
# `__index_level_0__` column in the Dataset.
dataset = Dataset.from_pandas(combined_data, preserve_index=False)

# 🔹 Load tokenizer and model
MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# 🔹 Tokenization
def preprocess_function(examples):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Args:
        examples: Batch dict with "input_text" and "target_text" lists, as
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the target token ids (unpadded, truncated to 128 tokens).
    """
    # No padding here on purpose: DataCollatorForSeq2Seq pads every batch
    # dynamically and fills label padding with -100, which the loss ignores.
    # Padding to max_length at this stage would put pad-token ids into the
    # labels (so the model is trained to emit padding) and would make every
    # batch 512 tokens long regardless of its content.
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
    )

    labels = tokenizer(
        examples["target_text"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 🔹 Tokenize
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# 🔹 Train/test split (80/20)
# Fixed seed so the split (and therefore the reported metric) is reproducible
# across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 🔹 Evaluation metric
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE-L between decoded predictions and decoded references.

    Args:
        eval_preds: ``(predictions, labels)`` pair produced by the trainer's
            evaluation loop. Predictions may be token ids, raw logits, or a
            tuple whose first element is one of those.

    Returns:
        dict with a single "rougeL" float.
    """
    import numpy as np

    preds, labels = eval_preds
    # Some models return a tuple (logits, extra outputs); keep the first item.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # Raw logits of shape (batch, seq, vocab) -> greedy token ids.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks ignored/padded positions and cannot be decoded; map it back
    # to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds < 0, pad_id, preds)
    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The metric object is bound to `rouge_metric` at module level.
    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    score = result["rougeL"]
    # `evaluate` returns plain floats; the legacy metric API returned an
    # aggregate object exposing `.mid.fmeasure`. Accept both.
    if hasattr(score, "mid"):
        score = score.mid.fmeasure
    return {"rougeL": float(score)}

# 🔹 Training arguments
# Seq2SeqTrainer with predict_with_generate=True hands *generated token ids*
# to compute_metrics. The plain Trainer instead collects the raw logits
# (batch x seq x vocab) of every evaluation example, which is the likely
# cause of the CUDA out-of-memory failure seen during evaluation.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-qa-chem",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    report_to="none",
    predict_with_generate=True,
    generation_max_length=128,  # same cap as the tokenized targets
)

# 🔹 Data collator
# Pads inputs and labels per batch (labels are padded with -100 by default).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 🔹 Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 🔹 Train the model
trainer.train()

# 🔹 Save the model
trainer.save_model("./t5-qa-chem-trained")

# 🔹 Function to answer chemistry questions (no manual context!)
def answer_chemistry_question(question_text, max_new_tokens=128):
    """Generate an answer for a chemistry question with the fine-tuned model.

    Args:
        question_text: The question; it is wrapped in the same
            "question: ..." prompt format used for the training inputs.
        max_new_tokens: Upper bound on generated tokens. Defaults to 128,
            the cap applied to the training targets; without an explicit
            limit, generation falls back to the model's default length,
            which is typically much shorter.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = f"question: {question_text}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=512,  # same input cap as in preprocessing
    ).to(model.device)

    # Make sure dropout is disabled; generate() does not switch modes itself.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 🔹 Demo: ask one question and show the model's reply
question = "What is the molecular weight of water?"
print("Q:", question)
reply = answer_chemistry_question(question)
print("A:", reply)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12=

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.19 GiB. GPU 0 has a total capacity of 14.74 GiB of which 6.16 GiB is free. Process 6827 has 8.57 GiB memory in use. Of the allocated memory 7.28 GiB is allocated by PyTorch, and 1.17 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
!pip install datasets transformers huggingface_hub torch
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from huggingface_hub import login

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Login to Hugging Face
# SECURITY: an access token must never be hard-coded in a notebook. The token
# that used to be written here has to be treated as leaked and revoked at
# https://huggingface.co/settings/tokens. The token is now read from the
# HF_TOKEN environment variable; login is skipped when it is not set
# (presumably fine here because only public Hub resources are read — confirm).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Load the ChemData700K training shard and keep the first 10,000 rows.
# Remove the `head` call to fine-tune on every row instead.
df = pd.read_parquet("hf://datasets/AI4Chem/ChemData700K/data/train-00000-of-00001.parquet")
df = df.head(10000)
# Drop rows with missing fields so that str(None)/str(nan) never ends up in
# the training text built by the tokenization step.
df = df.dropna(subset=["instruction", "input", "output"])
# preserve_index=False: after dropping rows the index may be non-contiguous
# and would otherwise be stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Load Bloomz model for QA (non-Google model)
MODEL_NAME = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Tokenization function for causal LM fine-tuning
def tokenize_function(examples):
    """Tokenize a batch of rows into prompt+answer sequences for causal LM training.

    Each row becomes the text
        "Question: <input>\\nContext: <instruction>\\nAnswer: <output><eos>"
    which is padded/truncated to 256 tokens.

    Args:
        examples: Batch dict with "input", "instruction" and "output" lists,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which padding positions are replaced by -100.
    """
    prompts = [
        "Question: " + str(q) + "\nContext: " + str(ctx) + "\nAnswer:"
        for q, ctx in zip(examples["input"], examples["instruction"])
    ]
    answers = [str(ans) for ans in examples["output"]]
    # Append the end-of-sequence token so the model learns where an answer
    # stops (assumes the tokenizer does not add EOS on its own — TODO confirm
    # for other checkpoints). Truncation may still cut it off for long rows.
    eos = tokenizer.eos_token or ""
    full_texts = [prompt + " " + answer + eos for prompt, answer in zip(prompts, answers)]

    tokenized = tokenizer(
        full_texts,
        padding="max_length",
        truncation=True,
        max_length=256,  # Adjust as needed for your data
    )
    # Labels mirror input_ids, except that padding positions are set to -100
    # so the loss ignores them instead of training the model to emit padding.
    tokenized["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize the dataset (remove original columns afterward)
import gc

tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["instruction", "input", "output"]
)

# Release GPU memory that is cached but no longer referenced (e.g. left over
# from an earlier cell in the same session). If memory is still held by live
# objects from a previous run, restart the runtime before training.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Define training arguments (no evaluation: all rows are used for training,
# and evaluation is off by default).
# The memory-related settings aim to fit fine-tuning on a single ~15 GiB GPU;
# they reduce peak memory but cannot guarantee it — lower max_length or use a
# smaller model if out-of-memory errors persist.
training_args = TrainingArguments(
    output_dir="./bloomz-qa-results",
    learning_rate=3e-5,
    per_device_train_batch_size=1,   # smaller micro-batch ...
    gradient_accumulation_steps=4,   # ... same effective batch size of 4
    gradient_checkpointing=True,     # trade compute for activation memory
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    report_to="none",                # replaces the deprecated WANDB_DISABLED switch
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the trained model
trainer.save_model("./bloomz-qa-trained")

# Inference function to answer any chemistry-related question
def answer_question(question, context=None):
    """Generate an answer to a chemistry question with the fine-tuned causal LM.

    Args:
        question: The question text.
        context: Optional instruction/context line for the prompt. When
            ``None``, a generic instruction is used.

    Returns:
        The generated continuation (text after the prompt), stripped of
        surrounding whitespace and special tokens.
    """
    # If no external context is provided, use a default instruction prompt.
    if context is None:
        context = "Use your scientific knowledge to answer the following chemistry question."
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Make sure dropout is disabled; generate() does not switch modes itself.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=4,
            early_stopping=True
        )
    # A causal LM returns prompt + continuation. Drop the prompt by token
    # count: slicing the decoded string by len(prompt) breaks whenever
    # decoding does not reproduce the prompt character-for-character
    # (or when the prompt was truncated).
    new_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return answer

# Demo: query the model with only a question; the default instruction is
# used as context.
question = "What is the molecular weight of water?"
response = answer_question(question)
for label, text in (("Question:", question), ("Answer:", response)):
    print(label, text)




tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 16.12 MiB is free. Process 6827 has 14.72 GiB memory in use. Of the allocated memory 14.56 GiB is allocated by PyTorch, and 31.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)