In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
else:
    print(gpu_info)

Tue Dec  3 04:41:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to recognise a high-RAM runtime.
if ram_gb >= 20:
    print('You are using a high-RAM runtime!')
else:
    print('To enable a high-RAM runtime, select the Runtime → "Change runtime type" ')
    print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
    print('re-execute this cell.')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import subprocess

def get_gpu_memory():
    """Print the free and total memory (in MiB) of every GPU listed by ``nvidia-smi``.

    The function runs
    ``nvidia-smi --query-gpu=memory.free,memory.total --format=csv,nounits,noheader``
    and prints one line per GPU. Problems are reported with a printed message
    instead of an exception:

    * ``nvidia-smi`` is not on the PATH,
    * ``nvidia-smi`` exits with a non-zero status (its stderr is printed),
    * ``nvidia-smi`` produces no rows at all,
    * a row cannot be parsed as two integers (for example ``[N/A]`` values).

    Returns:
        None. All results are written to standard output.
    """
    command = ['nvidia-smi', '--query-gpu=memory.free,memory.total', '--format=csv,nounits,noheader']
    try:
        # Run nvidia-smi and capture both output streams as text
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except FileNotFoundError:
        print("nvidia-smi command not found. Make sure NVIDIA drivers are installed and the PATH is set.")
        return

    if result.returncode != 0:
        print("Failed to run nvidia-smi. Is NVIDIA driver installed?")
        print(result.stderr)
        return

    # Drop blank rows so that empty output or stray newlines never reach int().
    rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
    if not rows:
        print("nvidia-smi reported no GPUs.")
        return

    for i, row in enumerate(rows):
        try:
            free, total = (int(field) for field in row.split(','))
        except ValueError:
            # Non-numeric fields or an unexpected column count: report and move on.
            print(f"GPU {i}: memory information unavailable ({row})")
            continue
        print(f"GPU {i}: {free} MiB free out of {total} MiB total")

# Run the function
get_gpu_memory()

GPU 0: 15101 MiB free out of 15360 MiB total


In [None]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install tensorboard



In [None]:
# Mount Google Drive at /content/drive. Every data, checkpoint and output path
# used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset

# Paths to data files
# Both files are read line by line, with each line parsed by json.loads, by the
# data-preparation functions defined in this notebook (JSON Lines layout).
training_data_path = "/content/drive/MyDrive/646Project/scifact/training_data.jsonl"
eval_data_path = "/content/drive/MyDrive/646Project/scifact/evaluation_data.jsonl"


In [None]:
# Initialize the tokenizer
# The data-preparation functions in this notebook read `tokenizer.eos_token`
# from this module-level object, so it must exist before they are called.
from transformers import AutoTokenizer

model_name = "gpt2"  # Or any other GPT-2 variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding is requested during tokenization (padding="max_length"), so a pad
# token has to be defined; the end-of-sequence token is reused for it.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Prepare the training data
def prepare_train_data(training_data_path):
    """
    Build the training examples from a JSON Lines file.

    Every non-blank line must be a JSON object with ``query`` and
    ``positive_text`` keys. Each record becomes one string of the form
    ``"Generate a relevant pseudo-question for: <query> <EOS> <positive_text>"``,
    where ``<EOS>`` is the module-level ``tokenizer.eos_token``.

    :param training_data_path: Path to the UTF-8 encoded JSON Lines training file
    :return: List of ``{"text": ...}`` dicts, one per record, in file order
    :raises KeyError: If a record lacks ``query`` or ``positive_text``
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    eos_token = tokenizer.eos_token
    train_data = []
    # Explicit UTF-8 so decoding never depends on the platform's locale default.
    with open(training_data_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank separator / trailing lines carry no record; skip them
                # instead of letting json.loads fail on an empty string.
                continue
            example = json.loads(line)
            combined_text = f"Generate a relevant pseudo-question for: {example['query']} {eos_token} {example['positive_text']}"
            train_data.append({"text": combined_text})
    return train_data

# Load and process the training dataset
# `train_data` is a list of {"text": ...} dicts; Dataset.from_list wraps it so
# it can be tokenized later with `Dataset.map`.
train_data = prepare_train_data(training_data_path)
train_dataset = Dataset.from_list(train_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Step 2: Prepare the Evaluation Dataset
def prepare_eval_data(eval_data_path):
    """
    Build the evaluation examples from a JSON Lines file.

    Every non-blank line must be a JSON object with ``query`` and ``text``
    keys. Each record becomes one string of the form
    ``"Generate a relevant pseudo-question for: <query> <EOS> <text>"``, where
    ``<EOS>`` is the module-level ``tokenizer.eos_token``. The instruction
    prefix is the same one used for the training data.

    :param eval_data_path: Path to the UTF-8 encoded JSON Lines evaluation file
    :return: List of ``{"text": ...}`` dicts, one per record, in file order
    :raises KeyError: If a record lacks ``query`` or ``text``
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    eos_token = tokenizer.eos_token
    eval_data = []
    # Explicit UTF-8 so decoding never depends on the platform's locale default.
    with open(eval_data_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank separator / trailing lines carry no record; skip them
                # instead of letting json.loads fail on an empty string.
                continue
            example = json.loads(line)
            # Add the instructional prompt for consistency with training data
            combined_text = f"Generate a relevant pseudo-question for: {example['query']} {eos_token} {example['text']}"
            eval_data.append({"text": combined_text})
    return eval_data

# Load and process the evaluation dataset
# `eval_data` is a list of {"text": ...} dicts; Dataset.from_list wraps it so
# it can be tokenized later with `Dataset.map`.
eval_data = prepare_eval_data(eval_data_path)
eval_dataset = Dataset.from_list(eval_data)

In [None]:
# Step 3: Load GPT-2 Model and Tokenizer
# NOTE(review): `model_name` and `tokenizer` are already created with the same
# values in an earlier cell; this cell repeats that setup and additionally
# loads the causal-LM weights into `model`.
model_name = "gpt2"  # You can use "gpt2-medium" or "gpt2-large" if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 4: Tokenize the Datasets
def tokenize_function(example):
    """
    Tokenize the ``"text"`` entry of ``example`` with the module-level tokenizer.

    The tokenizer is called with ``truncation=True``, ``max_length=512`` and
    ``padding="max_length"``.

    :param example: Mapping that provides a ``"text"`` entry
    :return: The tokenizer's output for that text
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    text = example["text"]
    return tokenizer(text, **tokenizer_options)

# Apply tokenize_function to both splits. With batched=True, `map` presumably
# hands the function a batch per call, so example["text"] is a list of strings
# — TODO confirm against the `datasets` documentation.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Step 5: Define Data Collator
# NOTE(review): the pad token is the EOS token in this notebook; the collator
# presumably excludes pad positions from the loss, which would also exclude
# genuine EOS tokens — confirm if learning to emit EOS matters.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM requires no masking
)


Map:   0%|          | 0/906 [00:00<?, ? examples/s]

Map:   0%|          | 0/464 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Step 6: Define Training Arguments
# The training set is small (906 examples; 3 epochs at batch size 16 came to
# 171 optimizer steps), so every schedule below is expressed relative to the
# run length instead of as a fixed step count:
#   * warmup_ratio replaces a 500-step warmup that would outlast the whole run
#     and keep the learning rate from ever reaching `learning_rate`;
#   * evaluation and checkpointing happen once per epoch, because 500-step
#     intervals never fire in a run this short, leaving no validation loss and
#     no checkpoint for `load_best_model_at_end` to restore;
#   * logging every 10 steps gives a usable training-loss curve.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/646Project/scifact/results-gpt2",  # Save results in Google Drive
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,  # Warm up over the first 10% of the run, whatever its length
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/646Project/scifact/logs-gpt2",  # Save logs in Google Drive
    logging_steps=10,
    eval_strategy="epoch",  # Current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",  # Must match eval_strategy when load_best_model_at_end=True
    save_total_limit=2,
    learning_rate=5e-5,
    fp16=True,  # Enable mixed precision for faster training
    report_to="tensorboard",
    load_best_model_at_end=True,
)

# Step 7: Define Trainer
# `processing_class` is the replacement for the deprecated `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Step 8: Train the Model
training_output = trainer.train()

# Access metrics
print(f"Global Steps: {training_output.global_step}")
print(f"Training Loss: {training_output.training_loss}")
print(f"Metrics: {training_output.metrics}")

# Step 9: Save the Fine-Tuned Model
# The model and tokenizer go to the same directory so that a later cell can
# reload both from one path.
model.save_pretrained("/content/drive/MyDrive/646Project/scifact/gpt2_finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/646Project/scifact/gpt2_finetuned")

print("Model fine-tuned and saved successfully!")

  trainer = Trainer(


Step,Training Loss,Validation Loss


Global Steps: 171
Training Loss: 3.632921542340552
Metrics: {'train_runtime': 151.7432, 'train_samples_per_second': 17.912, 'train_steps_per_second': 1.127, 'total_flos': 710191742976000.0, 'train_loss': 3.632921542340552, 'epoch': 3.0}
Model fine-tuned and saved successfully!


In [None]:
# Score the trained model on the evaluation split handed to the Trainer.
eval_results = trainer.evaluate()

# Report every metric on its own line as "name: value".
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]}")

eval_loss: 2.747037887573242
eval_runtime: 10.4082
eval_samples_per_second: 44.58
eval_steps_per_second: 2.786
epoch: 3.0


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned GPT-2 model and tokenizer
# `model_path` is the directory the training cell wrote with save_pretrained.
model_path = "/content/drive/MyDrive/646Project/scifact/gpt2_finetuned"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ensure that the pad token is correctly set for consistent behavior
# (EOS doubles as the pad token, on the tokenizer and in the model config).
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Load the JSON file with queries
# `query_data` is used as a dict below; only its keys (the query strings) are read.
input_file_path = "/content/drive/MyDrive/646Project/scifact/query_text_to_pseudo_questions_test.json"
with open(input_file_path, "r") as f:
    query_data = json.load(f)

# Post-processing function to clean the response
def post_process_response(query, response):
    """
    Strip the instruction prefix from a generated response.

    If the response contains ``"Generate a relevant pseudo-question for:"``,
    every occurrence is removed and the result is stripped of leading and
    trailing whitespace. A response without the prefix is returned untouched
    (no whitespace stripping).

    :param query: The input query (accepted for interface symmetry; not used)
    :param response: The generated response
    :return: Cleaned response
    """
    prompt_prefix = "Generate a relevant pseudo-question for:"

    # Guard clause: nothing to clean when the prefix never appears.
    if prompt_prefix not in response:
        return response

    without_prefix = response.replace(prompt_prefix, "")
    return without_prefix.strip()

# Generate responses using fine-tuning format
def generate_response_with_controls(query):
    """
    Sample one continuation for ``query`` from the module-level ``model``.

    The prompt has the same layout as the fine-tuning text up to the EOS
    separator: ``"Generate a relevant pseudo-question for: <query> <EOS>"``.
    The first returned sequence is decoded in full with special tokens
    skipped.
    NOTE(review): the decoded text presumably still begins with the prompt,
    since the whole sequence is decoded — confirm against the model output.

    :param query: The input query
    :return: Decoded text of the first generated sequence
    """
    # Use the same format as fine-tuning
    prompt = f"Generate a relevant pseudo-question for: {query} {tokenizer.eos_token}"
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Sampling configuration, gathered in one place for readability.
    sampling_options = dict(
        attention_mask=encoded["attention_mask"],
        max_length=200,  # Adjust as needed
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,  # Sample instead of greedy decoding
        temperature=0.7,  # Control randomness
        top_p=0.9,  # Nucleus sampling for diversity
        repetition_penalty=1.2,  # Penalize repeated tokens
        num_return_sequences=1,  # Return one response per query
    )
    generated_sequences = model.generate(encoded["input_ids"], **sampling_options)

    first_sequence = generated_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Generate one cleaned response per query, keyed by the query text.
responses = {}
total_queries = len(query_data)

# Iterating the dict yields its keys, i.e. the query strings.
for i, question in enumerate(query_data, start=1):
    raw_response = generate_response_with_controls(question)
    cleaned_response = post_process_response(question, raw_response)
    responses[question] = cleaned_response

    # Progress report on every 10th query and on the final one.
    if i == total_queries or i % 10 == 0:
        print(f"Processed {i}/{total_queries} questions...")
        print(f"Question: {question}")
        print(f"Cleaned Response: {cleaned_response}")

# Persist the query -> response mapping as pretty-printed JSON.
output_file_path = "/content/drive/MyDrive/646Project/scifact/gpt2_responses_finetuned_format.json"
with open(output_file_path, "w") as f:
    json.dump(responses, f, indent=4)

print(f"Responses saved to {output_file_path}")

Processed 10/100 questions...
Question: ADAR1 binds to Dicer to cleave pre-miRNA.
Cleaned Response: ADAR1 binds to Dicer to cleave pre-miRNA.  What is the role of TKM in regulating miR2 expression? Does it regulate microRNAs and proteins that are involved in RNA replication or transmission by NcRs (noncerebellar nuclei)? Are they targeted at signaling pathways implicated in gene regulation through regulatory functions such as transcriptional activation, translation initiation and cell differentiation ? How do diphtheria regulates host cells via its interaction with different genes on protein folding sites using two distinct classes [16] , 17 ? The study was conducted over 12 weeks from April 25th till October 2nd 2016 during which time we found out more about how many RNIs were regulated under both proteins . We used an interferon beta/II ligand inhibitor system based upon RDA2721B7L6A3E9DQGCTTYZHCD5CXF
Processed 20/100 questions...
Question: Albendazole is used to treat lymphatic fila

In [None]:
import json
from datasets import load_dataset

# Load the GPT-2 responses file
# This is the file written by the GPT-2 generation cell: a JSON object mapping
# each query text to its cleaned response.
responses_file_path = "/content/drive/MyDrive/646Project/scifact/gpt2_responses_finetuned_format.json"
with open(responses_file_path, "r") as f:
    eval_with_responses = json.load(f)  # Loaded as a dictionary

# Load scifact dataset queries
# Each record of the "queries" split is read below through its "text" and
# "_id" fields.
dataset_q = load_dataset("BeIR/scifact", "queries")
queries_dataset = dataset_q["queries"]

# Map questions to their IDs
def map_questions_to_ids(eval_with_responses, queries_dataset):
    """
    Maps the queries in eval_with_responses to their IDs in the queries_dataset.

    A query matches when its text is exactly equal to a record's ``"text"``.
    If several records share a text, the first one encountered supplies the
    ID. Queries without a match (or whose matching ``"_id"`` is ``None``) are
    reported with a printed message and left out of the result.

    The dataset is traversed exactly once to build a text -> ID index, so the
    cost is linear in ``len(queries_dataset) + len(eval_with_responses)`` and
    one-shot iterables are supported.

    :param eval_with_responses: Dictionary with query-response pairs
    :param queries_dataset: Iterable of records exposing ``"text"`` and ``"_id"``
        (for example the HF dataset containing questions and their IDs)
    :return: Dictionary mapping question IDs to responses
    """
    # setdefault keeps the first ID seen for a repeated text, which is what a
    # first-match linear search would return.
    text_to_id = {}
    for query in queries_dataset:
        text_to_id.setdefault(query["text"], query["_id"])

    id_response_mapping = {}
    for question, response in eval_with_responses.items():
        question_id = text_to_id.get(question)
        if question_id is not None:
            id_response_mapping[question_id] = response
        else:
            print(f"ID not found for query: {question}")

    return id_response_mapping

# Create the mapping
# Queries whose text has no exact match in queries_dataset are reported by
# map_questions_to_ids and are absent from the result.
id_response_mapping = map_questions_to_ids(eval_with_responses, queries_dataset)

# Save the ID-to-response mapping to a file
id_response_output_path = "/content/drive/MyDrive/646Project/scifact/gpt2_id_response_mapping.json"
with open(id_response_output_path, "w") as f:
    json.dump(id_response_mapping, f, indent=4)

print(f"ID-to-response mapping saved to {id_response_output_path}")

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

scifact.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

queries/queries/0000.parquet:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/1109 [00:00<?, ? examples/s]

ID-to-response mapping saved to /content/drive/MyDrive/646Project/scifact/gpt2_id_response_mapping.json


In [None]:
# Install the OpenAI library if not already installed
!pip install openai



In [None]:
import getpass
import json
import os
import openai
from openai import OpenAI

# Set the OpenAI API key
# Use the key already present in the environment; otherwise ask for it
# interactively, so that no credential is ever stored in the notebook and a
# real key is never overwritten by a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI()  # Reads OPENAI_API_KEY from the environment


# Fine-tuned model ID
fine_tuned_model_id = "ft:gpt-3.5-turbo-0125:personal:3rdattemptnfcorpus:AZVe3AnC"

# Retrieve the fine-tuned model's details
# `fine_tuned_model_id` is a model ID ("ft:..."), not a fine-tuning job ID, so
# it is looked up through the models endpoint. The legacy `openai.FineTune`
# resource is not available in the client generation that provides `OpenAI()`.
fine_tune_job = client.models.retrieve(fine_tuned_model_id)
# Print the details
print(fine_tune_job)

In [None]:
################################################################################
#                              GPT-3 CONTEXT                                   #
################################################################################

import json
import os
import openai

# Set the OpenAI API key
# Use the key already present in the environment; otherwise ask for it
# interactively, so that no credential is ever stored in the notebook and a
# real key is never overwritten by a placeholder.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]
from openai import OpenAI
client = OpenAI()  # Reads OPENAI_API_KEY from the environment

# Load the JSON file with queries
# `query_data` is used as a dict below; only its keys (the query strings) are read.
input_file_path = "/content/drive/MyDrive/646Project/scifact/query_text_to_pseudo_questions_test.json"
with open(input_file_path, "r") as f:
    query_data = json.load(f)


# Define the system message guiding the model's behavior
system_message = (
    "You are a helpful assistant that generates one relevant, novel, and contextually rich question about the given input. Only provide the resulting question."
)

# Function to generate a response using GPT-3.5-turbo
def generate_response_with_gpt3(query_text):
    """
    Ask the fine-tuned GPT-3.5-turbo model for one new question about a query.

    The request is sent through the module-level ``client`` with the
    module-level ``system_message`` as the system prompt. Any exception raised
    while building, sending or reading the request is caught, reported with a
    printed message, and turned into a ``None`` result.

    :param query_text: The input query
    :return: The generated question with surrounding whitespace stripped, or
        ``None`` when the request failed
    """
    try:
        conversation = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": f"Generate a new question about the question: '{query_text}'"},
        ]
        request_options = {
            "model": "ft:gpt-3.5-turbo-0125:personal:3rdattemptnfcorpus:AZVe3AnC",  # Your fine-tuned model
            "messages": conversation,
            "max_tokens": 200,  # Limit the length of the response
            "temperature": 0.7,  # Control randomness
            "top_p": 0.9,  # Nucleus sampling
            "frequency_penalty": 1.2,  # Penalize repeated phrases
            "presence_penalty": 0.6,  # Encourage new topics
        }
        completion = client.chat.completions.create(**request_options)

        # Only the first choice is used.
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as e:
        print(f"Error generating response for query '{query_text}': {e}")
        return None

# For every query, ask the model five times and keep the non-empty answers
# joined into one string. Queries with no usable answer are left out.
responses = {}
total_queries = len(query_data)

# Iterating the dict yields its keys, i.e. the query strings.
for i, question in enumerate(query_data, start=1):
    candidates = [generate_response_with_gpt3(question) for _ in range(5)]
    concatenated_responses = [candidate for candidate in candidates if candidate]

    # Combine the collected responses into a single space-separated string
    if concatenated_responses:
        combined_response = " ".join(concatenated_responses)
        responses[question] = combined_response

    # Progress report on every 10th query and on the final one.
    if i == total_queries or i % 10 == 0:
        shown_response = combined_response if concatenated_responses else 'No valid responses generated.'
        print(f"Processed {i}/{total_queries} questions...")
        print(f"Question: {question}")
        print(f"Response: {shown_response}")

# Persist the query -> combined response mapping as pretty-printed JSON.
output_file_path = "/content/drive/MyDrive/646Project/scifact/gpt3_multiple_responses_concatenated.json"
with open(output_file_path, "w") as f:
    json.dump(responses, f, indent=4)

print(f"Responses saved to {output_file_path}")

Processed 10/100 questions...
Question: ADAR1 binds to Dicer to cleave pre-miRNA.
Response: Does the lack of a specific binding site on TGF-beta1 mRNA suggest that it is What are the two main types of diabetes that can be found in South Asians? Does the presence of antibodies to Campylobacter jejuni in patients with Guillain-Barre syndrome What is the relationship between mercury and sex hormones in male fish eaters? How does the human body recognize and respond to double-stranded RNA?
Processed 20/100 questions...
Question: Albendazole is used to treat lymphatic filariasis.
Response: What is the purpose of this work? What is the dietary condition called when an individual has a deficiency in tryptophan? Which of the following is not a recognized use for ivermectin: control or elimination? Does the study suggest that more women should be informed about MHT and how it may affect their What is the purpose of this study?
Processed 30/100 questions...
Question: Articles published in open a

In [None]:
import json
from datasets import load_dataset

# Load the GPT-3 multiple responses file
# This is the file written by the GPT-3 generation cell: a JSON object mapping
# each query text to its space-joined responses.
responses_file_path = "/content/drive/MyDrive/646Project/scifact/gpt3_multiple_responses_concatenated.json"
with open(responses_file_path, "r") as f:
    eval_with_responses = json.load(f)  # Loaded as a dictionary with original queries as keys

# Load scifact dataset queries
# Each record of the "queries" split is read below through its "text" and
# "_id" fields.
dataset_q = load_dataset("BeIR/scifact", "queries")
queries_dataset = dataset_q["queries"]

# Map questions to their IDs
def map_questions_to_ids(eval_with_responses, queries_dataset):
    """
    Maps the queries in eval_with_responses to their IDs in the queries_dataset.

    A query matches when its text is exactly equal to a record's ``"text"``.
    If several records share a text, the first one encountered supplies the
    ID. Queries without a match (or whose matching ``"_id"`` is ``None``) are
    reported with a printed message and left out of the result.

    The dataset is traversed exactly once to build a text -> ID index, so the
    cost is linear in ``len(queries_dataset) + len(eval_with_responses)`` and
    one-shot iterables are supported.

    :param eval_with_responses: Dictionary with query-response pairs
    :param queries_dataset: Iterable of records exposing ``"text"`` and ``"_id"``
        (for example the HF dataset containing questions and their IDs)
    :return: Dictionary mapping question IDs to concatenated responses
    """
    # setdefault keeps the first ID seen for a repeated text, which is what a
    # first-match linear search would return.
    text_to_id = {}
    for query in queries_dataset:
        text_to_id.setdefault(query["text"], query["_id"])

    id_response_mapping = {}
    for question, concatenated_response in eval_with_responses.items():
        question_id = text_to_id.get(question)
        if question_id is not None:
            id_response_mapping[question_id] = concatenated_response
        else:
            print(f"ID not found for query: {question}")

    return id_response_mapping

# Create the mapping from question IDs to concatenated responses
# Queries whose text has no exact match in queries_dataset are reported by
# map_questions_to_ids and are absent from the result.
id_response_mapping = map_questions_to_ids(eval_with_responses, queries_dataset)

# Save the ID-to-response mapping to a file
id_response_output_path = "/content/drive/MyDrive/646Project/scifact/gpt3_multiple_id_response_mapping.json"
with open(id_response_output_path, "w") as f:
    json.dump(id_response_mapping, f, indent=4)

print(f"ID-to-response mapping saved to {id_response_output_path}")

ID-to-response mapping saved to /content/drive/MyDrive/646Project/scifact/gpt3_multiple_id_response_mapping.json
