In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from transformers.trainer_callback import TrainerCallback
import itertools
from IPython.display import clear_output

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide setup: model/dataset configuration, tokenizer, tokenized
# train/test splits, the pretrained base model and the compute device.
# ---------------------------------------------------------------------------

# Define the logger explicitly so this cell does not depend on a `logger`
# name leaking in through `from utilities import *`.
logger = logging.getLogger(__name__)

# Pretrained checkpoint to fine-tune.
model_name = "EleutherAI/pythia-160m"
# Alternative checkpoint:
#model_name = "EleutherAI/pythia-70m"

# The dataset is expected in a "content" folder under the current working directory.
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, "content")
dataset_name = "ai-medical-chatbot_processed.jsonl"
dataset_path = os.path.join(folder_path, dataset_name)
# Alternative absolute location:
#dataset_path = f"/content/{dataset_name}"

# False -> the dataset is read from `dataset_path` rather than the HF hub
# (presumably interpreted by `tokenize_and_split_data` -- confirm in utilities).
use_hf = False

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

# The tokenizer reuses the end-of-sequence token as its padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Prefer a CUDA device when at least one is visible, otherwise fall back to CPU.
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [None]:
from collections import Counter
import math

def cosine_similarity(str1, str2):
    """
    Computes the cosine similarity between two strings using the Bag-of-Words model.

    Each string is split on whitespace and turned into a word-count vector.
    Tokens are compared exactly as written (case and punctuation are kept).

    Args:
        str1: The first string.
        str2: The second string.

    Returns:
        A float representing the cosine similarity between the two strings.
        Always a float: 0.0 is returned when either string contains no tokens
        (its count vector has zero length, so the similarity is undefined).
    """
    # Bag of words (token -> occurrence count) for each string
    bow1 = Counter(str1.split())
    bow2 = Counter(str2.split())

    # An empty bag has magnitude zero; avoid dividing by zero
    if not bow1 or not bow2:
        return 0.0

    # Only words present in both strings contribute to the dot product
    shared_words = bow1.keys() & bow2.keys()
    dot_product = sum(bow1[word] * bow2[word] for word in shared_words)

    # Each magnitude depends only on that string's own counts
    magnitude1 = math.sqrt(sum(count ** 2 for count in bow1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in bow2.values()))

    return dot_product / (magnitude1 * magnitude2)


In [None]:
def inference_new(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=1000):
  """Generate a continuation of ``text`` and return only the newly generated text.

  Args:
    text: Prompt string.
    model: Model exposing ``.device`` and ``.generate(...)``.
    tokenizer: Tokenizer exposing ``encode``, ``batch_decode``,
      ``pad_token_id`` and ``eos_token_id``.
    max_input_tokens: The prompt is truncated to at most this many tokens.
    max_output_tokens: Forwarded to ``generate`` as ``max_length``.
      NOTE(review): in Hugging Face ``generate`` that limit counts the prompt
      tokens as well -- confirm against the installed transformers version.

  Returns:
    The decoded text of the tokens produced after the prompt.
  """
  device = model.device

  # Tokenize and place the prompt on the model's device
  input_ids = tokenizer.encode(
      text,
      return_tensors="pt",
      truncation=True,
      max_length=max_input_tokens
  ).to(device)

  # Build the mask from the already-moved ids so both tensors share a device
  # (previously the mask stayed on CPU while the ids were moved to the model).
  attention_mask = torch.ones_like(input_ids)
  attention_mask[input_ids == tokenizer.pad_token_id] = 0

  # Generate
  generated_tokens_with_prompt = model.generate(
      input_ids,
      max_length=max_output_tokens,
      attention_mask=attention_mask,
      pad_token_id=tokenizer.eos_token_id  # Set pad token
  )

  # Strip the prompt at token level. Slicing the decoded string by len(text)
  # drops part of the answer whenever the prompt was truncated or does not
  # decode back to exactly the original characters.
  prompt_length = input_ids.shape[1]
  generated_answer_tokens = generated_tokens_with_prompt[:, prompt_length:]

  # Decode
  generated_text_answer = tokenizer.batch_decode(
      generated_answer_tokens, skip_special_tokens=True
  )[0]
  return generated_text_answer



In [None]:
from transformers import AutoModelForCausalLM
import shutil
def _hyperparameters_to_run_name(hyperparameters):
  """Build the output-directory name for one hyperparameter combination.

  Values are joined in dictionary order. Floats are rendered with the ``g``
  format so that small values such as 1e-06 stay distinguishable (casting
  them to ``int`` turned every learning rate into ``0``); integral floats
  still render without a decimal part (``3.0`` -> ``3``).
  """
  parts = []
  for value in hyperparameters.values():
    if isinstance(value, float):
      parts.append(f"{value:g}")
    elif isinstance(value, int):
      parts.append(str(int(value)))
    else:
      parts.append(str(value))
  return "ai_medical_" + "_".join(parts)


def train_model(hyperparameters, delete=False, testing=False):
  """Fine-tune a fresh copy of the pretrained model and score it on one test example.

  Reads the notebook globals ``training_config``, ``device``, ``train_dataset``,
  ``test_dataset`` and ``tokenizer``. ``Trainer`` is presumably the custom
  trainer from ``utilities`` (it is given ``model_flops`` / ``total_steps``)
  -- confirm against that module.

  Args:
    hyperparameters: Dict with at least the keys "learning_rate",
      "num_train_epochs", "max_steps", "per_device_train_batch_size",
      "optim" and "gradient_accumulation_steps". All values, in order, also
      form the output directory name.
    delete: When True, the saved model directory is removed before returning.
    testing: When True, the test question, reference answer and predicted
      answer are appended to the returned tuple.

  Returns:
    ``(eval_results, training_output, metric_cosine_similarity)`` or, when
    ``testing`` is True, ``(eval_results, training_output,
    metric_cosine_similarity, test_question, test_answer, predicted_answer)``.
  """
  max_steps = hyperparameters["max_steps"]

  # One directory per hyperparameter combination
  trained_model_name = _hyperparameters_to_run_name(hyperparameters)
  output_dir = trained_model_name

  training_args = TrainingArguments(
    # Learning rate
    learning_rate=hyperparameters["learning_rate"],

    # Number of training epochs
    num_train_epochs=hyperparameters["num_train_epochs"],

    # Max steps to train for (each step is a batch of data)
    # Overrides num_train_epochs, if not -1
    max_steps=max_steps,

    # Batch size for training
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],

    # Directory to save model checkpoints
    output_dir=output_dir,

    # Other arguments
    overwrite_output_dir=False, # Do not overwrite existing content of the output directory
    disable_tqdm=False, # Keep progress bars enabled
    eval_steps=120, # Number of update steps between two evaluations
    save_steps=120, # After # steps model is saved
    warmup_steps=1, # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim=hyperparameters["optim"],
    gradient_accumulation_steps = hyperparameters['gradient_accumulation_steps'],
    gradient_checkpointing=False,
    # Best-checkpoint selection (lower eval_loss is better)
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
  )

  # Start every run from the pretrained weights. Training the shared global
  # `base_model` in place made each call continue from the previous call's
  # weights, so runs with different hyperparameters were not comparable.
  model = AutoModelForCausalLM.from_pretrained(
    training_config["model"]["pretrained_name"]
  )
  model.to(device)

  model_flops = (
    model.floating_point_ops(
      {
        "input_ids": torch.zeros(
            (1, training_config["model"]["max_length"])
        )
      }
    )
    * training_args.gradient_accumulation_steps
  )

  print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
  print("Flops", model_flops / 1e9, "GFLOPs")

  trainer = Trainer(
    model=model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
  )
  training_output = trainer.train()
  # Evaluate the model
  eval_results = trainer.evaluate()

  # Save, reload from disk, and answer the first test question with the reloaded model
  save_dir = f'{output_dir}'
  trainer.save_model(save_dir)
  print("Saved model to:", save_dir)
  finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
  finetuned_slightly_model.to(device)
  test_question = test_dataset[0]['question']
  print("Question input (test):", test_question)
  predicted_answer=inference_new(test_question, finetuned_slightly_model, tokenizer)
  print("Finetuned slightly model's answer: ")
  print(predicted_answer)
  test_answer = test_dataset[0]['answer']
  print("Target answer output (test):", test_answer)
  metric_cosine_similarity=cosine_similarity(test_answer, predicted_answer)
  print("Cosine Similarity:", metric_cosine_similarity)

  # Wipe the (verbose) training output from the cell before reporting cleanup
  clear_output()
  # Deleting the folder to save space
  if delete:
    shutil.rmtree(save_dir)
    print("Deleted model folder:", save_dir)

  if testing:
    return eval_results, training_output, metric_cosine_similarity,test_question,test_answer,predicted_answer

  return eval_results, training_output, metric_cosine_similarity


In [None]:
# Hyperparameters for a single short fine-tuning run.
# Key order matters: train_model joins the values, in order, into the
# output directory name.
hyperparameters = {
    "learning_rate": 1e-6,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "optim": "adafactor",
    "num_iterations": 1,
    "max_steps": 3,
    "gradient_accumulation_steps": 2,
}

In [None]:
# Single training run with the hyperparameters above; the saved model folder
# is deleted afterwards and the test question/answers are returned as well.
(
    eval_results,
    training_output,
    metric_cosine_similarity,
    test_question,
    test_answer,
    predicted_answer,
) = train_model(hyperparameters, delete=True, testing=True)

In [None]:
# Show the first test example together with its reference answer.
test_text = test_dataset[0]['question']
print(f"Question input (test): {test_text}")
print("Correct answer from ai-medical-chatbot:", test_dataset[0]['answer'])
# NOTE(review): the inference call below is disabled, so only the label is printed.
print("Model's answer: ")
#print(inference_new(test_text, base_model, tokenizer))

In [None]:
from tqdm import tqdm
import pandas as pd

In [None]:
def find_best_hyperparameters(hyperparameter_space=None):
    """Grid-search hyperparameters with ``train_model`` and keep the lowest eval loss.

    Every combination of the values in ``hyperparameter_space`` is trained via
    ``train_model(..., delete=True, testing=True)``.

    Args:
        hyperparameter_space: Optional dict mapping a hyperparameter name to
            the list of values to try. It must provide every key that
            ``train_model`` reads. When omitted, the notebook's default grid
            (defined below) is used.

    Returns:
        A tuple ``(best_hyperparameters, best_loss, df)``:
        ``best_hyperparameters`` is the dict of the trial with the smallest
        ``eval_results["eval_loss"]`` (None if no trial beat infinity),
        ``best_loss`` is that loss, and ``df`` is a DataFrame with one row per
        trial and the columns 'Question', 'Answer', 'Prediction',
        'Hyperparameters', 'Evaluation Results', 'Training Output' and
        'Cosine Similarity'.
    """
    if hyperparameter_space is None:
        # Default search space.
        # NOTE(review): "max_steps" is 3 here, and train_model's own comment
        # says max_steps overrides num_train_epochs when it is not -1, so the
        # four epoch values presumably train identically -- confirm.
        hyperparameter_space = {
            "learning_rate": [1e-6, 1e-5, 1e-4],
            "num_train_epochs": [1,5,10,20],
            "per_device_train_batch_size": [1],
            "optim": ["adafactor"],
            "max_steps": [3],
            "gradient_accumulation_steps": [3],
        }

    # Generate all combinations of hyperparameters
    names = list(hyperparameter_space.keys())
    all_hyperparameters = list(itertools.product(*hyperparameter_space.values()))

    best_hyperparameters = None
    best_loss = float('inf')

    # One list per output column; each trial appends one entry to every list
    data = {
        'Question': [],
        'Answer': [],
        'Prediction': [],
        'Hyperparameters': [],
        'Evaluation Results': [],
        'Training Output': [],
        'Cosine Similarity': [],
    }

    for hyperparameter_values in tqdm(all_hyperparameters):
        hyperparameters = dict(zip(names, hyperparameter_values))

        # Print the current hyperparameters
        print("Using hyperparameters:")
        for key, value in hyperparameters.items():
            print(f"{key}: {value}")

        # Train and evaluate this combination
        (
            eval_results,
            training_output,
            metric_cosine_similarity,
            test_question,
            test_answer,
            predicted_answer,
        ) = train_model(hyperparameters, delete=True, testing=True)

        data['Question'].append(test_question)
        data['Answer'].append(test_answer)
        data['Prediction'].append(predicted_answer)
        data['Hyperparameters'].append(hyperparameters)
        data['Evaluation Results'].append(eval_results)
        data['Training Output'].append(training_output)
        data['Cosine Similarity'].append(metric_cosine_similarity)

        # Check if this set of hyperparameters gives better results
        if eval_results["eval_loss"] < best_loss:
            best_loss = eval_results["eval_loss"]
            best_hyperparameters = hyperparameters

        # Keep the cell output from growing with every trial
        clear_output()

    df = pd.DataFrame(data)

    return best_hyperparameters, best_loss, df

In [None]:
# Run the grid search: returns the best combination, its eval loss, and a
# DataFrame with one row per trial.
(best_hyperparameters, best_loss, df) = find_best_hyperparameters()

In [None]:
# Report the winning combination and the loss it achieved.
print(f"Best hyperparameters: {best_hyperparameters}")
print(f"Best loss: {best_loss}")

In [None]:
# Order the trials by ascending eval loss. The loss is read from the
# 'eval_loss' entry of each dict stored in the 'Evaluation Results' column.
df_sorted = df.sort_values(
    'Evaluation Results',
    key=lambda results: results.map(lambda trial: trial['eval_loss']),
)



In [None]:
# Display the five trials with the lowest eval loss.
df_sorted.head(n=5)

In [None]:
# Rank the trials by cosine similarity, best (largest) first.
df_cos = df.sort_values('Cosine Similarity', ascending=False)

In [None]:
# Display only the answer-quality columns of the similarity-ranked trials.
df_cos.loc[:, ["Answer", "Prediction", "Cosine Similarity", "Evaluation Results"]]