In [9]:
!pip install transformers
!pip install nltk rouge-score scikit-learn torch

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl

In [12]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
df = pd.read_csv('/content/gdrive/My Drive/Indigo-HackToHire/processed_data.csv', index_col=0)
df.head(5)

Unnamed: 0,question,tokenized_question,stemmed_question,setmmed_nostop_answer,answer,tokenized_answer,stemmed_answer
0,Why whenever I get in the shower my girlfriend...,"['why', 'whenever', 'i', 'get', 'in', 'the', '...","['whi', 'whenev', 'i', 'get', 'in', 'the', 'sh...","['whi', 'whenev', 'get', 'shower', 'girlfriend...",Isn’t it awful? You would swear that there was...,"['isn’t', 'it', 'awful', 'you', 'would', 'swea...","['isn’t', 'it', 'aw', 'you', 'would', 'swear',..."
1,"What is a proxy, and how can I use one?","['what', 'is', 'a', 'proxy', 'and', 'how', 'ca...","['what', 'is', 'a', 'proxi', 'and', 'how', 'ca...","['proxi', 'use', 'one']",A proxy server is a system or router that prov...,"['a', 'proxy', 'server', 'is', 'a', 'system', ...","['a', 'proxi', 'server', 'is', 'a', 'system', ..."
2,"What song has the lyrics ""someone left the cak...","['what', 'song', 'has', 'the', 'lyrics', 'some...","['what', 'song', 'ha', 'the', 'lyric', 'someon...","['song', 'ha', 'lyric', 'someon', 'left', 'cak...",MacArthur's Park\n,"['macarthur', 's', 'park']","['macarthur', 's', 'park']"
3,I am the owner of an adult website called http...,"['i', 'am', 'the', 'owner', 'of', 'an', 'adult...","['i', 'am', 'the', 'owner', 'of', 'an', 'adult...","['owner', 'adult', 'websit', 'call', 'http', '...",Don't let apps that are liers put adds on your...,"['don', 't', 'let', 'apps', 'that', 'are', 'li...","['don', 't', 'let', 'app', 'that', 'are', 'lie..."
4,Does the Bible mention anything about a place ...,"['does', 'the', 'bible', 'mention', 'anything'...","['doe', 'the', 'bibl', 'mention', 'anyth', 'ab...","['doe', 'bibl', 'mention', 'anyth', 'place', '...",St. John in the book of Revelation mentions an...,"['st', 'john', 'in', 'the', 'book', 'of', 'rev...","['st', 'john', 'in', 'the', 'book', 'of', 'rev..."


In [5]:
# Test & Train Split
df = df.sample(frac=1)

# Ratio and index definition
ratio = 0.75
total_rows = df.shape[0]
train_size = int(total_rows*ratio)

# Split data into test and train
train = df[0:train_size]
test = df[train_size:]

In [6]:
# Prepare the data in the format required by GPT-2
def prepare_data(df, output_file):
    with open(output_file, 'w') as f:
        for index, row in df.iterrows():
            question = row['question']
            answer = row['answer']
            f.write(f"Question: {question}\nAnswer: {answer}\n\n")

prepare_data(train, '/content/gdrive/My Drive/Indigo-HackToHire/qatrain_data.txt')
prepare_data(test, '/content/gdrive/My Drive/Indigo-HackToHire/qatest_data.txt')

In [7]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Create the dataset
def load_dataset(file_path, tokenizer, block_size=512):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=True
    )

dataset = load_dataset('/content/gdrive/My Drive/Indigo-HackToHire/qatrain_data.txt', tokenizer)

# Create the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./gpt_results',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2')
tokenizer.save_pretrained('/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss
500,3.2007
1000,3.0746
1500,2.9748
2000,2.9282


Step,Training Loss
500,3.2007
1000,3.0746
1500,2.9748
2000,2.9282
2500,2.9152


('/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2/tokenizer_config.json',
 '/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2/special_tokens_map.json',
 '/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2/vocab.json',
 '/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2/merges.txt',
 '/content/gdrive/My Drive/Indigo-HackToHire/fine-tuned-gpt2/added_tokens.json')

In [14]:
def generate_answer(model, tokenizer, question, device='cuda'):
    inputs = tokenizer.encode_plus(question, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0]))
    return answer

def compute_metrics(references, predictions):
    bleu_scores = []
    rouge_l_scores = []
    f1_scores = []

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    for ref, pred in zip(references, predictions):
        # BLEU score
        bleu_score = sentence_bleu([ref.split()], pred.split())
        bleu_scores.append(bleu_score)

        # ROUGE-L score
        rouge_l_score = rouge.score(ref, pred)['rougeL'].fmeasure
        rouge_l_scores.append(rouge_l_score)

        # F1-score
        ref_tokens = set(ref.split())
        pred_tokens = set(pred.split())
        common_tokens = ref_tokens.intersection(pred_tokens)

        if len(common_tokens) == 0:
            f1 = 0.0
        else:
            precision = len(common_tokens) / len(pred_tokens)
            recall = len(common_tokens) / len(ref_tokens)
            f1 = 2 * (precision * recall) / (precision + recall)
        f1_scores.append(f1)

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    avg_f1 = sum(f1_scores) / len(f1_scores)

    return avg_bleu, avg_rouge_l, avg_f1

# Example usage
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

questions = test.question
references = test.answer

predictions = [generate_answer(model, tokenizer, q, device) for q in questions]

avg_bleu, avg_rouge_l, avg_f1 = compute_metrics(references, predictions)

print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-L Score: {avg_rouge_l}")
print(f"Average F1 Score: {avg_f1}")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.0020519698581623175
Average ROUGE-L Score: 0.08896012579326204
Average F1 Score: 0.09304177460995446
