In [1]:
import os

# Set env vars BEFORE importing huggingface modules
os.environ["HF_HOME"] = "/projects/sciences/computing/sheju347/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/projects/sciences/computing/sheju347/.cache/huggingface/hub"

# Now import huggingface modules
from huggingface_hub import constants

print("HF_HOME:", constants.HF_HOME)
print("HF_HUB_CACHE:", constants.HF_HUB_CACHE)

HF_HOME: /projects/sciences/computing/sheju347/.cache/huggingface
HF_HUB_CACHE: /projects/sciences/computing/sheju347/.cache/huggingface/hub


In [2]:

class TrainingData:
    def __init__(self, context: str, question: str, is_correct: bool):
        self.context = context
        self.question = question
        self.is_correct = is_correct


In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
model_name = "google-bert/bert-base-uncased"
# model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import re
import os

QUESTION_COUNT = 10178

# log_file = "9-20-train_150_30_1st_H100.txt"
log_file = "9-20-train_150_30_2nd_H100.txt"

path = "../../notebooks/"

file_name = path + log_file

training_data_list = []

with open(file_name) as f:
    is_reading_context = False
    context = ""

    is_reading_question = False
    question = ""

    for line in f:
        # if line == "\n":
        #     continue

        stripped_line = line.strip()
        if stripped_line == "Context:":
            is_reading_context = True
        elif stripped_line == "Question:":
            is_reading_context = False
            is_reading_question = True
        elif stripped_line.startswith("[output]"):
            is_correct = stripped_line[len("[output]"):] == "True"
            context = context.strip()
            question = question.strip()
            data = TrainingData(context, question, is_correct)
            # print("aaa", context, question, is_correct)

            training_data_list.append(data)

            # if len(training_data_list) % 100 == 0:
            #     print(f"finished processing {len(training_data_list)} data")

            # clear
            is_reading_context = False
            context = ""
            is_reading_question = False
            question = ""
            # break
        else:
            if is_reading_context:
                context += line
            elif is_reading_question:
                question += line

print(len(training_data_list))

In [4]:
filename = "./data/cfimdb-train.txt"

training_data_list = []

with open(filename, 'r') as fp:
    for line in fp:
        label, org_sent = line.split(' ||| ')
        # sent = org_sent.lower().strip()
        # tokens = tokenizer.tokenize("[CLS] " + sent + " [SEP]")
        label = int(label.strip())
        # if label not in num_labels:
        #     num_labels[label] = len(num_labels)
        data = TrainingData(org_sent, "", label)
        training_data_list.append(data)

print(training_data_list[6].context, training_data_list[6].is_correct)

""" I never actually thought that a film could be so great , but alas I was wrong . Great acting , great plot , fun effects . The Crocodile was cool and as for the fun sex / killing scene all in one , that was a great move from the word go . It was truly shocking and that is a compliment ! How can someone make this film , watch it back and then actually say "" "" Yeah , my favorite movie . People will watch that "" "" If you have n't seen it I beg you watch it . """
 1


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Suppose training_data_list is your list of TrainingData objects
data = pd.DataFrame([{
    "context": d.context,
    "question": d.question,
    "label": int(d.is_correct)   # True->1, False->0
} for d in training_data_list])

# Train/val/test split (e.g., 70/15/15)
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42, stratify=data["label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

print(len(train_df), len(val_df), len(test_df))
# print(test_df.iloc[0]["context"])
# print(test_df.iloc[0]["question"])
print(test_df.iloc[1]["label"])

1194 256 257
1


In [6]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context = self.data.loc[idx, "context"]
        question = self.data.loc[idx, "question"]
        label = self.data.loc[idx, "label"]

        encoding = self.tokenizer(
            context,
            #question,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [7]:
train_dataset = QADataset(train_df, tokenizer)
val_dataset = QADataset(val_df, tokenizer)
test_dataset = QADataset(test_df, tokenizer)

In [8]:
print(train_dataset[1]["input_ids"].tolist())
# print(train_dataset[1]["labels"])

[101, 2013, 2054, 1045, 1005, 2310, 3191, 1037, 2843, 1997, 2111, 2020, 9364, 2011, 2023, 2143, 1010, 4102, 2000, 2112, 1015, 1012, 3322, 1045, 2071, 1050, 1005, 1056, 3305, 2023, 2021, 2044, 1037, 2978, 1997, 2245, 1045, 2228, 2027, 2024, 2157, 2000, 2022, 1012, 2061, 4063, 4059, 2232, 4247, 2010, 2755, 2241, 4129, 1997, 18178, 1005, 1055, 2166, 2008, 2002, 2318, 1999, 2112, 1015, 1012, 2112, 1015, 2409, 1037, 2466, 1997, 1037, 4329, 3048, 2013, 4895, 25013, 16508, 2000, 2019, 4821, 3144, 7091, 1012, 2112, 1016, 4136, 1037, 2466, 1997, 1037, 4329, 2008, 5829, 2013, 4895, 25013, 16508, 2000, 1037, 3294, 7736, 7091, 1012, 2009, 2003, 2025, 2061, 4063, 4059, 2232, 1005, 1055, 6346, 2008, 2122, 1016, 3033, 1997, 18178, 1005, 1055, 2166, 2018, 3294, 2367, 13105, 1012, 2002, 29116, 15867, 2000, 2425, 2119, 1999, 1037, 7199, 19647, 2126, 1012, 1996, 13972, 2089, 2514, 1037, 2843, 2488, 2746, 2041, 1997, 1996, 5988, 2044, 2112, 1015, 2084, 2112, 1016, 2021, 2008, 2003, 1996, 4507, 1997, 18178

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [10]:

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.346,0.209577,0.957031,0.957529
2,0.0943,0.244038,0.953125,0.953125
3,0.0572,0.27341,0.960938,0.96124


TrainOutput(global_step=1791, training_loss=0.14060368756220235, metrics={'train_runtime': 87.2137, 'train_samples_per_second': 41.072, 'train_steps_per_second': 20.536, 'total_flos': 942463800299520.0, 'train_loss': 0.14060368756220235, 'epoch': 3.0})

In [11]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
# model_name = "google-bert/bert-base-uncased"
# model_name = "allenai/longformer-base-4096"
model_name = "./results/checkpoint-3562"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [12]:
preds = trainer.predict(test_dataset)
print(preds)
y_pred = preds.predictions.argmax(-1)
y_true = preds.label_ids

from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

PredictionOutput(predictions=array([[-4.0941863 ,  4.026944  ],
       [-4.136982  ,  4.05463   ],
       [-4.0876756 ,  4.017905  ],
       [-4.1084943 ,  3.9993894 ],
       [-4.129156  ,  4.0948544 ],
       [ 4.521436  , -4.6694183 ],
       [-4.136835  ,  4.047822  ],
       [ 4.548204  , -4.7400475 ],
       [-4.092423  ,  4.0306506 ],
       [-4.025205  ,  3.9671757 ],
       [ 4.5302463 , -4.7546487 ],
       [ 4.509764  , -4.6941876 ],
       [-4.0689826 ,  4.035645  ],
       [-4.0128574 ,  3.8946562 ],
       [-4.0726314 ,  3.956131  ],
       [ 4.5017285 , -4.6144614 ],
       [ 4.509204  , -4.67687   ],
       [-4.141014  ,  4.02516   ],
       [-0.2300345 ,  0.33628598],
       [-4.0313773 ,  3.9832835 ],
       [ 4.5410514 , -4.653341  ],
       [ 4.4994125 , -4.5782127 ],
       [-4.1176777 ,  4.0297937 ],
       [-4.106482  ,  4.041486  ],
       [ 4.5662856 , -4.6570344 ],
       [ 4.497233  , -4.7210975 ],
       [-4.1407313 ,  4.0535064 ],
       [-4.035175  ,  3.95