In [1]:
from urllib.request import urlretrieve
from pathlib import Path
import pandas as pd
import json
from scipy.optimize import minimize

from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import TrainingArguments
#from main import finetune, eval, preprocess_function, calc_entropy_loss
from sys import argv as args
import os

from datasets import DatasetDict, ClassLabel

2023-05-11 11:58:24.547472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Human-readable name of the dataset used throughout this notebook.
dataset_name = 'CSAbstruct'
# NOTE(review): not referenced anywhere else in the visible notebook; looks like a
# leftover from the commented-out `from main import ...` line. Confirm before removing.
function_names = ['eval', 'finetune', 'download', 'calc_entropy_loss']
# Split names. Each one is used as the stem of the remote `<split>.jsonl` file,
# the stem of the local `<split>.csv` file, and the key in the returned DatasetDict.
dataset_types = ["train", "dev", "test"]
# Directory that holds the converted CSV files. File paths are built by plain string
# concatenation, so the trailing slash is required.
CSAbstruct_data_path = '../data/CSAbstruct/'

In [3]:
def download_data():
    """Download the CSAbstruct splits from GitHub and store each one as a CSV file.

    For every split in ``dataset_types`` the remote ``<split>.jsonl`` file is
    fetched, every JSON record is expanded into one ``{"text", "label"}`` row
    per sentence (via ``parse_json``), and the rows are written to
    ``CSAbstruct_data_path + <split> + '.csv'``.

    The first parsed record of each split is printed as a quick sanity check.
    """
    # Imported locally so this cell does not depend on editing the import cell.
    from urllib.request import urlcleanup

    #https://github.com/allenai/sequential_sentence_classification/blob/master/data/CSAbstruct/dev.jsonl
    CSABSTRUCT_DATA_BASE_URL = (
        "https://raw.githubusercontent.com/allenai/sequential_sentence_classification/master/data/CSAbstruct/")

    # `DataFrame.to_csv` does not create missing directories, so make sure the
    # target directory exists before the first write.
    Path(CSAbstruct_data_path).mkdir(parents=True, exist_ok=True)

    for dataset_type in dataset_types:
        print(f"Downloading {dataset_type} data...")
        file_name = CSAbstruct_data_path + dataset_type + '.csv'

        try:
            temp_file, _headers = urlretrieve(
                CSABSTRUCT_DATA_BASE_URL + dataset_type + '.jsonl')
            lines = Path(temp_file).read_text("utf-8").strip().splitlines()
        finally:
            # Remove the temporary file that `urlretrieve` leaves behind.
            urlcleanup()

        # Parse every line exactly once; blank lines are skipped because they
        # are not valid JSON documents.
        records = [parse_line_json(line) for line in lines if line.strip()]

        # Preview of the first record of the split.
        if records:
            print(records[0])

        # One output row per sentence; `None` records (a literal JSON `null`
        # line) are ignored.
        rows = [
            row
            for record in records
            if record is not None
            for row in parse_json(record)
        ]

        # Naming the columns keeps the CSV header intact even for an empty split.
        df = pd.DataFrame(rows, columns=["text", "label"])

        df.to_csv(file_name, index=False)
        print("Saved at:", file_name)

In [4]:
def load_data() -> DatasetDict:
    """Load the CSAbstruct splits from their CSV files with one shared label encoding.

    If a split's CSV file is missing, ``download_data()`` is called to fetch and
    convert the data first.

    The ``label`` column of every split is cast to the *same* ``ClassLabel``
    feature. Building a separate ``ClassLabel`` from each split's own
    ``unique('label')`` result lets the integer id of a label differ between
    train, dev and test whenever the splits report their labels in a different
    order; a model trained on one split would then be scored against
    differently-encoded targets.

    The shared label list starts with the labels of the first split in
    ``dataset_types`` (in the order ``unique`` returns them) and appends any
    label that only shows up in a later split, so the encoding of the first
    split is the same as it was with per-split casting.

    Returns:
        DatasetDict: one dataset per entry of ``dataset_types``
        (``train``, ``dev``, ``test``), each with features ``text`` and ``label``.
    """
    # Load every split first; labels stay plain strings at this point.
    raw_splits = {}
    for dataset_type in dataset_types:
        file_name = CSAbstruct_data_path + dataset_type + '.csv'  # file
        if not Path(file_name).exists():
            print(
                f'{dataset_type} data is not available. Tried to find at:', file_name)
            download_data()

        # "train" is the default split name when a dataset is loaded from a CSV file
        raw_splits[dataset_type] = load_dataset("csv", data_files=file_name)['train']

    # Ordered union of the label names over all splits.
    label_names = []
    for split in raw_splits.values():
        for name in split.unique('label'):
            if name not in label_names:
                label_names.append(name)
    label_feature = ClassLabel(names=label_names)

    # dataset_dict - split name as key, dataset with the shared label encoding as value
    dataset_dict = DatasetDict()
    for dataset_type, split in raw_splits.items():
        dataset_dict[dataset_type] = split.cast_column('label', label_feature)

    return dataset_dict

In [5]:
def parse_line_json(line: str):
    """Decode a single JSON-lines record and return the resulting Python object."""
    return json.loads(line)

def parse_json(line:dict):
    """Expand one abstract record into a list of per-sentence rows.

    Each entry of ``line['sentences']`` is paired with the entry of
    ``line['labels']`` at the same position and returned as a
    ``{"text": ..., "label": ...}`` dict. Labels are looked up by index, so a
    ``labels`` list shorter than ``sentences`` raises ``IndexError``.
    """
    return [
        {"text": sentence, "label": line['labels'][position]}
        for position, sentence in enumerate(line['sentences'])
    ]

In [6]:
# download dataset
download_data()

Downloading train data...
{'abstract_id': 0, 'sentences': ['Gamification has the potential to improve the quality of learning by better engaging students with learning activities.', 'Our objective in this study is to evaluate a gamified learning activity along the dimensions of learning, engagement, and enjoyment.', 'The activity made use of a gamified multiple choice quiz implemented as a software tool and was trialled in three undergraduate IT-related courses.', 'A questionnaire survey was used to collect data to gauge levels of learning, engagement, and enjoyment.', 'Results show that there was some degree of engagement and enjoyment.', 'The majority of participants (77.63 per cent) reported that they were engaged enough to want to complete the quiz and 46.05 per cent stated they were happy while playing the quiz.', 'In terms of learning, the overall results were positive since 60.53 per cent of students stated that it enhanced their learning effectiveness.', 'A limitation of the wo

In [7]:
# load dataset
dataset = load_data()

Downloading and preparing dataset csv/default to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-df1d9ea5bb82d2c3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-df1d9ea5bb82d2c3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/11333 [00:00<?, ? examples/s]

Downloading and preparing dataset csv/default to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-c309920c02222ace/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-c309920c02222ace/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/2026 [00:00<?, ? examples/s]

Downloading and preparing dataset csv/default to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-e72d8c185594823c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /work/pi_adrozdov_umass_edu/snatesan_umass_edu/hf_cache/datasets/csv/default-e72d8c185594823c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/1349 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11333
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 2026
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1349
    })
})

In [9]:
train_data, valid_data = dataset['train'], dataset['dev']

In [10]:
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 11333
})

In [11]:
# Label names of the training split and the two lookup tables derived from them:
# name -> integer id and integer id -> name.
labels = train_data.features["label"].names
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

In [12]:
label2id

{'background': 0, 'objective': 1, 'method': 2, 'result': 3, 'other': 4}

In [13]:
# NOTE: adjust `L_Model`, `checkpoints_out_dir` and `device` to your own setup.
# Pretrained model identifier and the directory the Trainer writes its checkpoints to.
# The test-set evaluation cell further down hard-codes a checkpoint path inside this
# directory, so keep the two in sync when changing it.
L_Model = "allenai/scibert_scivocab_uncased"
checkpoints_out_dir = "../checkpoints9/csabstract"
# Device the model is moved to; 'cuda:0' requires a CUDA-capable GPU.
device = 'cuda:0'

In [14]:
# Setting hyperparameters
hyper_params = {
    "seed" : 40,
    "learning_rate" : 1e-6,
    "per_device_train_batch_size" : 5,
    "per_device_eval_batch_size" : 5,
    "num_train_epochs" : 10, 
    "weight_decay" : 0.0001,
    "test_batch_size": 16 }

# hyper_params_X = list(hyper_params.values())
# hyper_params_types = [int, float, int, int, int, float, int]

In [15]:
# import torch
# import random
# torch.manual_seed(hyper_params['seed'])
# random.seed(hyper_params['seed'])

In [16]:
# preprocessing: 
# convert text --> ids
def preprocess_function(examples, max_length=512):
    """Tokenize the ``text`` field of a batch of examples.

    ``truncation=True`` alone has no effect for this checkpoint: the tokenizer
    has no predefined maximum length, so it warns and falls back to no
    truncation. Passing ``max_length`` explicitly makes truncation actually
    happen, so over-long sentences cannot exceed the model's input size.

    Args:
        examples: batch with a ``"text"`` entry, as passed by ``Dataset.map``.
        max_length: maximum number of tokens kept per example. The default of
            512 is the usual input limit of BERT-base models such as SciBERT.

    Returns:
        The tokenizer output for the batch (token ids plus the accompanying
        masks produced by the tokenizer).
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [17]:
# Initialise tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(L_Model)

In [18]:
# Tokenize train and validation dataset
train_data = train_data.map(preprocess_function, batched=True)
valid_data = valid_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/11333 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2026 [00:00<?, ? examples/s]

In [19]:
# data collator to form a batch from list of training dataset
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
# Evaluate metrics
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn an evaluation output into an accuracy score.

    ``eval_pred`` unpacks into per-class scores and reference label ids. The
    class with the highest score along axis 1 is taken as the prediction and
    compared against the references with the module-level ``accuracy`` metric.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [21]:
# Define model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    L_Model, num_labels=len(labels), id2label=id2label, label2id=label2id, return_dict=True)

# load the model into GPU
model = model.to(device)

comet_ml is installed but `COMET_API_KEY` is not set.
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

In [22]:
 model.device

device(type='cuda', index=0)

In [23]:
# Training configuration. All tunable values come from `hyper_params`, except the
# gradient accumulation factor and the scheduler type, which are fixed here.
training_args = TrainingArguments(
    output_dir=checkpoints_out_dir,
    learning_rate=hyper_params['learning_rate'],
    per_device_train_batch_size=hyper_params['per_device_train_batch_size'],
    per_device_eval_batch_size=hyper_params['per_device_eval_batch_size'],
    num_train_epochs=hyper_params['num_train_epochs'],
    # Gradients are accumulated over 4 batches before each optimizer step, i.e. the
    # effective train batch size is 4 * per_device_train_batch_size per device.
    gradient_accumulation_steps = 4,
    weight_decay=hyper_params['weight_decay'],
    lr_scheduler_type="cosine",
    # Evaluate and checkpoint once per epoch; the two strategies are kept identical
    # here, alongside `load_best_model_at_end`.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    seed=hyper_params['seed'],
    # NOTE(review): no `metric_for_best_model` is given, so the "best" checkpoint is
    # presumably chosen by validation loss rather than accuracy - confirm this is intended.
    load_best_model_at_end=True
)

# Trainer wiring: model, tokenized train/validation splits, dynamic-padding collator
# and the accuracy callback defined above.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = valid_data,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

# Train model (checkpoints are written below `checkpoints_out_dir`)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,1.264,1.486468,0.383514
1,1.0484,1.53248,0.386476
2,0.9957,1.613245,0.382527
4,0.9632,1.613304,0.384995
4,0.9458,1.641645,0.386969
5,0.9247,1.648427,0.389931
6,0.9073,1.653294,0.388944
8,0.9028,1.663152,0.386476
8,0.8947,1.664028,0.385982
9,0.8975,1.663424,0.386969


TrainOutput(global_step=5660, training_loss=0.9660464768696169, metrics={'train_runtime': 1756.7615, 'train_samples_per_second': 64.511, 'train_steps_per_second': 3.222, 'total_flos': 2600202860683218.0, 'train_loss': 0.9660464768696169, 'epoch': 9.99})

In [None]:
# # Evaluate metrics
# import random
# import evaluate
# import numpy as np

# f1 = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis= 1)
#     return f1.compute(predictions=predictions, references=labels, average='micro')

# # def compute_metrics(eval_pred):
# #     predictions, labels = eval_pred
# #     predictions = np.argmax(predictions, axis=1)
    
# #     # calculate validation accuracy
# #     eval_accuracy = accuracy.compute(predictions=predictions, references=labels)
    
# #     # calculate train accuracy
# #     train_predictions, train_labels = trainer.predict(train_data_s)
# #     train_predictions = np.argmax(train_predictions, axis=1)
# #     train_accuracy = accuracy.compute(predictions=train_predictions, references=train_labels)
    
# #     return {"train_accuracy": train_accuracy, "eval_accuracy": eval_accuracy}

In [None]:
# from transformers import Trainer
# Trainer.hyperparameter_search?

In [None]:
# model.device

In [None]:
# from typing import List, Union
# from scipy import optimize
# # Define model
# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# # Setting hyperparameters
# hyper_params = {
#     #"seed" : 200,
#     "learning_rate" : 1e-5,
#     "per_device_train_batch_size" : 5,
#     "per_device_eval_batch_size" : 5,
#     "num_train_epochs" : 10, 
#     "weight_decay" : 0.1,
#     #"test_batch_size": 16,
# }

# # hyper_params_X = list(hyper_params.values())
# # hyper_params_types = [
# #     # int, # For seed
# #     float, 
# #     int, 
# #     int, 
# #     int, 
# #     float, 
# #     int]
# trainer = None
# def loss(x: List[Union[int,float]], *args):
#     """
#     Optimizer that chooses optimal hyperparameters for a model that trains true parameters.
#     <HACK>
#     NOTE NOTE NOTE NOTE NOTE NOTE
#     THE ORDER MUST ABSOLUTELY NOT CHANGE. IF YOU DO, BE SUPER DUPER CAREFUL. 
#     """
#     global trainer
#     trainer = None
#     #new_hyper_params_X = [hyper_params_types[i](i) for i in x]
#     new_learning_rate, new_weight_decay , *none_others = x
    
#     model = AutoModelForSequenceClassification.from_pretrained(
#     L_Model, num_labels=len(labels), id2label=id2label, label2id=label2id, return_dict=True)

#     # load the model into GPU
#     model = model.to(device)

#     # define training arguments
#     training_args = TrainingArguments(
#         output_dir=checkpoints_out_dir,
#         learning_rate=new_learning_rate, #hyper_params['learning_rate'],
#         per_device_train_batch_size=hyper_params['per_device_train_batch_size'],
#         per_device_eval_batch_size=hyper_params['per_device_eval_batch_size'],
#         num_train_epochs=hyper_params['num_train_epochs'],
#         weight_decay= new_weight_decay, #hyper_params['weight_decay'],
#         #seed=hyper_params['seed'],
#         evaluation_strategy="epoch",
#         load_best_model_at_end=True,
#         #optim="adamw_torch",
#         #eval_steps=100,  # evaluate every 100 steps
#         save_strategy="epoch",
#         #logging_steps=100,  # log train accuracy every 100 steps
#     )

#     # define trainer
#     trainer = Trainer(
#         model = model,
#         args = training_args,
#         train_dataset = train_data,
#         eval_dataset = valid_data,
#         tokenizer = tokenizer,
#         data_collator = data_collator,
#         compute_metrics = compute_metrics,
#     )
    
#     """
#     self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")"
#     {‘train_loss’: 0.7159061431884766, ‘train_accuracy’: 0.4, ‘train_f1’: 0.5714285714285715, ‘train_runtime’: 6.2973, ‘train_samples_per_second’: 2.382, ‘train_steps_per_second’: 0.159, ‘epoch’: 1.0}
# {‘eval_loss’: 0.8529007434844971, ‘eval_accuracy’: 0.0, ‘eval_f1’: 0.0, ‘eval_runtime’: 2.0739, ‘eval_samples_per_second’: 0.964, ‘eval_steps_per_second’: 0.482, ‘epoch’: 1.0}
    
#     """
#     trainer.train()
#     trainset_vals = trainer.evaluate(eval_dataset=train_data, metric_key_prefix='train')
#     validation_vals = trainer.evaluate(eval_dataset=valid_data, metric_key_prefix='valid')
#     tloss, tacc = trainset_vals['train_loss'], trainset_vals['train_accuracy']
#     vloss, vacc = validation_vals['valid_loss'], validation_vals['valid_accuracy']
    
#     #loss_vac =  (0.3*tloss+0.7*vloss) + (-0.15*tacc - 0.85*vacc)
#     loss_vac =  (-0.15*tacc - 0.85*vacc)
#     print("################################")
#     print(f"{new_learning_rate},{new_weight_decay}: {loss_vac:5.3f}, {tacc}, {vacc}")
#     return loss_vac
    
# best_learning_rate = optimize.minimize(loss,x0=[1e-5, 0.1],bounds=[(5e-7,5e-3), (0,0.1)],method='L-BFGS-B')
# #best_learning_rate = optimize.minimize(loss,x0=[1e-4, 0.1],method='CG', options={'maxiter':40})

# # # Train model
# # trainer.hyperparameter_search()
    
# # # Train model
# # trainer.hyperparameter_search()


    

### Evaluation on test_set

In [24]:
test_data = dataset['test']

In [25]:
test_data[1]

{'text': 'Traditional approaches to multi-label image classification learn independent classifiers for each category and employ ranking or thresholding on the classification results.',
 'label': 0}

In [26]:
len(test_data)

1349

In [37]:
# Predict the test split with a fine-tuned checkpoint
from transformers import pipeline
from sklearn.metrics import classification_report

# pipeline
pipeline_task = 'text-classification'

# device
device = 'cuda:0'

# model: checkpoint directory written by the training run above
checkpoints_dir = '../checkpoints9/csabstract/checkpoint-5660'

classifier = pipeline(pipeline_task, model=checkpoints_dir, device=device)

# Make predictions on the testing dataset, using the batch size configured in
# `hyper_params` instead of a second hard-coded copy of the value.
predictions = classifier(test_data['text'], batch_size=hyper_params['test_batch_size'])

# Predicted label names, as reported by the pipeline
predicted_labels = [p['label'] for p in predictions]

# Decode the reference ids with the test split's own ClassLabel feature. The
# model's `id2label` describes the encoding of the data the model was trained on,
# which is only guaranteed to match the test split if both were cast with the
# same label order.
test_label_feature = test_data.features['label']
true_labels = [test_label_feature.int2str(label_id) for label_id in test_data['label']]

In [38]:
# calculate f1 score for each label and accuracy
from sklearn.metrics import classification_report

report = classification_report(true_labels, predicted_labels, output_dict=True)

# Besides one entry per label, the report has three summary entries:
# 1. accuracy 2. macro avg 3. weighted avg
macro_avg_f1_score = report['macro avg']['f1-score']
weighted_avg_f1_score = report['weighted avg']['f1-score']

# Deliberately NOT stored in a variable called `accuracy`: that name is bound to the
# `evaluate` metric object used inside `compute_metrics`, and overwriting it with a
# float would break any later evaluation or training run in the same kernel.
test_accuracy = report['accuracy']

print('Macro Average F1 score: {:.2f}'.format(macro_avg_f1_score))
print('Weighted Average F1 score: {:.2f}'.format(weighted_avg_f1_score))
print('Accuracy: {:.2f}%'.format(test_accuracy * 100))

Macro Average F1 score: 0.38
Weighted Average F1 score: 0.39
Accuracy: 37.73%


In [39]:
import pandas as pd

# Output file for the per-label test metrics
predictions_out_dir = '../predictions/csabstruct_test.csv'

# One row per report entry, one column per metric
df = pd.DataFrame(report).transpose()

# Keep only the per-label rows. The summary rows are dropped by name rather than
# by position, so the result does not depend on how many of them exist or where
# they sit in the report.
df = df.drop(index=['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
df = df.reset_index().rename(columns={'index': 'label'})

# Add the model's integer id for each label right after the label name
df.insert(
    df.columns.get_loc('label') + 1,
    'label_index',
    [classifier.model.config.label2id[label_name] for label_name in df['label']],
)

# Worst-performing labels first
df_sorted = df.sort_values(by='f1-score')

# `to_csv` does not create missing directories
Path(predictions_out_dir).parent.mkdir(parents=True, exist_ok=True)
df_sorted.to_csv(predictions_out_dir, index = False)