In [1]:
import os
import sys

import json

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import random

from sklearn.feature_extraction.text import CountVectorizer

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments     # https://huggingface.co/transformers/v3.0.2/main_classes/trainer.html#transformers.TFTrainingArguments
from transformers import Trainer    # https://huggingface.co/transformers/v3.0.2/main_classes/trainer
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline

from datasets import load_metric
from datasets import load_dataset

import time
import requests
import wandb

# Config file
from transformer_config import Configuration
from transformer_config import CONSTANTS as C

In [2]:

# Module-level state shared by the cells below
val_dataset_global = None           # placeholder; not assigned anywhere else in the visible cells
METRIC = load_metric("accuracy")    # accuracy metric object used by compute_metrics

  METRIC = load_metric("accuracy")    # metric to use


In [3]:
# Relative locations of the data on disk (both end with a trailing slash)
DATA_FOLDER = 'data/'
TWITTER_FOLDER = f'{DATA_FOLDER}twitter-datasets/'

In [4]:
def compute_metrics(eval_pred):
    '''Score the predictions of the model on the evaluation dataset.

    eval_pred unpacks into (logits, labels). The predicted class of each sample
    is the index of its largest logit; the value returned is the result of
    METRIC.compute(...) on those predicted classes and the labels.

    Note: to predict only neg/pos with a model that also has a neutral class,
    the neutral entry of the logits would have to be removed before this step.
    '''
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return METRIC.compute(predictions=predicted_classes, references=references)

In [5]:
# DATASET CLASSES
class TrainDataset(Dataset):
    '''Labelled PyTorch Dataset: tokenizer encodings plus one label per sample.

    encodings: mapping from field name to an indexable collection of per-sample values.
    labels: tensor of labels; it is stored converted with .long().
    '''
    def __init__(self, encodings, labels):
        self.labels = labels.long()
        self.encodings = encodings

    def __len__(self):
        # one sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        '''Return sample idx as a dict of tensors, with its label under the key 'labels'.'''
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = self.labels[idx].clone().detach()
        return sample

In [6]:
class TestDataset(Dataset):
    '''Unlabelled PyTorch Dataset wrapping tokenizer encodings only.

    encodings: mapping from field name to an indexable collection of per-sample
    values; it must contain the key "input_ids", which defines the length.
    '''
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # the number of samples is the number of "input_ids" entries
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        '''Return sample idx as a dict mapping each field name to a tensor.'''
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

In [7]:
# Read the raw test tweets from disk
def load_test_data():
    '''Read project_path + "test_data.txt" and return the tweets as a list of strings.

    Each line of the file starts with "n," where n is the line number; only the
    text after the first comma is kept, with trailing whitespace removed.
    '''
    filename = project_path + "test_data.txt"
    with open(filename, 'r', encoding='utf-8') as f:
        # partition(",") -> (text before, ",", text after); index 2 is the tweet itself
        return [row.partition(",")[2].rstrip() for row in f]

In [8]:
def get_test_data(tweets):
    '''Tokenize raw tweets and wrap the result in a TestDataset.

    tweets: the tweets, still as plain text (e.g. the output of load_test_data()).

    The module-level tokenizer is called with padding="max_length" and
    truncation=True, using config.tokenizer_max_length as max_length (set on
    the command line or left at its default).

    Returns the TestDataset built from the tokenizer output.
    '''
    print(f'{len(tweets)} tweets loaded for testing.\n')
    encodings = tokenizer(
        tweets,
        max_length=config.tokenizer_max_length,
        padding="max_length",
        truncation=True,
    )
    return TestDataset(encodings)

In [9]:
def train(model, train_dataset, val_dataset):
    '''Helper function to start the training of a model.

    Takes as input:
    - model: a huggingface model
    - train_dataset: the dataset used for the training
    - val_dataset: the dataset used for the validation

    Creates a Trainer from fixed defaults plus the values of the global config
    (batch sizes, learning rate, fp16, seed, weight decay, number of epochs).
    Checkpoints are written to checkpoints_path; training resumes from there
    only when that folder already contains a "checkpoint-*" entry.

    Returns nothing: the model passed in is trained in place, and the model held
    by the trainer at the end is saved under experiments_results_path + "/best_model/".
    '''

    training_args = TrainingArguments(output_dir=checkpoints_path,
                                      overwrite_output_dir=True,
                                      per_device_train_batch_size=config.bs_train,
                                      per_device_eval_batch_size=config.bs_eval,
                                      learning_rate=config.lr,
                                      evaluation_strategy="steps",            # "steps"
                                      save_strategy="steps",
                                      gradient_accumulation_steps=4,
                                      # gradient_checkpointing=True,
                                      save_total_limit=2,
                                      fp16=config.fp16,
                                      seed=config.seed,
                                      warmup_steps=500,                       # number of warmup steps for learning rate scheduler
                                      weight_decay=config.weight_decay,       # strength of weight decay
                                      logging_dir='./logs',                   # directory for storing logs
                                      logging_steps=500,
                                      load_best_model_at_end=True,
                                      num_train_epochs=config.n_epochs,
                                      report_to="wandb" # WANDB INTEGRATION
                                      )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Resume only if the output folder exists AND holds at least one checkpoint.
    # The folder may not exist yet, and a non-empty folder without any
    # "checkpoint-*" entry cannot be resumed from.
    has_checkpoint = os.path.isdir(checkpoints_path) and any(
        entry.startswith("checkpoint-") for entry in os.listdir(checkpoints_path)
    )

    if has_checkpoint:
        # resume_from_checkpoint is the first positional parameter of
        # Trainer.train, so it must be given once only; True means
        # "use the last checkpoint found in output_dir".
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()

    best_model_at_end_path = "/best_model/"
    trainer.save_model(experiments_results_path + best_model_at_end_path)    # save the best model of the current iteration

In [10]:
def load_model_from_checkpoint(path_to_checkpoint):
    '''Load a sequence-classification model from a saved checkpoint.

    path_to_checkpoint: path of the checkpoint relative to experiment_path
    (i.e. starting from the "experiment-[...]" folder).

    Returns the model loaded with config.num_labels output labels.
    '''
    checkpoint_dir = experiment_path + path_to_checkpoint
    loaded = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_dir,
        num_labels=config.num_labels,
        local_files_only=False,
        ignore_mismatched_sizes=True,
    )
    print(f"Loaded model from: {checkpoint_dir}")

    return loaded

In [11]:
def numpy_2d_softmax(model_preds):
    '''Turn the raw predictions of a HuggingFace model into probabilities.

    Applies a softmax along axis 1 (one row per sample, one column per class).
    A 1-D input is treated as a single row of scores and normalised along its
    only axis.

    model_preds: array-like of raw scores.
    Returns a numpy array of the same shape whose values along the class axis
    are non-negative and sum to 1.
    '''
    scores = np.asarray(model_preds)
    class_axis = 1 if scores.ndim > 1 else 0

    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing.
    shifted = scores - np.max(scores, axis=class_axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=class_axis, keepdims=True)

What is `logits`?
Despite the name, `logits` here holds the softmax-normalised class probabilities predicted by the model, not the raw logits. They are saved because, when combining different models, averaging these probabilities yields better results than averaging the rounded predictions.

In [12]:
def _logits_file_prefix(model_name, load_model):
    '''Build the prefix of the logits file name from the model name or the loaded checkpoint path.'''
    prefix = "NoModelNameGiven"

    if model_name is not None:
        parts = model_name.split("/")
        # "org/name" -> "name"; a name without any slash is used as is
        prefix = parts[1] if len(parts) > 1 else parts[0]

    if load_model is not None:
        # e.g. "experiment-Thu_Jul_28_03h29m56s/checkpoints/checkpoint-14500" -> "Thu_Jul_28_03h29m56s"
        pieces = load_model.split("experiment-")
        tail = pieces[1] if len(pieces) > 1 else pieces[0]
        # Accept both "\" and "/" as separators so the prefix never contains a path separator.
        prefix = tail.replace("\\", "/").split("/")[0]

    return prefix


def predict(model):
    '''Run the trained model on the test data and produce the predictions.

    model: the trained model, wrapped in a default Trainer for inference.

    Also writes "<prefix>-logits.txt" into test_results_path; it contains the
    softmax of the raw predictions and is used for ensembling the models. The
    prefix comes from config.load_model when set, otherwise from config.model_name.

    Returns a numpy array with the predicted class index of every test tweet.
    '''

    test_trainer = Trainer(model)
    tweets = get_test_data(load_test_data())
    raw_preds, _, _ = test_trainer.predict(tweets)     # only predictions to return, no label ids, no metrics; see HF Trainer doc
    Y_test_pred = np.argmax(raw_preds, axis=1)

    # store the softmax of the raw predictions in a file
    logits = numpy_2d_softmax(raw_preds)
    print(len(logits))
    print(logits)

    model_name_for_logits = _logits_file_prefix(config.model_name, config.load_model)

    # NOTE(review): the header names exactly two columns - confirm config.num_labels == 2.
    np.savetxt(test_results_path + model_name_for_logits + "-" + 'logits.txt', logits, delimiter=",", header="negative,positive", comments="")

    return Y_test_pred

In [13]:
def generate_submission(Y_preds):
    '''Write the model predictions to a csv file formatted for the kaggle competition.

    Y_preds: the predicted class of every test sample, in order.

    The file has the header "Id,Prediction"; ids start at 1, and a predicted
    class of 0 is written as -1 while any other class is written as 1.
    The file is saved in test_results_path.

    Returns the name of the file that was written.
    '''
    sample_count = len(Y_preds)
    ids = np.arange(1, sample_count + 1)
    signs = np.where(np.asarray(Y_preds) == 0, -1, 1)
    table = np.column_stack((ids, signs))

    final_filename = f"{experiment_date_for_folder_name}-submission.csv"
    np.savetxt(
        test_results_path + final_filename,
        table,
        fmt="%1d",
        delimiter=",",
        header="Id,Prediction",
        comments="",
    )

    return final_filename

### Prints for debug only ?

In [14]:
# TRAINING
def run_training(model):
    '''Start a wandb run, then train the model through load_and_train.

    model: the model to train; it is passed unchanged to load_and_train.

    The wandb project is named "ML-epfl-<model_name>", with "/" turned into "-"
    and the characters "\\", "?", "%" and ":" removed.
    Any exception raised during training is printed and then re-raised.

    Returns the value returned by load_and_train (the trained model).
    '''
    name_cleanup = str.maketrans({"/": "-", "\\": None, "?": None, "%": None, ":": None})
    wandb_project_name = f"ML-epfl-{model_name}".translate(name_cleanup)
    wandb.init(project=wandb_project_name)

    try:
        print("Starting load and train")
        trained_model = load_and_train(model)
        # torch.cuda.empty_cache() could be called here if the trained model is saved before and reloaded after
    except Exception as e:
        print("GOT ERROR:", str(e))
        raise e

    print("Finished Training without problem.\n")

    return trained_model

In [15]:
# TESTING
def run_prediction(model):
    '''Entry point called from main to test a trained model.

    model: the trained model.

    Runs predict(model) and hands the predictions to generate_submission.
    Any exception is printed and then re-raised.

    Returns the name of the submission csv file that was written.
    '''
    try:
        submit_filename = generate_submission(predict(model))
    except Exception as e:
        print("GOT ERROR:", str(e))
        raise e

    return submit_filename

In [16]:
def load_and_train(model):
    '''Load the dataset, tokenize it and train the model on it.

    model: the HF model to train; when None, a fresh model is created from
    config.model_name with config.num_labels labels.

    The dataset comes from the loading script "./HF_dataset.py". Its "text"
    field is tokenized with the module-level tokenizer, and its "train" and
    "validation" splits are handed to train().

    Returns the trained model.
    '''
    if model is None:
        model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name,
            num_labels=config.num_labels,
            local_files_only=False,
            ignore_mismatched_sizes=True,
        )

    def tokenization(sample):
        # applied to batches of samples (batched=True below)
        return tokenizer(sample["text"], max_length=config.tokenizer_max_length, padding="max_length", truncation=True)

    tokenized = load_dataset("./HF_dataset.py").map(tokenization, batched=True)

    # TRAINING
    train(model, tokenized["train"], tokenized["validation"])

    return model

In [17]:
if __name__ == "__main__":
    # Script entry point: parse the configuration, prepare the experiment folders,
    # build (or reload) the model, then optionally train and/or predict.
    # NOTE(review): the output saved with this cell shows Configuration.parse_cmd()
    # exiting with "ambiguous option: --f" when executed inside the Jupyter kernel,
    # so this cell presumably only works when the code is run as a script - confirm.

    torch.cuda.empty_cache()

    # To time the duration of the experiment
    time_run = time.time()     # better to use perf_counter() than time()

    # Get the config
    config = Configuration.parse_cmd()

    # Prepare the folder where this experiment (i.e., program run) outputs and results will be saved
    experiment_id = int(time_run)
    experiment_date = time.ctime(experiment_id)
    print("CURRENT DATE TIME: ", experiment_date)
    # Turns e.g. "Thu Jul 28 03:29:56 2022" into "Thu_Jul_28_03h29m56s"
    experiment_date_name = experiment_date.replace(" ", "_").replace(":", "h")[:-8] + experiment_date[-8:-5].replace(":", "m") + "s"
    experiment_date_for_folder_name = "experiment-" + experiment_date_name

    # Prepare and set the paths
    if config.on_cluster:
        print("\nRunning on the cluster.")
        project_path = os.environ["CIL_PROJECT_PATH"]   # see cluster .bashrc file for the environment variables
        experiment_path = os.environ["CIL_EXPERIMENTS_PATH"] + "Experiments/"   # see cluster .bashrc file for the environment variables
    else:
        print("\nRunning locally.")
        project_path = "./"
        experiment_path = "./" + "Experiments/"

    experiments_results_path = experiment_path + experiment_date_for_folder_name
    os.makedirs(experiments_results_path, exist_ok=True)    # create the experiment folder(s) needed
    checkpoints_path = experiments_results_path + "/checkpoints/"
    print("The project path is: ", project_path)
    print("The experiment path is: ", experiment_path)
    print("The model checkpoints will be saved at: ", checkpoints_path, "\n")

    # for the submission
    test_results_path = experiments_results_path + "/test_results/"
    os.makedirs(test_results_path, exist_ok=True)    # create the folder(s) if needed


    # Fix seeds for reproducibility
    SEED = config.seed
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # NOTE(review): these cuDNN settings presumably favour speed over exact
    # run-to-run reproducibility - confirm this is intended next to the fixed seeds.
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True # DEBUG: comment this when debugging.

    # Save in the experiment folder the command line that was used to run this program
    cmd = sys.argv[0] + ' ' + ' '.join(sys.argv[1:])
    with open(os.path.join(experiments_results_path, 'cmd.txt'), 'w') as f:
        f.write(cmd)

    # Data

    model_name = config.model_name
    use_most_freq_words = config.freq_words
    train_val_ratio = config.train_val_ratio

    discord_enabled = config.discord

    # Model
    n_epochs = config.n_epochs
    bs_train = config.bs_train
    bs_eval = config.bs_eval
    lr = config.lr
    fp16 = config.fp16
    weight_decay = config.weight_decay

    # Build the tokenizer and the model exactly once: either fresh from
    # config.model_name, or from a checkpoint of a previous experiment.
    # Loading the base model first and discarding it afterwards is avoided.
    if config.load_model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        # Create the model
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=config.num_labels, local_files_only=False, ignore_mismatched_sizes=True)
    else:
        # The config.json of the checkpoint records which architecture it was trained from
        with open(experiment_path + config.load_model + "/config.json", 'r') as json_file:
            json_dict = json.load(json_file)

        model_name = json_dict["_name_or_path"]
        print("Using a checkpoint from the model architecture: ", model_name)

        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = load_model_from_checkpoint(config.load_model)   # load_model should be (from a previous exp) e.g.: experiment-Thu_Jul_28_03h29m56s/checkpoints/checkpoint-14500

    model.to(C.DEVICE)  # automatic if use the Trainer()
    print("\nRunning on", C.DEVICE, " with PyTorch", torch.__version__, "\n")

    # --- TRAINING & VALIDATION ---
    if config.train:
        model = run_training(model)

    # --- TESTING ---
    if config.test:
        submit_filename = run_prediction(model)

    # Time that took the whole experiment to run
    time_run = time.time() - time_run
    print(f"The program took {str(time_run/60/60)[:6]} Hours or {str(time_run/60)[:6]} minutes to run.")

usage: ipykernel_launcher.py [-h] [--tag TAG] [--seed SEED] [--on_cluster]
                             [--autosubmit] [--discord]
                             [--load_model LOAD_MODEL] [--test] [--train]
                             [--num_labels NUM_LABELS] [--full_data]
                             [--amount_of_data AMOUNT_OF_DATA]
                             [--amount_per_it AMOUNT_PER_IT]
                             [--start_at_it START_AT_IT]
                             [--use_HF_dataset_format] [--freq_words]
                             [--tokenizer_max_length TOKENIZER_MAX_LENGTH]
                             [--model_name MODEL_NAME]
                             [--train_val_ratio TRAIN_VAL_RATIO] [--lr LR]
                             [--n_epochs N_EPOCHS]
                             [--weight_decay WEIGHT_DECAY]
                             [--bs_train BS_TRAIN] [--bs_eval BS_EVAL]
                             [--fp16]
ipykernel_launcher.py: error: ambiguous option: --f

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
