# Fine-tuning Galactica for classification

In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
  Fi

In [1]:
from transformers import AutoTokenizer
import numpy as np
import os
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead, OPTForSequenceClassification, set_seed
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset

In [2]:
model_params_size = "125m"

In [3]:
import random
seed = 42
set_seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

no_deprecation_warning=True

In [4]:
from transformers import AutoConfig

# Download configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained(f"facebook/galactica-{model_params_size}")
print(config)

OPTConfig {
  "_name_or_path": "facebook/galactica-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "learned_embeddings": true,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "scale_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.29.1",
  "use_cache": true,
  "vocab_size": 50000,
  "word_embed_proj_dim": 768
}



In [5]:
tokenizer = AutoTokenizer.from_pretrained(f"facebook/galactica-{model_params_size}", pad_token = "<pad>", eos_token = "</s>")
tokenizer

PreTrainedTokenizerFast(name_or_path='facebook/galactica-125m', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

here we set the pad token to eos token and make sure that padding and truncation is left sided. Also, we limit the input size to 2048.

In [6]:
tokenizer.pad_token_id = config.eos_token_id
print(tokenizer.special_tokens_map)

{'eos_token': '</s>', 'pad_token': '</s>'}


In [7]:
# restrict to only 2048 tokens input.
tokenizer.model_max_length = 2048
# tokenizer.model_max_length = 2560

# pad left since decoder only architecture and the last token is used for classification, unlike bert.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

get model and move to cuda.

In [8]:
def load_model(path = f"facebook/galactica-{model_params_size}"):
  model = OPTForSequenceClassification.from_pretrained(path)
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(device)
  model.config.pad_token_id = model.config.eos_token_id
  model.resize_token_embeddings(len(tokenizer))
  model = model.to(device)
  return model

In [9]:
model = load_model()

Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


# Data loading and prep

In [11]:
!ls drive/MyDrive

 colab	'Colab Notebooks'   detect-gpt2  'TUM XAI'


In [13]:
!ls ../data

 archive			        classifier_input_restricted_train.csv
 classifier_input_restricted.csv        data_nlp2022
 classifier_input_restricted_test.csv  'stats and cleaning.ipynb'


In [14]:
DATA_PATH = os.path.join("..", "data")

In [15]:
train_df = pd.read_csv(os.path.join(DATA_PATH, "classifier_input_restricted_train.csv"))
test_df = pd.read_csv(os.path.join(DATA_PATH, "classifier_input_restricted_test.csv"))

In [16]:
gpt3_test = pd.read_csv(os.path.join(DATA_PATH, "data_nlp2022", "gpt3curienlp2022_restricted_test.csv"))

In [17]:
real2022nlp_df = pd.read_csv(os.path.join(DATA_PATH, "data_nlp2022", "realnlp2022_restricted_4000.csv"))

In [None]:
train_df[train_df.duplicated()]

Unnamed: 0,id,year,title,abstract,introduction,conclusion,categories,src,label


In [18]:
def transform_df(df, sep = None, start_of_text = None, end_of_text = None) -> pd.DataFrame:
  """
    Transforms a pandas DataFrame by concatenating the "abstract", "introduction", and "conclusion" columns into a new "text" column, with optional separators and text to prepend and append to the "text" column.

    Args:
        df (pandas.DataFrame): The input DataFrame to transform.
        sep (str, optional): A string to separate the text from the label in the output DataFrame. Defaults to None.
        start_of_text (str, optional): A string to prepend to the beginning of the text. Defaults to None.
        end_of_text (str, optional): A string to append to the end of the text. Defaults to None.

    Returns:
        pandas.DataFrame: The transformed DataFrame with a "text" column containing the concatenated text, and optionally a "label" and "src" column.
  """
  end_of_text = end_of_text if end_of_text else ""
  start_of_text = start_of_text if start_of_text else ""
  # Create text
  df["text"] = start_of_text \
    + "Abstract:\n\n" + df["abstract"] \
    + "\n\nIntroduction:\n\n" + df["introduction"] \
    + "\n\nConclusion:\n\n" + df["conclusion"] \
    + end_of_text
  if (sep):
    df["text"] = df["text"] + sep + df["label"].astype(str) + end_of_text
    return df[["text"]]

  return df[["text", "label", "src"]]

In [19]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def tokenize_func(examples):
  return tokenizer(examples["text"], padding= True, truncation=True)

def get_tokenized_dataset(df, sep = None, start_of_text = None, end_of_text = None, split = True, transformed = False):
  """
    This function takes a pandas DataFrame df and preprocesses it by performing the following steps:
    If transformed is False, it applies the transform_df() function to the df DataFrame to tokenize and preprocess the text data.
    If split is True, it splits the preprocessed data into train and validation sets using a 80-20 split.
    It creates a Hugging Face Dataset object for the train and validation sets using the from_pandas() function.
    It tokenizes the text data using the tokenize_func() function.
    It returns the tokenized train and validation sets as a tuple (tokenized_train, tokenized_val).

  Args:
      df: A pandas DataFrame containing text data.
      sep: The separator to use when tokenizing the text data. Default is None.
      start_of_text: The token to use at the start of each text sequence. Default is None.
      end_of_text: The token to use at the end of each text sequence. Default is None.
      split: A boolean indicating whether or not to split the preprocessed data into train and validation sets. Default is True.
      transformed: A boolean indicating whether or not the df DataFrame has already been preprocessed. Default is False.
      Returns:

  tokenized_train: A Hugging Face Dataset object containing the tokenized train data.
  tokenized_val: A Hugging Face Dataset object containing the tokenized validation data. If split is False, this value is None.
  """
  transformed_df = df
  if (not transformed):
    transformed_df = transform_df(df, sep, start_of_text, end_of_text)
  train_texts = transformed_df
  tokenized_val = None
  if (split):
    train_texts, val_texts = train_test_split(train_texts, test_size=.2, random_state=seed)
    val_texts = val_texts[~val_texts["text"].isna()]
    val_ds = Dataset.from_pandas(val_texts, split="test")
    tokenized_val = val_ds.map(tokenize_func, batched=True)
  # clean NA
  train_texts = train_texts[~train_texts["text"].isna()]
  train_ds = Dataset.from_pandas(train_texts, split="train")
  # tokenize.
  tokenized_train = train_ds.map(tokenize_func, batched=True)
  return tokenized_train, tokenized_val

split data into train and val

In [None]:
tokenized_train, tokenized_val = get_tokenized_dataset(train_df)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

In [None]:
tokenized_train

Dataset({
    features: ['text', 'label', 'src', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12800
})

In [None]:
tokenized_val

Dataset({
    features: ['text', 'label', 'src', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3200
})

In [27]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


### setting up wandb for logging

In [28]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.23.0-py2.py3-none-any.whl (205 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.1/205.1 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

set wandb api key to save to the cloud

In [20]:
import wandb
os.environ["WANDB_API_KEY"] = ""
wandb.login()
%env WANDB_PROJECT=galactica_paper_classifier

# to disable wandb
# os.environ["WANDB_DISABLED"] = "false"

[34m[1mwandb[0m: Currently logged in as: [33mmohamed-heshamse[0m ([33mxai-artificial-papers[0m). Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=galactica_paper_classifier


create metrices function to be passed to the trainer later.

In [21]:
import evaluate
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    output_dict = {}
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    output_dict.update(acc_metric.compute(predictions=predictions, references=labels))
    output_dict.update(f1_metric.compute(predictions=predictions, references=labels))
    output_dict.update(recall_metric.compute(predictions=predictions, references=labels))
    output_dict.update(precision_metric.compute(predictions=predictions, references=labels))
    return output_dict

# Training

Here we train our main model on our training dataset without removing or adding any other datasets. This model will be later saved and called for eval under the variable `model_exper`.

In [None]:
# start a new run
wandb.init(project = "galactica_paper_classifier")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12800
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 6400
  Number of trainable parameters = 125031936
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.3749,0.208862,0.949063,0.951761,0.992593,0.914156
2000,0.1766,0.185385,0.956562,0.958495,0.990741,0.928282
3000,0.1181,0.097938,0.980625,0.981098,0.99321,0.969277
4000,0.0635,0.085129,0.983125,0.983333,0.983333,0.983333
5000,0.0797,0.13125,0.979375,0.97989,0.992593,0.967509
6000,0.0177,0.123608,0.9825,0.982885,0.992593,0.973366


The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3200
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3200
  Batch size = 2
Saving 

TrainOutput(global_step=6400, training_loss=0.13145559638738633, metrics={'train_runtime': 5535.88, 'train_samples_per_second': 9.249, 'train_steps_per_second': 1.156, 'total_flos': 5.35135745212416e+16, 'train_loss': 0.13145559638738633, 'epoch': 4.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3200
  Batch size = 2


{'eval_loss': 0.08512917160987854,
 'eval_accuracy': 0.983125,
 'eval_f1': 0.9833333333333333,
 'eval_recall': 0.9833333333333333,
 'eval_precision': 0.9833333333333333,
 'eval_runtime': 109.7964,
 'eval_samples_per_second': 29.145,
 'eval_steps_per_second': 14.572,
 'epoch': 4.0}

In [None]:
# wandb analysis and testing
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.031959…

0,1
eval/accuracy,▁▃▇█▇██
eval/f1,▁▂██▇██
eval/loss,█▇▂▁▄▃▁
eval/precision,▁▂▇█▆▇█
eval/recall,█▆█▁██▁
eval/runtime,▁█▇█▇██
eval/samples_per_second,█▁▂▁▁▁▁
eval/steps_per_second,█▁▂▁▁▁▁
train/epoch,▁▁▂▂▄▄▅▅▆▆▇▇██
train/global_step,▁▁▂▂▄▄▅▅▆▆▇▇██

0,1
eval/accuracy,0.98313
eval/f1,0.98333
eval/loss,0.08513
eval/precision,0.98333
eval/recall,0.98333
eval/runtime,109.7964
eval/samples_per_second,29.145
eval/steps_per_second,14.572
train/epoch,4.0
train/global_step,6400.0


In [None]:
# save model manually
# model_best = OPTForSequenceClassification.from_pretrained(trainer.model)
# model_path = "./results/82/"
# trainer.model.save_pretrained(model_path)       # save the model
# tokenizer.save_pretrained(model_path)

Configuration saved in ./results/82/config.json
Model weights saved in ./results/82/pytorch_model.bin
tokenizer config file saved in ./results/82/tokenizer_config.json
Special tokens file saved in ./results/82/special_tokens_map.json


('./results/82/tokenizer_config.json',
 './results/82/special_tokens_map.json',
 './results/82/tokenizer.json')

# eval and Experiments

In [22]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [23]:
from scipy.special import softmax
def compute_metrics_eval(eval_pred):
    output_dict = {}
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    wrongly_classified = np.where(predictions != labels)[0]
    output_dict.update(acc_metric.compute(predictions=predictions, references=labels))
    output_dict.update(f1_metric.compute(predictions=predictions, references=labels))
    output_dict.update(recall_metric.compute(predictions=predictions, references=labels))
    output_dict.update(precision_metric.compute(predictions=predictions, references=labels))
    output_dict.update({"wrongly_classified": wrongly_classified})
    output_dict.update({"softmax_probs": softmax(logits, axis = -1)})
    return output_dict

## Functions used for eval.

In [24]:
def get_falsely_classified(index_arr, tokenized_dataset):
  """
  Given an index array and a tokenized dataset, returns a DataFrame containing the texts, labels, and predicted labels of the dataset at the specified indices. If the dataset contains source data, it is also included in the returned DataFrame.
    
    Parameters:
    index_arr (List[int]): A list of indices corresponding to the elements of the tokenized dataset to include in the returned DataFrame.
    tokenized_dataset (datasets.Dataset): A tokenized dataset containing text data, label data, and optionally source data.
    
    Returns:
    pandas.DataFrame: A DataFrame containing the texts, labels, predicted labels, and optionally source data of the dataset at the specified indices.
  """
  texts = tokenized_dataset[index_arr]['text']
  label = tokenized_dataset[index_arr]['label']
  src = tokenized_dataset[index_arr].get('src', [])
  preds = [1 if lbl == 0 else 0 for lbl in label]
  df = pd.DataFrame({'text': texts, 'label': label, 'prediction': preds, 'src': src})
  return df

In [25]:
def eval_dataset(df, model, tokenized = False, transformed = False, get_probs_df = True):
  """
  Evaluates a model on a given dataset and returns information about the evaluation. If the dataset is not already tokenized, it will be tokenized automatically. If the dataset contains source data, it will be included in the returned DataFrame.
    
    Parameters:
    df (pandas.DataFrame): The dataset to evaluate the model on. The dataset should have columns "text" and "label", and optionally "src" if source data is present.
    model (transformers.Trainer): The model to evaluate the dataset on.
    tokenized (bool, optional): If True, assumes the dataset is already tokenized. Defaults to False.
    transformed (bool, optional): If True, assumes the dataset is already transformed. Defaults to False.
    get_probs_df (bool, optional): If True, returns a DataFrame containing the softmax probabilities of each label for each text. Defaults to True.
    
    Returns:
    Tuple[pandas.DataFrame, dict, pandas.DataFrame]: A tuple containing three elements:
        1. A DataFrame containing the texts, labels, and predicted labels of the elements in the dataset that were wrongly classified by the model.
        2. A dictionary containing information about the evaluation, including the loss and accuracy of the model on the dataset.
        3. A DataFrame containing the texts, labels, predicted labels, and optionally source data and softmax probabilities of the entire dataset.
  """
  if (not tokenized):
    tokenized_df, _ = get_tokenized_dataset(df, transformed = transformed, split = False)
  else:
    tokenized_df = df
  training_args_best = TrainingArguments(per_device_eval_batch_size=2,  output_dir="./output_eval")
  trainer_eval = Trainer(model=model, args=training_args_best, compute_metrics=compute_metrics_eval)
  output = trainer_eval.evaluate(tokenized_df)

  wrongly_classified = output["eval_wrongly_classified"]
  wrongly_classified_df = get_falsely_classified(wrongly_classified, tokenized_df)
  display(wrongly_classified_df)

  df_complete = pd.DataFrame({"text":tokenized_df["text"], "label":tokenized_df["label"], "src": tokenized_df["src"]})
  softmax_arr = np.array(output["eval_softmax_probs"])
  df_complete['prediction'] = softmax_arr.argmax(-1)

  if (get_probs_df):
    softmax_probs = pd.DataFrame(softmax_arr)
    df_complete = pd.concat([df_complete, softmax_probs], axis = 1)
  
  print(output)

  return wrongly_classified_df, output, df_complete

## eval on the test dataset

In [None]:
# load the model trained on TRAIN dataset.
model_exper = OPTForSequenceClassification.from_pretrained("results/82-trainedOnNewData")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model_exper.config.pad_token_id = model_exper.config.eos_token_id
model_exper.resize_token_embeddings(len(tokenizer))
model_exper = model_exper.to(device)

# eval the model on TEST dataset.
class_table, output_dict, df_test_complete = eval_dataset(df = test_df, model = model_exper)
# class_table, output_dict, df_test_complete = eval_dataset(df = test_df, model = trainer.model)

loading configuration file results/82-trainedOnNewData/config.json
Model config OPTConfig {
  "_name_or_path": "facebook/galactica-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "OPTForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "learned_embeddings": true,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 2,
  "problem_type": "single_label_classification",
  "scale_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50000,
  "word_embed_proj_dim": 768
}

loading weights file results/82-trainedOnNewData/pytor

cuda


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 2


Trainer is attempting to log a value of "[  20   80  162  213  275  286  425  443  489  596  654  756  776  827
  841  849  850  974 1101 1131 1167 1181 1326 1370 1409 1425 1756 1794
 1815 1818 1834 2052 2057 2071 2089 2100 2164 2182 2212 2282 2394 2462
 2637 2809 2820 2861 2886 3010 3105 3133 3158 3185 3191 3274 3612 3653
 3872 3883 3918 4024 4075 4092 4290 4299 4310 4347 4436 4564 4588 4615
 4647 4711 4729 4822 4906 4912 4918 4939 4974 4990 5017 5051 5097 5098
 5156 5341 5413 5421 5425 5438 5517 5537 5582 5713 6130 6172 6227 6229
 6240 6402 6449 6527 6573 6583 6638 6693 6696 6747 6750 6764 6790 6833
 6836 6888 6906 6925 7028 7058 7106 7122 7163 7178 7279 7286 7323 7458
 7672 7846 7893 7902 7903]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[4.9271455e-07 9.9999952e-01]
 [4.1542555e-07 9.9999964e-01]
 [7.2485335e-0

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe Cygnus Loop is a well-studied...,0,1,real
1,Abstract:\n\nIn this paper we review studies o...,0,1,real
2,Abstract:\n\nThe leptonic W boson production a...,1,0,galactica
3,Abstract:\n\nWe investigate the potential for ...,0,1,real
4,Abstract:\n\nThe growth and composition of Ear...,0,1,real
...,...,...,...,...
126,Abstract:\n\nThe physical solutions of Lagrang...,0,1,real
127,Abstract:\n\nThe purpose of this note is to pr...,1,0,galactica
128,"Abstract:\n\nIn this paper, I examine what I r...",0,1,real
129,Abstract:\n\nThe Digital Ludeme Project (DLP) ...,0,1,real


{'eval_loss': 0.09899298846721649, 'eval_accuracy': 0.983625, 'eval_f1': 0.9836597230884372, 'eval_recall': 0.98575, 'eval_precision': 0.9815782922579039, 'eval_wrongly_classified': array([  20,   80,  162,  213,  275,  286,  425,  443,  489,  596,  654,
        756,  776,  827,  841,  849,  850,  974, 1101, 1131, 1167, 1181,
       1326, 1370, 1409, 1425, 1756, 1794, 1815, 1818, 1834, 2052, 2057,
       2071, 2089, 2100, 2164, 2182, 2212, 2282, 2394, 2462, 2637, 2809,
       2820, 2861, 2886, 3010, 3105, 3133, 3158, 3185, 3191, 3274, 3612,
       3653, 3872, 3883, 3918, 4024, 4075, 4092, 4290, 4299, 4310, 4347,
       4436, 4564, 4588, 4615, 4647, 4711, 4729, 4822, 4906, 4912, 4918,
       4939, 4974, 4990, 5017, 5051, 5097, 5098, 5156, 5341, 5413, 5421,
       5425, 5438, 5517, 5537, 5582, 5713, 6130, 6172, 6227, 6229, 6240,
       6402, 6449, 6527, 6573, 6583, 6638, 6693, 6696, 6747, 6750, 6764,
       6790, 6833, 6836, 6888, 6906, 6925, 7028, 7058, 7106, 7122, 7163,
       7178, 72

In [None]:
df_test_complete.to_csv("galactica_complete_preds.csv")

## Robustness check

Does the model have similar results when presented with real data coming from a different pdf parser?

In [None]:
model_exper = load_model("results/82-trainedOnNewData")

loading configuration file results/82-trainedOnNewData/config.json
Model config OPTConfig {
  "_name_or_path": "facebook/galactica-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "OPTForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "learned_embeddings": true,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 2,
  "problem_type": "single_label_classification",
  "scale_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50000,
  "word_embed_proj_dim": 768
}

loading weights file results/82-trainedOnNewData/pytor

cuda


In [None]:
class_table_real, output_dict_real, df_real_pred = eval_dataset(df = real2022nlp_df, model = model_exper)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 2


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[   6   12   22   87   92  105  111  140  184  214  219  225  279  290
  365  376  416  423  441  445  460  534  565  596  599  627  656  657
  692  726  781  784  791  826  833  853  898  965  966 1016 1080 1130
 1153 1173 1188 1200 1265 1283 1313 1370 1380 1431 1434 1477 1484 1515
 1525 1536 1559 1560 1574 1590 1605 1611 1625 1649 1665 1674 1680 1685
 1692 1702 1728 1795 1841 1848 1860 1875 1900 1912 1995 2015 2081 2140
 2172 2175 2188 2194 2230 2233 2268 2280 2354 2364 2370 2448 2546 2549
 2621 2646 2648 2670 2675 2679 2700 2745 2748 2752 2785 2794 2797 2803
 2836 2849 2851 2885 2897 2910 2918 2919 2925 2942 2985 2987 3016 3033
 3066 3108 3109 3156 3170 3171 3179 3197 3199 3226 3235 3247 3291 3300
 3326 3329 3351 3470 3473 3544 3558 3561 3587 3614 3651 3678 3701 3757
 3764 3766 3772 3793 3806 3825 3844 3914 3917 3930 3943]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified"

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe family of all k-independent s...,0,1,real2022nlp
1,Abstract:\n\nThe OGS for non-abelian groups is...,0,1,real2022nlp
2,Abstract:\n\nThis work is concerned with the d...,0,1,real2022nlp
3,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
4,Abstract:\n\nNeural Architecture Search (NAS) ...,0,1,real2022nlp
...,...,...,...,...
160,"Abstract:\n\nIn this paper, we study multi-blo...",0,1,real2022nlp
161,Abstract:\n\nHigher genus modular graph tensor...,0,1,real2022nlp
162,Abstract:\n\nWe consider the long-time behavio...,0,1,real2022nlp
163,Abstract:\n\nWe study the GIT quotient of the ...,0,1,real2022nlp


{'eval_loss': 0.23886463046073914, 'eval_accuracy': 0.95875, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([   6,   12,   22,   87,   92,  105,  111,  140,  184,  214,  219,
        225,  279,  290,  365,  376,  416,  423,  441,  445,  460,  534,
        565,  596,  599,  627,  656,  657,  692,  726,  781,  784,  791,
        826,  833,  853,  898,  965,  966, 1016, 1080, 1130, 1153, 1173,
       1188, 1200, 1265, 1283, 1313, 1370, 1380, 1431, 1434, 1477, 1484,
       1515, 1525, 1536, 1559, 1560, 1574, 1590, 1605, 1611, 1625, 1649,
       1665, 1674, 1680, 1685, 1692, 1702, 1728, 1795, 1841, 1848, 1860,
       1875, 1900, 1912, 1995, 2015, 2081, 2140, 2172, 2175, 2188, 2194,
       2230, 2233, 2268, 2280, 2354, 2364, 2370, 2448, 2546, 2549, 2621,
       2646, 2648, 2670, 2675, 2679, 2700, 2745, 2748, 2752, 2785, 2794,
       2797, 2803, 2836, 2849, 2851, 2885, 2897, 2910, 2918, 2919, 2925,
       2942, 2985, 2987, 3016, 3033, 3066, 3108, 3

## GPT-3 out of distribution

Using the GPT3 dataset from lab of 2022 NLP to asses how good the model is in classifying out of domain generators. We also train the model using few more examples from GPT3 to see if there is any improvement.

In [None]:
# unvomment when using the already trained model.
# model_exper = load_model("results/85-TrainedOnGPT3")

cuda


how well does the model perform on OOD GPT3?

In [None]:
class_table_gpt3, output_dict_gpt3, df_gpt3_pred = eval_dataset(df = gpt3_test, model = model_exper)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, text, token_type_ids. If __index_level_0__, src, text, token_type_ids are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 2


Trainer is attempting to log a value of "[  0   1   2   3   4   6   8  10  11  12  13  14  15  16  18  19  21  24
  25  26  28  29  31  32  33  34  35  36  37  38  39  40  41  42  44  45
  46  47  48  49  50  51  52  53  56  58  59  60  61  62  64  65  66  67
  68  69  71  72  74  75  78  80  81  82  83  85  86  89  91  92  93  94
  96  97  98  99 101 104 106 107 108 109 110 112 113 114 115 116 118 119
 121 123 124 125 126 127 128 130 131 133 134 137 138 140 142 143 144 145
 146 147 148 149 150 151 153 155 156 157 158 159 160 161 162 163 165 166
 167 168 169 170 171 174 175 176 177 178 179 180 181 182 183 184 185 186
 187 189 191 193 194 196 197 198 199 200 201 202 203 204 205 207 208 209
 211 212 213 214 215 217 219 220 221 222 223 224 225 226 227 230 231 232
 233 237 238 239 240 241 242 244 246 248 249 250 251 252 253 254 255 256
 257 258 260 261 264 265 267 268 269 270 272 273 275 276 277 278 279 281
 282 283 286 288 289 290 291 293 294 296 297 298 299 301 302 303 304 305
 306 309 3

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nWe propose a method for self-supe...,1,0,gpt32022nlp
1,"Abstract:\n\nIn this work, we propose a new gr...",1,0,gpt32022nlp
2,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we introduce Solo-...",1,0,gpt32022nlp
4,Abstract:\n\nWe address the problem of scalabl...,1,0,gpt32022nlp
...,...,...,...,...
749,Abstract:\n\nConventional image-text represent...,1,0,gpt32022nlp
750,"Abstract:\n\nIn this paper, we present a novel...",1,0,gpt32022nlp
751,Abstract:\n\nWe consider the problem of learni...,1,0,gpt32022nlp
752,"Abstract:\n\nIn this work, we propose to impro...",1,0,gpt32022nlp


{'eval_loss': 7.657734394073486, 'eval_accuracy': 0.246, 'eval_f1': 0.39486356340288925, 'eval_recall': 0.246, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  0,   1,   2,   3,   4,   6,   8,  10,  11,  12,  13,  14,  15,
        16,  18,  19,  21,  24,  25,  26,  28,  29,  31,  32,  33,  34,
        35,  36,  37,  38,  39,  40,  41,  42,  44,  45,  46,  47,  48,
        49,  50,  51,  52,  53,  56,  58,  59,  60,  61,  62,  64,  65,
        66,  67,  68,  69,  71,  72,  74,  75,  78,  80,  81,  82,  83,
        85,  86,  89,  91,  92,  93,  94,  96,  97,  98,  99, 101, 104,
       106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 118, 119, 121,
       123, 124, 125, 126, 127, 128, 130, 131, 133, 134, 137, 138, 140,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 153, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170,
       171, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
       186, 187, 189, 191, 193, 194, 196, 197, 

In [None]:
gpt3_train = pd.read_csv(os.path.join(DATA_PATH, "data_nlp2022", "gpt3curienlp2022_restricted_train.csv"))
traingpt3_df = pd.concat([train_df, gpt3_train])
tokenized_train_traingpt3, tokenized_test_traingpt3 = get_tokenized_dataset(traingpt3_df)

Map:   0%|          | 0/3440 [00:00<?, ? examples/s]

Map:   0%|          | 0/13760 [00:00<?, ? examples/s]

In [None]:
model_gpt3 = load_model(f"facebook/galactica-{model_params_size}")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--galactica-125m/snapshots/97dd5f1c6cc30b3fc393e4cd638a0ecda921de89/config.json
Model config OPTConfig {
  "_name_or_path": "/content/mini",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "learned_embeddings": true,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "scale_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50000,
  "word_embed_proj_dim": 768
}

loading weights 

cuda


In [None]:
os.environ["WANDB_DISABLED"] = "false"
wandb.init(project = "galactica_paper_classifier")
training_args_gpt3 = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )


trainer_gpt3 = Trainer(
    model=model_gpt3,
    args=training_args_gpt3,
    train_dataset=tokenized_train_traingpt3,
    eval_dataset=tokenized_test_traingpt3,
    compute_metrics=compute_metrics
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,███▇██▁
eval/f1,███▇█▇▁
eval/loss,▁▁▁▁▁▁█
eval/precision,▄▆▇▁▆▇█
eval/recall,█████▇▁
eval/runtime,▂▂▂▂▂█▁
eval/samples_per_second,▃▃▃▃▃▁█
eval/steps_per_second,▃▃▃▃▃▁█
train/epoch,▁▁▃▃▄▄▆▆▇▇█
train/global_step,▂▂▃▃▅▅▆▆▇▇█▁▁

0,1
eval/accuracy,0.428
eval/f1,0.59944
eval/loss,4.67965
eval/precision,1.0
eval/recall,0.428
eval/runtime,15.8892
eval/samples_per_second,62.936
eval/steps_per_second,31.468
train/epoch,4.0
train/global_step,0.0


PyTorch: setting up devices
Using cuda_amp half precision backend


In [None]:
trainer_gpt3.train()

The following columns in the training set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13760
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 6880
  Number of trainable parameters = 125031936
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.3964,0.505373,0.885465,0.90409,0.989345,0.832362
2000,0.2157,0.128974,0.971512,0.974386,0.993074,0.956388
3000,0.1402,0.077286,0.984302,0.985707,0.992009,0.979484
4000,0.0831,0.075553,0.986628,0.987831,0.994672,0.981083
5000,0.065,0.067622,0.986337,0.987523,0.990943,0.984127
6000,0.0224,0.13697,0.979942,0.981875,0.995738,0.968394


The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3440
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3440
  Batch size = 2
Saving 

TrainOutput(global_step=6880, training_loss=0.13646381331044574, metrics={'train_runtime': 5825.6643, 'train_samples_per_second': 9.448, 'train_steps_per_second': 1.181, 'total_flos': 5.752709261033472e+16, 'train_loss': 0.13646381331044574, 'epoch': 4.0})

how well does the model perform on GPT-3 when trained on a few GPT-3 data?

In [None]:
class_table_gpt3, output_dict_gpt3, df_gpt3_pred = eval_dataset(df = gpt3_test, model = trainer_gpt3.model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 2


Trainer is attempting to log a value of "[  2  10  11  15  19  36  39  40  44  47  51  52  53  60  61  63  67  68
  69  71  78  83  85  86  91  92  96  97  98 104 107 108 118 123 124 125
 130 133 137 138 147 148 151 153 154 158 159 160 162 166 167 168 169 171
 175 176 177 181 182 183 189 193 194 196 198 211 215 219 225 237 238 241
 242 244 246 248 257 260 266 268 271 275 277 278 282 283 289 290 293 294
 298 309 310 313 320 325 327 328 330 333 337 338 342 345 346 348 355 356
 359 360 362 363 365 367 368 370 375 376 379 385 387 393 395 396 398 403
 404 409 411 413 423 428 430 432 438 439 442 443 446 450 456 460 465 466
 468 469 476 479 483 487 489 490 491 492 495 498 500 501 503 506 508 510
 517 518 524 530 532 534 536 537 544 547 549 552 553 558 560 562 568 573
 574 577 580 585 587 597 600 602 604 606 615 616 618 619 624 625 627 630
 631 635 637 639 642 648 650 654 656 658 665 672 673 675 688 691 698 701
 705 707 711 712 714 721 724 730 732 734 735 740 742 747 749 750 752 760
 762 768 7

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
1,Abstract:\n\nWe propose a novel pre-training m...,1,0,gpt32022nlp
2,Abstract:\n\nUnsupervised image-to-image trans...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we propose a metho...",1,0,gpt32022nlp
4,Abstract:\n\nWearable devices such as fitness ...,1,0,gpt32022nlp
...,...,...,...,...
294,Abstract:\n\nWe propose a novel framework for ...,1,0,gpt32022nlp
295,Abstract:\n\nThe ability to caption images is ...,1,0,gpt32022nlp
296,Abstract:\n\nWe propose an ADAM-style algorith...,1,0,gpt32022nlp
297,Abstract:\n\nWe develop a theory that explains...,1,0,gpt32022nlp


{'eval_loss': 1.4823449850082397, 'eval_accuracy': 0.701, 'eval_f1': 0.8242210464432685, 'eval_recall': 0.701, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  2,  10,  11,  15,  19,  36,  39,  40,  44,  47,  51,  52,  53,
        60,  61,  63,  67,  68,  69,  71,  78,  83,  85,  86,  91,  92,
        96,  97,  98, 104, 107, 108, 118, 123, 124, 125, 130, 133, 137,
       138, 147, 148, 151, 153, 154, 158, 159, 160, 162, 166, 167, 168,
       169, 171, 175, 176, 177, 181, 182, 183, 189, 193, 194, 196, 198,
       211, 215, 219, 225, 237, 238, 241, 242, 244, 246, 248, 257, 260,
       266, 268, 271, 275, 277, 278, 282, 283, 289, 290, 293, 294, 298,
       309, 310, 313, 320, 325, 327, 328, 330, 333, 337, 338, 342, 345,
       346, 348, 355, 356, 359, 360, 362, 363, 365, 367, 368, 370, 375,
       376, 379, 385, 387, 393, 395, 396, 398, 403, 404, 409, 411, 413,
       423, 428, 430, 432, 438, 439, 442, 443, 446, 450, 456, 460, 465,
       466, 468, 469, 476, 479, 483, 487, 489, 

other datasets

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = test_df, model = model_exper)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  20   36  162  184  207  211  213  286  310  425  472  478  489  596
  756  837  841  849  974 1101 1305 1326 1370 1468 1616 1756 1794 1815
 1818 1834 1884 2052 2089 2100 2182 2282 2393 2409 2462 2585 2726 2809
 2820 2861 3010 3133 3274 3594 3612 3653 3872 4092 4260 4290 4299 4310
 4436 4564 4588 4647 4687 4711 4822 4912 4932 4990 5017 5095 5098 5151
 5203 5421 5425 5430 5485 5491 5517 5537 5582 5607 5649 5713 6032 6130
 6172 6227 6229 6233 6240 6402 6449 6561 6576 6583 6638 6693 6750 6768
 6790 6833 6888 6897 6906 6925 7028 7033 7060 7122 7178 7236 7279 7323
 7347 7364 7458 7461 7467 7644 7657 7816 7846 7885 7902]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[8.1308411e-07 9.9999917e-01]
 [9.1974368e-07 9.9999905e-01]
 [7.8015063e-07 9.9999917e-01]
 ...
 [8.6416105e-07 9.9

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe Cygnus Loop is a well-studied...,0,1,real
1,Abstract:\n\nWe study the power and energy uti...,1,0,gpt2
2,Abstract:\n\nThe leptonic W boson production a...,1,0,galactica
3,Abstract:\n\nWe study the locally-defined soci...,0,1,real
4,"Abstract:\n\nIn this work, we address the prob...",0,1,real
...,...,...,...,...
118,Abstract:\n\nModern deep learning (DL) archite...,0,1,real
119,"Abstract:\n\nFor years, the quantum/reversible...",0,1,real
120,Abstract:\n\nThe purpose of this note is to pr...,1,0,galactica
121,Abstract:\n\nHough transform (HT) has been the...,0,1,real


{'eval_loss': 0.08680661767721176, 'eval_accuracy': 0.984625, 'eval_f1': 0.9847071988064156, 'eval_recall': 0.99, 'eval_precision': 0.9794706900816226, 'eval_wrongly_classified': array([  20,   36,  162,  184,  207,  211,  213,  286,  310,  425,  472,
        478,  489,  596,  756,  837,  841,  849,  974, 1101, 1305, 1326,
       1370, 1468, 1616, 1756, 1794, 1815, 1818, 1834, 1884, 2052, 2089,
       2100, 2182, 2282, 2393, 2409, 2462, 2585, 2726, 2809, 2820, 2861,
       3010, 3133, 3274, 3594, 3612, 3653, 3872, 4092, 4260, 4290, 4299,
       4310, 4436, 4564, 4588, 4647, 4687, 4711, 4822, 4912, 4932, 4990,
       5017, 5095, 5098, 5151, 5203, 5421, 5425, 5430, 5485, 5491, 5517,
       5537, 5582, 5607, 5649, 5713, 6032, 6130, 6172, 6227, 6229, 6233,
       6240, 6402, 6449, 6561, 6576, 6583, 6638, 6693, 6750, 6768, 6790,
       6833, 6888, 6897, 6906, 6925, 7028, 7033, 7060, 7122, 7178, 7236,
       7279, 7323, 7347, 7364, 7458, 7461, 7467, 7644, 7657, 7816, 7846,
       7885, 7902]

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = real2022nlp_df, model = model_exper)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[  36   50   62   87   92  105  107  111  116  140  150  184  192  205
  214  219  227  237  244  248  261  279  290  326  339  362  365  385
  407  422  431  437  439  445  472  473  474  476  534  536  538  539
  544  546  551  562  565  599  627  630  655  656  657  676  681  692
  693  702  728  772  774  781  784  792  826  875  886  898  920  934
  944  957  965  974 1007 1010 1016 1029 1045 1080 1105 1130 1153 1155
 1173 1187 1188 1200 1205 1215 1231 1252 1255 1265 1279 1283 1306 1313
 1348 1357 1380 1381 1384 1387 1401 1413 1424 1427 1431 1434 1437 1477
 1480 1484 1485 1487 1490 1494 1515 1525 1529 1536 1560 1567 1573 1574
 1600 1606 1611 1625 1630 1674 1702 1717 1728 1751 1762 1763 1773 1792
 1797 1814 1820 1860 1872 1881 1893 1900 1903 1912 1916 1926 1960 1979
 1992 2015 2037 2084 2114 2129 2135 2136 2140 2161 2163 2172 2175 2186
 2211 2222 2223 2233 2267 2270 2280 2298 2299 2303 2

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nLet R be a real closed field. We ...,0,1,real2022nlp
1,Abstract:\n\nThis paper proposes an inverse re...,0,1,real2022nlp
2,Abstract:\n\nA practical shortcoming of deep n...,0,1,real2022nlp
3,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
4,Abstract:\n\nNeural Architecture Search (NAS) ...,0,1,real2022nlp
...,...,...,...,...
311,Abstract:\n\nPulmonary vessel segmentation is ...,0,1,real2022nlp
312,Abstract:\n\n: A coherent reflectometer with a...,0,1,real2022nlp
313,Abstract:\n\n: The cross section of the proces...,0,1,real2022nlp
314,Abstract:\n\nThe recent success of single-agen...,0,1,real2022nlp


{'eval_loss': 0.5516067147254944, 'eval_accuracy': 0.921, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([  36,   50,   62,   87,   92,  105,  107,  111,  116,  140,  150,
        184,  192,  205,  214,  219,  227,  237,  244,  248,  261,  279,
        290,  326,  339,  362,  365,  385,  407,  422,  431,  437,  439,
        445,  472,  473,  474,  476,  534,  536,  538,  539,  544,  546,
        551,  562,  565,  599,  627,  630,  655,  656,  657,  676,  681,
        692,  693,  702,  728,  772,  774,  781,  784,  792,  826,  875,
        886,  898,  920,  934,  944,  957,  965,  974, 1007, 1010, 1016,
       1029, 1045, 1080, 1105, 1130, 1153, 1155, 1173, 1187, 1188, 1200,
       1205, 1215, 1231, 1252, 1255, 1265, 1279, 1283, 1306, 1313, 1348,
       1357, 1380, 1381, 1384, 1387, 1401, 1413, 1424, 1427, 1431, 1434,
       1437, 1477, 1480, 1484, 1485, 1487, 1490, 1494, 1515, 1525, 1529,
       1536, 1560, 1567, 1573, 1574, 1600, 1606, 1611

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = model_exper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  7   9  17  19  23  24  44  62  68  69  75  80  89  99 110 114 130 137
 149 161 165 172 184 199 205 207 210 221 226 242 244 248 255 270 273 278
 302 304 309 313 314 317 321 325 326 331 344 346 356 359 380 395 398 411
 412 416 418 423 449 454 460 465 469 482 500 506 508 511 513 522 523 531
 535 541 545 549 551 565 567 569 570 577 610 621 632 633 643 652 653 654
 656 657 662 676 677 693 697 698 702 716 728 738 739 750 761 769 776 781
 805 807 841 842 846 854 856 863 868 903 911 917 919 927 941 948 959 965
 976 996]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[6.7414639e-06 9.9999321e-01]
 [6.8322523e-03 9.9316782e-01]
 [9.8164906e-05 9.9990177e-01]
 ...
 [9.3493156e-07 9.9999905e-01]
 [2.2115259e-01 7.7884740e-01]
 [1.5974183e-01 8.4025812e-01]]" of type <class 'numpy.ndarra

Unnamed: 0,text,label,prediction,src
0,"Abstract:\n\nIn this note, we explore the conn...",1,0,chatgpt
1,"Abstract:\n\nIn this paper, we study the expon...",1,0,chatgpt
2,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
3,"Abstract:\n\nIn this work, we investigate the ...",1,0,chatgpt
4,Abstract:\n\nUltra-faint dwarf spheroidal gala...,1,0,chatgpt
...,...,...,...,...
123,Abstract:\n\nTidal deformations refer to the d...,1,0,chatgpt
124,Abstract:\n\nThis document presents an introdu...,1,0,chatgpt
125,Abstract:\n\nMixed modulus-anomaly mediated su...,1,0,chatgpt
126,"Abstract:\n\nIn this paper, we investigate the...",1,0,chatgpt


{'eval_loss': 0.5770673155784607, 'eval_accuracy': 0.872, 'eval_f1': 0.9316239316239316, 'eval_recall': 0.872, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  7,   9,  17,  19,  23,  24,  44,  62,  68,  69,  75,  80,  89,
        99, 110, 114, 130, 137, 149, 161, 165, 172, 184, 199, 205, 207,
       210, 221, 226, 242, 244, 248, 255, 270, 273, 278, 302, 304, 309,
       313, 314, 317, 321, 325, 326, 331, 344, 346, 356, 359, 380, 395,
       398, 411, 412, 416, 418, 423, 449, 454, 460, 465, 469, 482, 500,
       506, 508, 511, 513, 522, 523, 531, 535, 541, 545, 549, 551, 565,
       567, 569, 570, 577, 610, 621, 632, 633, 643, 652, 653, 654, 656,
       657, 662, 676, 677, 693, 697, 698, 702, 716, 728, 738, 739, 750,
       761, 769, 776, 781, 805, 807, 841, 842, 846, 854, 856, 863, 868,
       903, 911, 917, 919, 927, 941, 948, 959, 965, 976, 996]), 'eval_softmax_probs': array([[6.7414639e-06, 9.9999321e-01],
       [6.8322523e-03, 9.9316782e-01],
       [9.8164906e-05, 9.99

## Out-of-distribution ChatGPT

In [26]:
# unvomment when using the already trained model.
model_exper = load_model("results/84-noChatGPT")

cuda


first eval how good our trained model is in detecting chatgpt when trained on chatgpt data.

In [None]:
class_table_chatgpt, output_dict_chatgpt, df_chatgpt_pred = eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = model_exper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 2


Trainer is attempting to log a value of "[  1  17  19  24  31  40  44  50  54  58  59  62  69  84  89  99 113 126
 130 131 135 137 141 149 155 157 159 165 172 177 179 192 193 197 199 205
 207 210 218 221 238 242 248 262 264 270 273 274 276 278 281 302 304 309
 311 313 314 318 321 322 331 338 346 347 349 351 359 361 365 377 379 380
 398 402 412 418 430 433 444 449 454 458 459 465 469 475 490 494 500 511
 513 515 521 522 523 534 535 541 545 549 551 569 570 575 579 583 605 615
 621 632 638 641 652 654 656 662 675 677 679 686 689 693 696 697 700 702
 710 716 728 739 750 760 769 770 771 775 786 803 807 815 840 842 846 847
 852 854 863 867 876 903 917 919 923 924 928 932 941 944 949 950 959 962
 972 976 979 996 998]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[6.9631387e-05 9.9993038e-01]
 [9.9338782e-01 6.6121155e-03]
 

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis study investigates the prese...,1,0,chatgpt
1,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
2,"Abstract:\n\nIn this work, we investigate the ...",1,0,chatgpt
3,"Abstract:\n\nIn this paper, we consider noncon...",1,0,chatgpt
4,"Abstract:\n\nIn this paper, we study the probl...",1,0,chatgpt
...,...,...,...,...
162,Abstract:\n\nFerroelectric materials have the ...,1,0,chatgpt
163,"Abstract:\n\nIn this paper, we investigate the...",1,0,chatgpt
164,Abstract:\n\nThe heavy quark potential plays a...,1,0,chatgpt
165,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt


{'eval_loss': 0.8153757452964783, 'eval_accuracy': 0.833, 'eval_f1': 0.9088925259138025, 'eval_recall': 0.833, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  1,  17,  19,  24,  31,  40,  44,  50,  54,  58,  59,  62,  69,
        84,  89,  99, 113, 126, 130, 131, 135, 137, 141, 149, 155, 157,
       159, 165, 172, 177, 179, 192, 193, 197, 199, 205, 207, 210, 218,
       221, 238, 242, 248, 262, 264, 270, 273, 274, 276, 278, 281, 302,
       304, 309, 311, 313, 314, 318, 321, 322, 331, 338, 346, 347, 349,
       351, 359, 361, 365, 377, 379, 380, 398, 402, 412, 418, 430, 433,
       444, 449, 454, 458, 459, 465, 469, 475, 490, 494, 500, 511, 513,
       515, 521, 522, 523, 534, 535, 541, 545, 549, 551, 569, 570, 575,
       579, 583, 605, 615, 621, 632, 638, 641, 652, 654, 656, 662, 675,
       677, 679, 686, 689, 693, 696, 697, 700, 702, 710, 716, 728, 739,
       750, 760, 769, 770, 771, 775, 786, 803, 807, 815, 840, 842, 846,
       847, 852, 854, 863, 867, 876, 903, 917, 

second eval how good our trained model is in detecting chatgpt when not trained on chatgpt data.

In [None]:
model = load_model(f"facebook/galactica-{model_params_size}")

Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [None]:
train_no_chatgpt_df = train_df[train_df["src"] != "chatgpt"]
tokenized_train_no_chatgpt, tokenized_test_no_chatgpt = get_tokenized_dataset(train_no_chatgpt_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

In [None]:
os.environ["WANDB_DISABLED"] = "false"
wandb.init(project = "galactica_paper_classifier")
training_args_no_chatgpt = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )


trainer_no_chatgpt = Trainer(
    model=model,
    args=training_args_no_chatgpt,
    train_dataset=tokenized_train_no_chatgpt,
    eval_dataset=tokenized_test_no_chatgpt,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [None]:
trainer_no_chatgpt.train()

The following columns in the training set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11200
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 5600
  Number of trainable parameters = 125031936
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.3653,0.118822,0.9625,0.957473,0.976053,0.939587
2000,0.1754,0.094588,0.974643,0.970673,0.970273,0.971074
3000,0.1445,0.087436,0.982143,0.979356,0.979356,0.979356
4000,0.0997,0.335977,0.941786,0.936551,0.993394,0.885862
5000,0.0413,0.09865,0.982143,0.979475,0.985136,0.973878


The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2800
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2800
  Batch size = 2
Saving 

TrainOutput(global_step=5600, training_loss=0.15115598031452723, metrics={'train_runtime': 4779.2262, 'train_samples_per_second': 9.374, 'train_steps_per_second': 1.172, 'total_flos': 4.68243777060864e+16, 'train_loss': 0.15115598031452723, 'epoch': 4.0})

How well does the model perform when tested on OOD ChatGPT dataset.

In [None]:
class_table_no_chatgpt, output_dict_no_chatgpt, df_no_chatgpt_pred= eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = trainer_no_chatgpt.model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: __index_level_0__, src, token_type_ids, text. If __index_level_0__, src, token_type_ids, text are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 2


Trainer is attempting to log a value of "[  1   4   7   8   9  10  13  15  17  18  21  22  23  24  29  30  31  32
  34  35  36  41  42  46  49  51  53  59  60  61  62  64  65  67  68  69
  70  71  72  75  79  80  81  83  84  87  89  90  91  92  96  98  99 100
 101 104 105 106 107 109 110 112 114 115 116 117 118 119 120 121 122 124
 125 126 127 131 133 134 136 137 138 139 140 141 142 143 145 146 149 151
 155 157 159 161 162 163 168 172 175 176 177 179 180 182 185 188 190 191
 192 194 197 198 199 201 202 205 206 207 208 209 210 211 212 220 223 224
 226 228 234 235 236 237 238 239 240 241 242 243 244 246 247 248 254 255
 256 258 261 262 263 264 265 266 267 268 269 270 273 275 276 278 280 282
 283 285 287 288 289 290 291 292 295 297 299 301 306 309 310 313 314 315
 317 318 319 320 321 325 326 327 328 329 331 332 333 335 336 337 342 343
 344 345 346 349 354 355 356 357 361 366 367 368 370 372 373 377 378 382
 384 387 391 394 395 399 400 402 403 405 406 407 408 410 411 412 414 416
 417 418 4

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis study investigates the prese...,1,0,chatgpt
1,"Abstract:\n\nIn this paper, we study the radic...",1,0,chatgpt
2,"Abstract:\n\nIn this note, we explore the conn...",1,0,chatgpt
3,"Abstract:\n\nIn this paper, we investigate the...",1,0,chatgpt
4,"Abstract:\n\nIn this paper, we study the expon...",1,0,chatgpt
...,...,...,...,...
567,"Abstract:\n\nIn this study, we investigate the...",1,0,chatgpt
568,"Abstract:\n\nIn this study, we present a compr...",1,0,chatgpt
569,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt
570,Abstract:\n\nThe two-body problem in celestial...,1,0,chatgpt


{'eval_loss': 4.679646015167236, 'eval_accuracy': 0.428, 'eval_f1': 0.5994397759103641, 'eval_recall': 0.428, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  1,   4,   7,   8,   9,  10,  13,  15,  17,  18,  21,  22,  23,
        24,  29,  30,  31,  32,  34,  35,  36,  41,  42,  46,  49,  51,
        53,  59,  60,  61,  62,  64,  65,  67,  68,  69,  70,  71,  72,
        75,  79,  80,  81,  83,  84,  87,  89,  90,  91,  92,  96,  98,
        99, 100, 101, 104, 105, 106, 107, 109, 110, 112, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 131, 133, 134,
       136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 149, 151, 155,
       157, 159, 161, 162, 163, 168, 172, 175, 176, 177, 179, 180, 182,
       185, 188, 190, 191, 192, 194, 197, 198, 199, 201, 202, 205, 206,
       207, 208, 209, 210, 211, 212, 220, 223, 224, 226, 228, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 246, 247, 248, 254,
       255, 256, 258, 261, 262, 263, 264, 265, 2

testing on other datasets

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = real2022nlp_df, model = model_exper)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[  12   22   87  105  140  184  192  334  359  365  385  401  445  534
  559  599  627  656  657  662  692  704  726  771  774  781  826  828
  838  898  946  957  965 1007 1080 1119 1130 1153 1172 1188 1231 1265
 1306 1366 1370 1380 1400 1401 1431 1525 1560 1567 1574 1649 1668 1692
 1715 1728 1788 1795 1852 1860 1882 1900 1976 2015 2140 2172 2175 2268
 2280 2364 2370 2392 2453 2460 2549 2553 2683 2695 2748 2785 2786 2794
 2797 2820 2834 2836 2851 2897 2905 2910 2914 2987 3003 3007 3008 3033
 3036 3066 3156 3197 3226 3235 3247 3250 3294 3302 3354 3402 3410 3415
 3445 3558 3617 3633 3651 3757 3764 3766 3806 3917 3978]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[9.9999654e-01 3.5111661e-06]
 [9.9999475e-01 5.1965367e-06]


Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe OGS for non-abelian groups is...,0,1,real2022nlp
1,Abstract:\n\nThis work is concerned with the d...,0,1,real2022nlp
2,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
3,"Abstract:\n\nIn this paper, we present a techn...",0,1,real2022nlp
4,Abstract:\n\nSolomon and Elkin [13] constructe...,0,1,real2022nlp
...,...,...,...,...
118,Abstract:\n\nWe show that any normal toric var...,0,1,real2022nlp
119,Abstract:\n\nWe have studied the structural st...,0,1,real2022nlp
120,Abstract:\n\nCommunity Detection in Social Net...,0,1,real2022nlp
121,Abstract:\n\nWe consider the long-time behavio...,0,1,real2022nlp


{'eval_loss': 0.17397040128707886, 'eval_accuracy': 0.96925, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([  12,   22,   87,  105,  140,  184,  192,  334,  359,  365,  385,
        401,  445,  534,  559,  599,  627,  656,  657,  662,  692,  704,
        726,  771,  774,  781,  826,  828,  838,  898,  946,  957,  965,
       1007, 1080, 1119, 1130, 1153, 1172, 1188, 1231, 1265, 1306, 1366,
       1370, 1380, 1400, 1401, 1431, 1525, 1560, 1567, 1574, 1649, 1668,
       1692, 1715, 1728, 1788, 1795, 1852, 1860, 1882, 1900, 1976, 2015,
       2140, 2172, 2175, 2268, 2280, 2364, 2370, 2392, 2453, 2460, 2549,
       2553, 2683, 2695, 2748, 2785, 2786, 2794, 2797, 2820, 2834, 2836,
       2851, 2897, 2905, 2910, 2914, 2987, 3003, 3007, 3008, 3033, 3036,
       3066, 3156, 3197, 3226, 3235, 3247, 3250, 3294, 3302, 3354, 3402,
       3410, 3415, 3445, 3558, 3617, 3633, 3651, 3757, 3764, 3766, 3806,
       3917, 3978]), 'eval_softmax_probs': array([

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = gpt3_test, model = model_exper)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  0   1   2   3   4   6   7  10  11  12  13  14  15  16  18  19  22  23
  24  25  26  29  30  31  32  33  34  35  36  37  39  40  41  42  44  45
  46  47  48  49  50  51  52  53  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  75  76  77  78  79  80  81  82  83  84  85
  86  88  89  91  92  93  94  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 117 118 119 121 123 124 125 126 128 129
 130 131 132 133 134 135 137 138 139 140 142 143 145 146 147 148 149 150
 151 152 153 154 155 156 157 158 159 160 162 163 164 165 166 167 168 169
 170 171 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
 189 191 192 193 194 196 197 198 199 200 201 202 203 204 205 206 207 208
 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
 227 228 229 230 232 233 236 237 238 239 240 242 243 244 245 246 248 249
 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
 268 270 2

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nWe propose a method for self-supe...,1,0,gpt32022nlp
1,"Abstract:\n\nIn this work, we propose a new gr...",1,0,gpt32022nlp
2,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we introduce Solo-...",1,0,gpt32022nlp
4,Abstract:\n\nWe address the problem of scalabl...,1,0,gpt32022nlp
...,...,...,...,...
877,Abstract:\n\nConventional image-text represent...,1,0,gpt32022nlp
878,"Abstract:\n\nIn this paper, we present a novel...",1,0,gpt32022nlp
879,Abstract:\n\nWe consider the problem of learni...,1,0,gpt32022nlp
880,"Abstract:\n\nIn this work, we propose to impro...",1,0,gpt32022nlp


{'eval_loss': 9.231558799743652, 'eval_accuracy': 0.118, 'eval_f1': 0.21109123434704832, 'eval_recall': 0.118, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  0,   1,   2,   3,   4,   6,   7,  10,  11,  12,  13,  14,  15,
        16,  18,  19,  22,  23,  24,  25,  26,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  39,  40,  41,  42,  44,  45,  46,  47,  48,
        49,  50,  51,  52,  53,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  88,  89,  91,  92,  93,
        94,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
       108, 109, 110, 111, 112, 113, 114, 117, 118, 119, 121, 123, 124,
       125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139,
       140, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 173, 174, 175, 176, 177, 

In [44]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.28.0
Uninstalling transformers-4.28.0:
  Successfully uninstalled transformers-4.28.0
Found existing installation: accelerate 0.19.0
Uninstalling accelerate-0.19.0:
  Successfully uninstalled accelerate-0.19.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.29.1-py3-none-any.whl (7.1 MB)
Collecting accelerate
  Using cached accelerate-0.19.0-py3-none-any.whl (219 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.19.0 transformers-4.29.1


In [27]:
class_table, output_dict, df_pred = eval_dataset(df = test_df, model = model_exper)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  14   80  121  136  162  175  177  184  211  213  231  243  286  353
  356  375  461  472  482  522  535  541  585  594  596  650  676  707
  715  729  756  841  849  850  864  866  888  891  894  911  916  917
  929  930  986  992  993 1012 1059 1094 1096 1101 1111 1153 1169 1196
 1234 1274 1291 1315 1326 1388 1389 1436 1527 1583 1593 1614 1619 1628
 1663 1674 1680 1704 1713 1794 1801 1815 1818 1826 1834 1856 1929 1931
 1936 1941 1964 1982 1990 2000 2006 2052 2071 2097 2107 2130 2146 2164
 2176 2188 2191 2224 2225 2300 2327 2360 2393 2409 2458 2462 2474 2514
 2515 2517 2547 2579 2601 2618 2637 2643 2663 2734 2737 2761 2827 2839
 2849 2861 2866 2965 3010 3029 3033 3043 3058 3073 3133 3175 3180 3185
 3220 3227 3230 3274 3302 3303 3304 3327 3329 3359 3368 3379 3446 3467
 3501 3547 3562 3575 3581 3584 3591 3608 3612 3635 3648 3653 3693 3703
 3718 3719 3794 3839 3852 3872 3922 4012 4024 4061 4067 4081 4093 4094
 4183 4186 4187 4269 4289 4293 4299 

Unnamed: 0,text,label,prediction,src
0,"Abstract:\n\nIn this paper, we introduce metho...",0,1,real
1,Abstract:\n\nIn this paper we review studies o...,0,1,real
2,Abstract:\n\nMultiple-Input Multiple-Output (M...,1,0,galactica
3,Abstract:\n\nThe La-Cuprate Superconductor (LC...,1,0,gpt2
4,Abstract:\n\nThe leptonic W boson production a...,1,0,galactica
...,...,...,...,...
395,Abstract:\n\nThis document discusses the speed...,1,0,chatgpt
396,"Abstract:\n\nIn this study, we used mean squar...",1,0,chatgpt
397,"Abstract:\n\nIn this study, we present a compr...",1,0,chatgpt
398,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt


{'eval_loss': 0.261957049369812, 'eval_accuracy': 0.95, 'eval_f1': 0.9484668899768102, 'eval_recall': 0.92025, 'eval_precision': 0.9784688995215312, 'eval_wrongly_classified': array([  14,   80,  121,  136,  162,  175,  177,  184,  211,  213,  231,
        243,  286,  353,  356,  375,  461,  472,  482,  522,  535,  541,
        585,  594,  596,  650,  676,  707,  715,  729,  756,  841,  849,
        850,  864,  866,  888,  891,  894,  911,  916,  917,  929,  930,
        986,  992,  993, 1012, 1059, 1094, 1096, 1101, 1111, 1153, 1169,
       1196, 1234, 1274, 1291, 1315, 1326, 1388, 1389, 1436, 1527, 1583,
       1593, 1614, 1619, 1628, 1663, 1674, 1680, 1704, 1713, 1794, 1801,
       1815, 1818, 1826, 1834, 1856, 1929, 1931, 1936, 1941, 1964, 1982,
       1990, 2000, 2006, 2052, 2071, 2097, 2107, 2130, 2146, 2164, 2176,
       2188, 2191, 2224, 2225, 2300, 2327, 2360, 2393, 2409, 2458, 2462,
       2474, 2514, 2515, 2517, 2547, 2579, 2601, 2618, 2637, 2643, 2663,
       2734, 2737, 27