# Fine-tuning Galactica for classification

In [None]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.6 MB/s[0m eta [36m0:00:

In [None]:
from transformers import AutoTokenizer
import numpy as np
import os
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead, OPTForSequenceClassification, set_seed
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset

In [None]:
model_params_size = "125m"

In [None]:
import random
seed = 42
set_seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

no_deprecation_warning=True

In [None]:
from transformers import AutoConfig

# Download configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained(f"facebook/galactica-{model_params_size}")
print(config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

OPTConfig {
  "_name_or_path": "facebook/galactica-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "learned_embeddings": true,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "scale_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50000,
  "word_embed_proj_dim": 768
}



here we set the pad token to eos token and make sure that padding and truncation is left sided. Also, we limit the input size to 2048.

In [None]:
# restrict to only 2048 tokens input.
tokenizer = AutoTokenizer.from_pretrained(f"facebook/galactica-{model_params_size}", pad_token = "", eos_token = "")
tokenizer
tokenizer.model_max_length = 2048
# tokenizer.model_max_length = 2560

# pad left since decoder only architecture and the last token is used for classification, unlike bert.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

Downloading (…)okenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

In [None]:
tokenizer.pad_token_id = config.eos_token_id
print(tokenizer.special_tokens_map)

{'pad_token': '</s>'}


In [None]:
# restrict to only 2048 tokens input.
tokenizer.model_max_length = 2048
# tokenizer.model_max_length = 2560

# pad left since decoder only architecture and the last token is used for classification, unlike bert.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

get model and move to cuda.

In [None]:
def load_model(path = f"facebook/galactica-{model_params_size}"):
  model = OPTForSequenceClassification.from_pretrained(path)
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(device)
  model.config.pad_token_id = model.config.eos_token_id
  model.resize_token_embeddings(len(tokenizer))
  model = model.to(device)
  return model

In [None]:
model = load_model()

Downloading model.safetensors:   0%|          | 0.00/250M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


# Data loading and prep

In [None]:
DATA_PATH = os.path.join("..", "data")

In [None]:
train_df = pd.read_csv(os.path.join(DATA_PATH, "classifier_input_restricted_train.csv"))
test_df = pd.read_csv(os.path.join(DATA_PATH, "classifier_input_restricted_test.csv"))

In [None]:
gpt3_test = pd.read_csv(os.path.join(DATA_PATH, "gpt3curienlp2022_restricted_test.csv"))

In [None]:
real2022nlp_df = pd.read_csv(os.path.join(DATA_PATH, "realnlp2022_restricted_4000.csv"))

In [None]:
gpt3_train = pd.read_csv(os.path.join(DATA_PATH, "data_nlp2022", "gpt3curienlp2022_restricted_train.csv"))

In [None]:
from datasets import load_dataset
def get_test_cc_df():
  dataset = load_dataset('tum-nlp/IDMGSP', "test-cc")
  dataset = dataset['test']
  dataset_df = pd.DataFrame.from_dict(dataset)
  return dataset_df

In [None]:
test_cc = get_test_cc_df()

Downloading builder script:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
train_df[train_df.duplicated()]

Unnamed: 0,id,year,title,abstract,introduction,conclusion,categories,src,label


In [None]:
def transform_df(df, sep = None, start_of_text = None, end_of_text = None) -> pd.DataFrame:
  """
    Transforms a pandas DataFrame by concatenating the "abstract", "introduction", and "conclusion" columns into a new "text" column, with optional separators and text to prepend and append to the "text" column.

    Args:
        df (pandas.DataFrame): The input DataFrame to transform.
        sep (str, optional): A string to separate the text from the label in the output DataFrame. Defaults to None.
        start_of_text (str, optional): A string to prepend to the beginning of the text. Defaults to None.
        end_of_text (str, optional): A string to append to the end of the text. Defaults to None.

    Returns:
        pandas.DataFrame: The transformed DataFrame with a "text" column containing the concatenated text, and optionally a "label" and "src" column.
  """
  end_of_text = end_of_text if end_of_text else ""
  start_of_text = start_of_text if start_of_text else ""
  # Create text
  df["text"] = start_of_text \
    + "Abstract:\n\n" + df["abstract"] \
    + "\n\nIntroduction:\n\n" + df["introduction"] \
    + "\n\nConclusion:\n\n" + df["conclusion"] \
    + end_of_text
  if (sep):
    df["text"] = df["text"] + sep + df["label"].astype(str) + end_of_text
    return df[["text"]]

  return df[["text", "label", "src"]]

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def tokenize_func(examples):
  return tokenizer(examples["text"], padding= True, truncation=True)

def get_tokenized_dataset(df, sep = None, start_of_text = None, end_of_text = None, split = True, transformed = False):
  """
    This function takes a pandas DataFrame df and preprocesses it by performing the following steps:
    If transformed is False, it applies the transform_df() function to the df DataFrame to tokenize and preprocess the text data.
    If split is True, it splits the preprocessed data into train and validation sets using a 80-20 split.
    It creates a Hugging Face Dataset object for the train and validation sets using the from_pandas() function.
    It tokenizes the text data using the tokenize_func() function.
    It returns the tokenized train and validation sets as a tuple (tokenized_train, tokenized_val).

  Args:
      df: A pandas DataFrame containing text data.
      sep: The separator to use when tokenizing the text data. Default is None.
      start_of_text: The token to use at the start of each text sequence. Default is None.
      end_of_text: The token to use at the end of each text sequence. Default is None.
      split: A boolean indicating whether or not to split the preprocessed data into train and validation sets. Default is True.
      transformed: A boolean indicating whether or not the df DataFrame has already been preprocessed. Default is False.
      Returns:

  tokenized_train: A Hugging Face Dataset object containing the tokenized train data.
  tokenized_val: A Hugging Face Dataset object containing the tokenized validation data. If split is False, this value is None.
  """
  transformed_df = df
  if (not transformed):
    transformed_df = transform_df(df, sep, start_of_text, end_of_text)
  train_texts = transformed_df
  tokenized_val = None
  if (split):
    train_texts, val_texts = train_test_split(train_texts, test_size=.2, random_state=seed)
    val_texts = val_texts[~val_texts["text"].isna()]
    val_ds = Dataset.from_pandas(val_texts, split="test")
    tokenized_val = val_ds.map(tokenize_func, batched=True)
  # clean NA
  train_texts = train_texts[~train_texts["text"].isna()]
  train_ds = Dataset.from_pandas(train_texts, split="train")
  # tokenize.
  tokenized_train = train_ds.map(tokenize_func, batched=True)
  return tokenized_train, tokenized_val

split data into train and val

In [None]:
tokenized_train, tokenized_val = get_tokenized_dataset(train_df)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

In [None]:
tokenized_train

Dataset({
    features: ['text', 'label', 'src', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 12800
})

In [None]:
tokenized_val

Dataset({
    features: ['text', 'label', 'src', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 3200
})

### setting up wandb for logging

In [None]:
!pip install wandb evaluate

Collecting wandb
  Downloading wandb-0.15.7-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.28.1-py2.py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.7/214.7 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Down

set wandb api key to save to the cloud

In [None]:
import wandb
os.environ["WANDB_API_KEY"] = "9002697077a332a19dc88cdb979643ee7ff3cef1"
wandb.login()
%env WANDB_PROJECT=galactica_paper_classifier

# to disable wandb
# os.environ["WANDB_DISABLED"] = "false"

[34m[1mwandb[0m: Currently logged in as: [33mmohamed-heshamse[0m ([33mtum-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=galactica_paper_classifier


create metrices function to be passed to the trainer later.

In [None]:
import evaluate
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    output_dict = {}
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    output_dict.update(acc_metric.compute(predictions=predictions, references=labels))
    output_dict.update(f1_metric.compute(predictions=predictions, references=labels))
    output_dict.update(recall_metric.compute(predictions=predictions, references=labels))
    output_dict.update(precision_metric.compute(predictions=predictions, references=labels))
    return output_dict

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

# Training

Here we train our main model on our training dataset without removing or adding any other datasets. This model will be later saved and called for eval under the variable `model_exper`.

In [None]:
# start a new run
wandb.init(project = "galactica_paper_classifier")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics
)

VBox(children=(Label(value='0.003 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.257219…

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.3648,0.143777,0.956562,0.957557,0.967901,0.947432
2000,0.1835,0.218555,0.951562,0.950621,0.920988,0.982225
3000,0.1163,0.296753,0.955937,0.957873,0.989506,0.928199
4000,0.0729,0.083827,0.985938,0.98615,0.988889,0.983425
5000,0.0431,0.119637,0.983125,0.983506,0.993827,0.973398
6000,0.0168,0.12327,0.984062,0.98437,0.991358,0.97748


TrainOutput(global_step=6400, training_loss=0.12570834636688233, metrics={'train_runtime': 5041.5763, 'train_samples_per_second': 10.156, 'train_steps_per_second': 1.269, 'total_flos': 5.35135745212416e+16, 'train_loss': 0.12570834636688233, 'epoch': 4.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.08382683992385864,
 'eval_accuracy': 0.9859375,
 'eval_f1': 0.9861495844875346,
 'eval_recall': 0.9888888888888889,
 'eval_precision': 0.9834254143646409,
 'eval_runtime': 85.9431,
 'eval_samples_per_second': 37.234,
 'eval_steps_per_second': 18.617,
 'epoch': 4.0}

In [None]:
# wandb analysis and testing
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▂▁▂█▇██
eval/f1,▂▁▂█▇██
eval/loss,▃▅█▁▂▂▁
eval/precision,▃█▁█▇▇█
eval/recall,▆▁█████
eval/runtime,█▂▅▄▆▅▁
eval/samples_per_second,▁▇▄▅▃▄█
eval/steps_per_second,▁▇▄▅▃▄█
train/epoch,▁▁▂▂▄▄▅▅▆▆▇▇██
train/global_step,▁▁▂▂▄▄▅▅▆▆▇▇██

0,1
eval/accuracy,0.98594
eval/f1,0.98615
eval/loss,0.08383
eval/precision,0.98343
eval/recall,0.98889
eval/runtime,85.9431
eval/samples_per_second,37.234
eval/steps_per_second,18.617
train/epoch,4.0
train/global_step,6400.0


In [None]:
!ls ../../../../../../

drive  sample_data


In [None]:
def save_model_locally(model, path = "results"):
  model_path = "../../../../../../" + path
  model.save_pretrained(model_path)       # save the model
  tokenizer.save_pretrained(model_path)

In [None]:
save_model_locally(trainer.model)

In [None]:

!zip -r ../../../../../../galactica_train_no_ligature.zip ../../../../../../results

  adding: ../../../../../../results/ (stored 0%)
  adding: ../../../../../../results/special_tokens_map.json (stored 0%)
  adding: ../../../../../../results/pytorch_model.bin (deflated 8%)
  adding: ../../../../../../results/tokenizer_config.json (deflated 26%)
  adding: ../../../../../../results/tokenizer.json (deflated 72%)
  adding: ../../../../../../results/config.json (deflated 52%)


# eval and Experiments

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from scipy.special import softmax
def compute_metrics_eval(eval_pred):
    output_dict = {}
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    wrongly_classified = np.where(predictions != labels)[0]
    output_dict.update(acc_metric.compute(predictions=predictions, references=labels))
    output_dict.update(f1_metric.compute(predictions=predictions, references=labels))
    output_dict.update(recall_metric.compute(predictions=predictions, references=labels))
    output_dict.update(precision_metric.compute(predictions=predictions, references=labels))
    output_dict.update({"wrongly_classified": wrongly_classified})
    output_dict.update({"softmax_probs": softmax(logits, axis = -1)})
    return output_dict

## Functions used for eval.

In [None]:
def get_falsely_classified(index_arr, tokenized_dataset):
  """
  Given an index array and a tokenized dataset, returns a DataFrame containing the texts, labels, and predicted labels of the dataset at the specified indices. If the dataset contains source data, it is also included in the returned DataFrame.

    Parameters:
    index_arr (List[int]): A list of indices corresponding to the elements of the tokenized dataset to include in the returned DataFrame.
    tokenized_dataset (datasets.Dataset): A tokenized dataset containing text data, label data, and optionally source data.

    Returns:
    pandas.DataFrame: A DataFrame containing the texts, labels, predicted labels, and optionally source data of the dataset at the specified indices.
  """
  texts = tokenized_dataset[index_arr]['text']
  label = tokenized_dataset[index_arr]['label']
  src = tokenized_dataset[index_arr].get('src', [])
  preds = [1 if lbl == 0 else 0 for lbl in label]
  df = pd.DataFrame({'text': texts, 'label': label, 'prediction': preds, 'src': src})
  return df

In [None]:
def eval_dataset(df, model, tokenized = False, transformed = False, get_probs_df = True):
  """
  Evaluates a model on a given dataset and returns information about the evaluation. If the dataset is not already tokenized, it will be tokenized automatically. If the dataset contains source data, it will be included in the returned DataFrame.

    Parameters:
    df (pandas.DataFrame): The dataset to evaluate the model on. The dataset should have columns "text" and "label", and optionally "src" if source data is present.
    model (transformers.Trainer): The model to evaluate the dataset on.
    tokenized (bool, optional): If True, assumes the dataset is already tokenized. Defaults to False.
    transformed (bool, optional): If True, assumes the dataset is already transformed. Defaults to False.
    get_probs_df (bool, optional): If True, returns a DataFrame containing the softmax probabilities of each label for each text. Defaults to True.

    Returns:
    Tuple[pandas.DataFrame, dict, pandas.DataFrame]: A tuple containing three elements:
        1. A DataFrame containing the texts, labels, and predicted labels of the elements in the dataset that were wrongly classified by the model.
        2. A dictionary containing information about the evaluation, including the loss and accuracy of the model on the dataset.
        3. A DataFrame containing the texts, labels, predicted labels, and optionally source data and softmax probabilities of the entire dataset.
  """
  if (not tokenized):
    tokenized_df, _ = get_tokenized_dataset(df, transformed = transformed, split = False)
  else:
    tokenized_df = df
  training_args_best = TrainingArguments(per_device_eval_batch_size=2,  output_dir="./output_eval")
  trainer_eval = Trainer(model=model, args=training_args_best, compute_metrics=compute_metrics_eval)
  output = trainer_eval.evaluate(tokenized_df)

  wrongly_classified = output["eval_wrongly_classified"]
  wrongly_classified_df = get_falsely_classified(wrongly_classified, tokenized_df)
  display(wrongly_classified_df)

  df_complete = pd.DataFrame({"text":tokenized_df["text"], "label":tokenized_df["label"], "src": tokenized_df["src"]})
  softmax_arr = np.array(output["eval_softmax_probs"])
  df_complete['prediction'] = softmax_arr.argmax(-1)

  if (get_probs_df):
    softmax_probs = pd.DataFrame(softmax_arr)
    df_complete = pd.concat([df_complete, softmax_probs], axis = 1)

  print(output)

  return wrongly_classified_df, output, df_complete

## eval on the test dataset

In [None]:
# load the model trained on TRAIN dataset.
model_exper = trainer.model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model_exper.config.pad_token_id = model_exper.config.eos_token_id
model_exper.resize_token_embeddings(len(tokenizer))
model_exper = model_exper.to(device)

# eval the model on TEST dataset.
class_table, output_dict, df_test_complete = eval_dataset(df = test_df, model = model_exper)
# class_table, output_dict, df_test_complete = eval_dataset(df = test_df, model = trainer.model)

cuda


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  20   36  136  162  184  211  213  286  425  443  472  489  596  756
  827  841  849  972  974 1101 1326 1370 1409 1756 1794 1834 1880 2052
 2100 2164 2282 2393 2409 2462 2637 2726 2820 2851 2861 3003 3010 3029
 3043 3133 3185 3245 3274 3612 3665 3872 3883 3918 4075 4092 4289 4290
 4299 4310 4417 4545 4564 4588 4615 4634 4647 4687 4711 4822 4912 4913
 4918 4932 4949 4974 4990 5017 5097 5098 5341 5372 5421 5425 5438 5485
 5491 5517 5537 5582 5713 5809 5827 6151 6164 6172 6178 6227 6229 6240
 6402 6490 6527 6583 6638 6768 6790 6833 6836 6906 7028 7106 7122 7236
 7282 7323 7364 7458 7467 7501 7705 7762 7839 7846 7876 7902 7903]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[6.5012944e-07 9.9999940e-01]
 [6.1074013e-07 9.9999940e-01]
 [5.3478198e-07 9.9999952e-01]
 ...
 [4.52094

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe Cygnus Loop is a well-studied...,0,1,real
1,Abstract:\n\nWe study the power and energy uti...,1,0,gpt2
2,Abstract:\n\nThe La-Cuprate Superconductor (LC...,1,0,gpt2
3,Abstract:\n\nThe leptonic W boson production a...,1,0,galactica
4,Abstract:\n\nWe study the locally-defined soci...,0,1,real
...,...,...,...,...
120,Abstract:\n\nWe investigate the spaces of rati...,0,1,real
121,Abstract:\n\nThe purpose of this note is to pr...,1,0,galactica
122,Abstract:\n\nWe test the cosmological implicat...,0,1,real
123,Abstract:\n\nThe Digital Ludeme Project (DLP) ...,0,1,real


{'eval_loss': 0.0940784439444542, 'eval_accuracy': 0.984375, 'eval_f1': 0.9844469329351749, 'eval_recall': 0.989, 'eval_precision': 0.9799355957394105, 'eval_wrongly_classified': array([  20,   36,  136,  162,  184,  211,  213,  286,  425,  443,  472,
        489,  596,  756,  827,  841,  849,  972,  974, 1101, 1326, 1370,
       1409, 1756, 1794, 1834, 1880, 2052, 2100, 2164, 2282, 2393, 2409,
       2462, 2637, 2726, 2820, 2851, 2861, 3003, 3010, 3029, 3043, 3133,
       3185, 3245, 3274, 3612, 3665, 3872, 3883, 3918, 4075, 4092, 4289,
       4290, 4299, 4310, 4417, 4545, 4564, 4588, 4615, 4634, 4647, 4687,
       4711, 4822, 4912, 4913, 4918, 4932, 4949, 4974, 4990, 5017, 5097,
       5098, 5341, 5372, 5421, 5425, 5438, 5485, 5491, 5517, 5537, 5582,
       5713, 5809, 5827, 6151, 6164, 6172, 6178, 6227, 6229, 6240, 6402,
       6490, 6527, 6583, 6638, 6768, 6790, 6833, 6836, 6906, 7028, 7106,
       7122, 7236, 7282, 7323, 7364, 7458, 7467, 7501, 7705, 7762, 7839,
       7846, 7876,

## Robustness check

Does the model have similar results when presented with real data coming from a different pdf parser?

In [None]:
model_exper = trainer.model

In [None]:
class_table_real, output_dict_real, df_real_pred = eval_dataset(df = real2022nlp_df, model = model_exper)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[   6   12   22   87   92  105  111  140  184  192  214  219  225  227
  279  290  295  362  365  376  385  416  423  445  460  474  534  538
  565  599  627  655  656  657  692  726  727  760  781  898  957  965
  974  986 1007 1010 1080 1119 1130 1153 1188 1200 1265 1306 1313 1370
 1380 1384 1398 1400 1434 1477 1484 1515 1525 1536 1549 1559 1560 1574
 1605 1611 1612 1625 1649 1674 1680 1692 1702 1715 1728 1788 1790 1795
 1814 1841 1843 1848 1860 1875 1900 1912 1913 1992 2015 2140 2172 2175
 2268 2270 2280 2331 2364 2370 2448 2453 2546 2549 2553 2610 2621 2646
 2648 2670 2679 2697 2700 2745 2748 2752 2785 2794 2797 2803 2820 2836
 2851 2885 2897 2910 2918 2925 2985 2987 3007 3033 3036 3066 3109 3156
 3159 3170 3171 3179 3197 3199 3226 3235 3240 3247 3257 3291 3300 3312
 3326 3351 3360 3410 3470 3473 3494 3558 3587 3614 3617 3633 3651 3676
 3678 3757 3764 3766 3772 3806 3917 3930 3943 3978]"

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe family of all k-independent s...,0,1,real2022nlp
1,Abstract:\n\nThe OGS for non-abelian groups is...,0,1,real2022nlp
2,Abstract:\n\nThis work is concerned with the d...,0,1,real2022nlp
3,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
4,Abstract:\n\nNeural Architecture Search (NAS) ...,0,1,real2022nlp
...,...,...,...,...
173,Abstract:\n\nCommunity Detection in Social Net...,0,1,real2022nlp
174,Abstract:\n\nWe consider the long-time behavio...,0,1,real2022nlp
175,Abstract:\n\nWe study the GIT quotient of the ...,0,1,real2022nlp
176,Abstract:\n\nGenerative commonsense reasoning ...,0,1,real2022nlp


{'eval_loss': 0.3227936327457428, 'eval_accuracy': 0.9555, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([   6,   12,   22,   87,   92,  105,  111,  140,  184,  192,  214,
        219,  225,  227,  279,  290,  295,  362,  365,  376,  385,  416,
        423,  445,  460,  474,  534,  538,  565,  599,  627,  655,  656,
        657,  692,  726,  727,  760,  781,  898,  957,  965,  974,  986,
       1007, 1010, 1080, 1119, 1130, 1153, 1188, 1200, 1265, 1306, 1313,
       1370, 1380, 1384, 1398, 1400, 1434, 1477, 1484, 1515, 1525, 1536,
       1549, 1559, 1560, 1574, 1605, 1611, 1612, 1625, 1649, 1674, 1680,
       1692, 1702, 1715, 1728, 1788, 1790, 1795, 1814, 1841, 1843, 1848,
       1860, 1875, 1900, 1912, 1913, 1992, 2015, 2140, 2172, 2175, 2268,
       2270, 2280, 2331, 2364, 2370, 2448, 2453, 2546, 2549, 2553, 2610,
       2621, 2646, 2648, 2670, 2679, 2697, 2700, 2745, 2748, 2752, 2785,
       2794, 2797, 2803, 2820, 2836, 2851, 2885, 289

In [None]:
class_table_gpt3, output_dict_gpt3, df_gpt3_pred = eval_dataset(df = gpt3_test, model = model_exper)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  0   1   2   3   4   6   8  10  11  12  13  14  15  16  17  18  19  21
  22  23  24  25  26  28  29  30  31  32  34  35  36  37  39  40  41  42
  44  45  46  47  48  49  50  51  52  53  56  58  59  60  61  62  64  67
  68  69  71  72  75  78  80  81  83  85  86  87  89  91  92  94  95  96
  97  98 100 101 104 106 107 108 109 110 112 113 114 116 118 119 121 123
 124 125 126 127 128 130 131 133 134 137 138 140 142 145 147 148 149 150
 151 152 153 155 156 158 159 160 162 163 165 166 167 168 169 170 171 173
 174 175 176 177 178 179 180 181 182 183 184 185 186 189 191 192 193 194
 196 197 198 199 200 201 202 203 204 205 207 208 209 211 212 213 214 215
 217 219 220 221 222 223 224 225 226 227 229 231 233 237 238 239 240 241
 242 244 246 248 249 250 251 252 253 254 255 256 257 258 259 260 261 264
 265 266 267 268 269 270 271 272 273 275 276 277 278 279 282 283 284 285
 286 288 289 290 294 296 297 298 299 301 302 303 304 305 306 309 310 311
 312 313 3

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nWe propose a method for self-supe...,1,0,gpt32022nlp
1,"Abstract:\n\nIn this work, we propose a new gr...",1,0,gpt32022nlp
2,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we introduce Solo-...",1,0,gpt32022nlp
4,Abstract:\n\nWe address the problem of scalabl...,1,0,gpt32022nlp
...,...,...,...,...
736,Abstract:\n\nConventional image-text represent...,1,0,gpt32022nlp
737,"Abstract:\n\nIn this paper, we present a novel...",1,0,gpt32022nlp
738,Abstract:\n\nWe consider the problem of learni...,1,0,gpt32022nlp
739,"Abstract:\n\nIn this work, we propose to impro...",1,0,gpt32022nlp


{'eval_loss': 7.249135971069336, 'eval_accuracy': 0.259, 'eval_f1': 0.41143764892772045, 'eval_recall': 0.259, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  0,   1,   2,   3,   4,   6,   8,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  28,  29,  30,
        31,  32,  34,  35,  36,  37,  39,  40,  41,  42,  44,  45,  46,
        47,  48,  49,  50,  51,  52,  53,  56,  58,  59,  60,  61,  62,
        64,  67,  68,  69,  71,  72,  75,  78,  80,  81,  83,  85,  86,
        87,  89,  91,  92,  94,  95,  96,  97,  98, 100, 101, 104, 106,
       107, 108, 109, 110, 112, 113, 114, 116, 118, 119, 121, 123, 124,
       125, 126, 127, 128, 130, 131, 133, 134, 137, 138, 140, 142, 145,
       147, 148, 149, 150, 151, 152, 153, 155, 156, 158, 159, 160, 162,
       163, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177,
       178, 179, 180, 181, 182, 183, 184, 185, 186, 189, 191, 192, 193,
       194, 196, 197, 198, 199, 200, 201, 202, 

In [None]:
class_table_chatgpt, output_dict_chatgpt, df_chatgpt_pred = eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = model_exper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[  1   7  17  19  24  31  44  53  59  62  63  69  84  87  89  99 110 126
 135 137 141 149 154 155 157 159 172 177 192 199 205 207 210 218 221 234
 238 242 243 248 262 264 267 270 273 276 278 302 309 310 313 314 321 322
 326 331 338 344 349 351 359 361 365 373 377 380 382 398 402 412 416 418
 423 430 433 444 449 459 460 465 468 469 475 479 483 490 494 495 511 514
 515 521 522 523 535 540 541 542 545 549 551 553 569 579 583 586 605 612
 621 628 632 638 641 652 653 654 656 662 676 691 693 696 697 698 700 702
 703 710 716 724 726 728 739 745 747 750 758 759 760 765 769 770 775 803
 807 815 816 842 847 856 863 868 892 896 903 908 911 917 919 923 924 927
 928 931 932 941 944 948 949 959 962 965 972 976 978 988 996 998]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[2.6322837e-06 9.9

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis study investigates the prese...,1,0,chatgpt
1,"Abstract:\n\nIn this note, we explore the conn...",1,0,chatgpt
2,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
3,"Abstract:\n\nIn this work, we investigate the ...",1,0,chatgpt
4,"Abstract:\n\nIn this paper, we consider noncon...",1,0,chatgpt
...,...,...,...,...
173,"Abstract:\n\nIn this paper, we investigate the...",1,0,chatgpt
174,"Abstract:\n\nIn this paper, we propose a hiera...",1,0,chatgpt
175,"Abstract:\n\nIn this paper, we investigate the...",1,0,chatgpt
176,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt


{'eval_loss': 0.8421607613563538, 'eval_accuracy': 0.822, 'eval_f1': 0.9023051591657518, 'eval_recall': 0.822, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  1,   7,  17,  19,  24,  31,  44,  53,  59,  62,  63,  69,  84,
        87,  89,  99, 110, 126, 135, 137, 141, 149, 154, 155, 157, 159,
       172, 177, 192, 199, 205, 207, 210, 218, 221, 234, 238, 242, 243,
       248, 262, 264, 267, 270, 273, 276, 278, 302, 309, 310, 313, 314,
       321, 322, 326, 331, 338, 344, 349, 351, 359, 361, 365, 373, 377,
       380, 382, 398, 402, 412, 416, 418, 423, 430, 433, 444, 449, 459,
       460, 465, 468, 469, 475, 479, 483, 490, 494, 495, 511, 514, 515,
       521, 522, 523, 535, 540, 541, 542, 545, 549, 551, 553, 569, 579,
       583, 586, 605, 612, 621, 628, 632, 638, 641, 652, 653, 654, 656,
       662, 676, 691, 693, 696, 697, 698, 700, 702, 703, 710, 716, 724,
       726, 728, 739, 745, 747, 750, 758, 759, 760, 765, 769, 770, 775,
       803, 807, 815, 816, 842, 847, 856, 863, 

In [None]:
eval_dataset(df = test_cc, model = model_exper)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[   0    1    2 ... 3997 3998 3999]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[9.9999797e-01 2.0661043e-06]
 [9.9999714e-01 2.8461761e-06]
 [9.9988151e-01 1.1843644e-04]
 ...
 [9.9990594e-01 9.4059928e-05]
 [9.9999821e-01 1.7810935e-06]
 [9.9999857e-01 1.4367506e-06]]" of type <class 'numpy.ndarray'> for key "eval/softmax_probs" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nRecent calculations have pointed ...,1,0,chatgpt-paraphrased
1,"Abstract:\n\nIn recent times, there has been s...",1,0,chatgpt-paraphrased
2,"Abstract:\n\nNowadays, with the booming develo...",1,0,chatgpt-paraphrased
3,Abstract:\n\nTaking into account the drivers' ...,1,0,chatgpt-paraphrased
4,"Abstract:\n\nIn this study, we employ the Wang...",1,0,chatgpt-paraphrased
...,...,...,...,...
3723,"Abstract:\n\nIn this paper, we propose a model...",1,0,chatgpt-paraphrased
3724,Abstract:\n\nIn the reaction of the antiproton...,1,0,chatgpt-paraphrased
3725,Abstract:\n\nWe investigate the dynamics of so...,1,0,chatgpt-paraphrased
3726,Abstract:\n\nNear Field Communication (NFC) st...,1,0,chatgpt-paraphrased


{'eval_loss': 11.280269622802734, 'eval_accuracy': 0.068, 'eval_f1': 0.12734082397003746, 'eval_recall': 0.068, 'eval_precision': 1.0, 'eval_wrongly_classified': array([   0,    1,    2, ..., 3997, 3998, 3999]), 'eval_softmax_probs': array([[9.9999797e-01, 2.0661043e-06],
       [9.9999714e-01, 2.8461761e-06],
       [9.9988151e-01, 1.1843644e-04],
       ...,
       [9.9990594e-01, 9.4059928e-05],
       [9.9999821e-01, 1.7810935e-06],
       [9.9999857e-01, 1.4367506e-06]], dtype=float32), 'eval_runtime': 110.3104, 'eval_samples_per_second': 36.261, 'eval_steps_per_second': 18.131}


(                                                   text  label  prediction  \
 0     Abstract:\n\nRecent calculations have pointed ...      1           0   
 1     Abstract:\n\nIn recent times, there has been s...      1           0   
 2     Abstract:\n\nNowadays, with the booming develo...      1           0   
 3     Abstract:\n\nTaking into account the drivers' ...      1           0   
 4     Abstract:\n\nIn this study, we employ the Wang...      1           0   
 ...                                                 ...    ...         ...   
 3723  Abstract:\n\nIn this paper, we propose a model...      1           0   
 3724  Abstract:\n\nIn the reaction of the antiproton...      1           0   
 3725  Abstract:\n\nWe investigate the dynamics of so...      1           0   
 3726  Abstract:\n\nNear Field Communication (NFC) st...      1           0   
 3727  Abstract:\n\nLi\'enard-type equations are used...      1           0   
 
                       src  
 0     chatgpt-paraph

## GPT-3 out of distribution

Using the GPT3 dataset from lab of 2022 NLP to asses how good the model is in classifying out of domain generators. We also train the model using few more examples from GPT3 to see if there is any improvement.

In [None]:
# unvomment when using the already trained model.
# model_exper = load_model("results/85-TrainedOnGPT3")

how well does the model perform on OOD GPT3?

In [None]:
traingpt3_df = pd.concat([train_df, gpt3_train])
tokenized_train_traingpt3, tokenized_test_traingpt3 = get_tokenized_dataset(traingpt3_df)

Map:   0%|          | 0/3440 [00:00<?, ? examples/s]

Map:   0%|          | 0/13760 [00:00<?, ? examples/s]

In [None]:
model_gpt3 = load_model(f"facebook/galactica-{model_params_size}")

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [None]:
os.environ["WANDB_DISABLED"] = "false"
wandb.init(project = "galactica_paper_classifier")
training_args_gpt3 = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )


trainer_gpt3 = Trainer(
    model=model_gpt3,
    args=training_args_gpt3,
    train_dataset=tokenized_train_traingpt3,
    eval_dataset=tokenized_test_traingpt3,
    compute_metrics=compute_metrics
)

In [None]:
trainer_gpt3.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.3695,0.350734,0.911047,0.923994,0.990943,0.865519
2000,0.2313,0.456766,0.926453,0.936544,0.994672,0.884834
3000,0.1326,0.217109,0.95843,0.963211,0.997336,0.931343
4000,0.0819,0.060987,0.986628,0.987792,0.991476,0.984135
5000,0.0583,0.075148,0.986047,0.987214,0.987214,0.987214
6000,0.0336,0.115705,0.982558,0.984169,0.993607,0.974909


TrainOutput(global_step=6880, training_loss=0.13413474934045658, metrics={'train_runtime': 5418.8425, 'train_samples_per_second': 10.157, 'train_steps_per_second': 1.27, 'total_flos': 5.752709261033472e+16, 'train_loss': 0.13413474934045658, 'epoch': 4.0})

how well does the model perform on GPT-3 when trained on a few GPT-3 data?

In [None]:
class_table_gpt3, output_dict_gpt3, df_gpt3_pred = eval_dataset(df = gpt3_test, model = trainer_gpt3.model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  2   4  11  15  18  19  32  36  38  39  41  44  47  49  51  52  53  60
  61  65  67  69  71  76  77  78  83  85  86  91  92  96  97  98 107 108
 123 124 130 133 137 138 144 146 147 148 151 153 154 158 159 160 162 167
 168 169 173 174 175 176 177 179 181 182 183 189 193 194 196 198 201 211
 215 225 234 238 241 242 244 246 248 260 266 268 269 275 278 282 283 289
 290 294 298 299 309 310 313 317 320 323 325 327 328 333 342 345 346 348
 356 360 362 363 365 367 368 370 373 375 376 379 385 387 393 395 396 404
 409 411 413 423 428 430 432 438 442 443 446 447 454 456 460 464 465 466
 469 472 475 476 479 481 483 487 489 491 492 493 498 501 503 506 510 517
 524 530 532 534 536 537 543 544 547 549 552 553 560 562 567 568 574 577
 580 585 597 600 602 604 606 615 616 618 619 624 627 630 631 635 642 648
 650 654 656 658 665 672 673 675 688 691 703 705 711 712 714 721 726 730
 732 734 735 740 742 744 747 749 750 752 762 768 771 772 775 777 782 785
 790 792 7

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
1,Abstract:\n\nWe address the problem of scalabl...,1,0,gpt32022nlp
2,Abstract:\n\nUnsupervised image-to-image trans...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we propose a metho...",1,0,gpt32022nlp
4,Abstract:\n\nOur selfsupervised event segmenta...,1,0,gpt32022nlp
...,...,...,...,...
283,"Abstract:\n\nWe propose NeuCrowd, a neural sam...",1,0,gpt32022nlp
284,Abstract:\n\nWe propose a novel framework for ...,1,0,gpt32022nlp
285,Abstract:\n\nThe ability to caption images is ...,1,0,gpt32022nlp
286,Abstract:\n\nWe develop a theory that explains...,1,0,gpt32022nlp


{'eval_loss': 1.2516989707946777, 'eval_accuracy': 0.712, 'eval_f1': 0.8317757009345794, 'eval_recall': 0.712, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  2,   4,  11,  15,  18,  19,  32,  36,  38,  39,  41,  44,  47,
        49,  51,  52,  53,  60,  61,  65,  67,  69,  71,  76,  77,  78,
        83,  85,  86,  91,  92,  96,  97,  98, 107, 108, 123, 124, 130,
       133, 137, 138, 144, 146, 147, 148, 151, 153, 154, 158, 159, 160,
       162, 167, 168, 169, 173, 174, 175, 176, 177, 179, 181, 182, 183,
       189, 193, 194, 196, 198, 201, 211, 215, 225, 234, 238, 241, 242,
       244, 246, 248, 260, 266, 268, 269, 275, 278, 282, 283, 289, 290,
       294, 298, 299, 309, 310, 313, 317, 320, 323, 325, 327, 328, 333,
       342, 345, 346, 348, 356, 360, 362, 363, 365, 367, 368, 370, 373,
       375, 376, 379, 385, 387, 393, 395, 396, 404, 409, 411, 413, 423,
       428, 430, 432, 438, 442, 443, 446, 447, 454, 456, 460, 464, 465,
       466, 469, 472, 475, 476, 479, 481, 483, 

other datasets

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = test_df, model = trainer_gpt3.model)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  20   36  140  211  297  375  425  489  496  756  841  849 1101 1167
 1326 1370 1468 1490 1756 1794 1818 1834 1884 1965 2089 2100 2164 2172
 2282 2462 2472 2585 2861 2886 3010 3031 3133 3195 3274 3508 3612 3653
 3856 3872 3918 4024 4092 4127 4260 4289 4290 4299 4310 4347 4436 4581
 4588 4647 4687 4711 4726 4822 4918 4990 5017 5081 5098 5400 5421 5469
 5491 5537 5582 5607 5659 5905 6130 6164 6172 6227 6229 6358 6402 6561
 6573 6583 6620 6637 6638 6747 6750 6768 6790 6833 6906 6921 6925 7028
 7033 7163 7279 7282 7364 7439 7458 7467 7501 7565 7657 7760 7762 7830
 7846 7906]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[3.3275157e-06 9.9999666e-01]
 [2.8684992e-06 9.9999714e-01]
 [3.0654517e-06 9.9999690e-01]
 ...
 [2.9595544e-06 9.9999702e-01]
 [9.9985790e-01 1.4202364e-04]
 [

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe Cygnus Loop is a well-studied...,0,1,real
1,Abstract:\n\nWe study the power and energy uti...,1,0,gpt2
2,Abstract:\n\nInfluential users play an importa...,0,1,real
3,Abstract:\n\nThis is a continuation of our stu...,0,1,real
4,Abstract:\n\nWe present results from the high ...,0,1,real
...,...,...,...,...
109,Abstract:\n\nWe investigate the possibility of...,0,1,real
110,Abstract:\n\nThis paper considers stochastic f...,0,1,real
111,Abstract:\n\nDropout as regularization has bee...,0,1,real
112,Abstract:\n\nThe purpose of this note is to pr...,1,0,galactica


{'eval_loss': 0.07017944008111954, 'eval_accuracy': 0.98575, 'eval_f1': 0.9857997010463377, 'eval_recall': 0.98925, 'eval_precision': 0.9823733862959285, 'eval_wrongly_classified': array([  20,   36,  140,  211,  297,  375,  425,  489,  496,  756,  841,
        849, 1101, 1167, 1326, 1370, 1468, 1490, 1756, 1794, 1818, 1834,
       1884, 1965, 2089, 2100, 2164, 2172, 2282, 2462, 2472, 2585, 2861,
       2886, 3010, 3031, 3133, 3195, 3274, 3508, 3612, 3653, 3856, 3872,
       3918, 4024, 4092, 4127, 4260, 4289, 4290, 4299, 4310, 4347, 4436,
       4581, 4588, 4647, 4687, 4711, 4726, 4822, 4918, 4990, 5017, 5081,
       5098, 5400, 5421, 5469, 5491, 5537, 5582, 5607, 5659, 5905, 6130,
       6164, 6172, 6227, 6229, 6358, 6402, 6561, 6573, 6583, 6620, 6637,
       6638, 6747, 6750, 6768, 6790, 6833, 6906, 6921, 6925, 7028, 7033,
       7163, 7279, 7282, 7364, 7439, 7458, 7467, 7501, 7565, 7657, 7760,
       7762, 7830, 7846, 7906]), 'eval_softmax_probs': array([[3.3275157e-06, 9.9999666e-

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = real2022nlp_df, model = trainer_gpt3.model)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[  50   85   87   92  105  116  140  184  205  214  219  237  261  279
  290  326  339  362  365  385  407  416  437  439  446  472  473  474
  476  534  536  539  544  546  551  562  599  627  634  656  657  681
  692  728  771  781  784  809  826  886  898  957  974  997 1029 1045
 1153 1187 1200 1205 1215 1255 1265 1279 1283 1306 1352 1353 1384 1387
 1401 1413 1424 1427 1434 1484 1485 1490 1525 1560 1574 1587 1590 1600
 1606 1674 1680 1715 1717 1728 1762 1804 1814 1844 1860 1881 1903 1915
 1916 1976 1979 1992 2140 2163 2172 2175 2194 2258 2267 2269 2270 2280
 2299 2303 2354 2364 2413 2418 2423 2430 2453 2460 2549 2570 2621 2632
 2648 2670 2700 2731 2749 2752 2785 2794 2796 2797 2803 2820 2851 2870
 2878 2885 2893 2925 2928 2937 2949 2985 2997 3007 3012 3031 3033 3036
 3038 3052 3136 3171 3179 3235 3247 3284 3287 3291 3295 3312 3324 3326
 3400 3410 3470 3485 3494 3502 3503 3544 3581 3594 3

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis paper proposes an inverse re...,0,1,real2022nlp
1,Abstract:\n\n: We present a method for making ...,0,1,real2022nlp
2,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
3,Abstract:\n\nNeural Architecture Search (NAS) ...,0,1,real2022nlp
4,"Abstract:\n\nIn this paper, we present a techn...",0,1,real2022nlp
...,...,...,...,...
188,Abstract:\n\nWe investigate the use of a non-p...,0,1,real2022nlp
189,Abstract:\n\nWhile many real-world data stream...,0,1,real2022nlp
190,Abstract:\n\nGenerative commonsense reasoning ...,0,1,real2022nlp
191,Abstract:\n\n: The cross section of the proces...,0,1,real2022nlp


{'eval_loss': 0.28473666310310364, 'eval_accuracy': 0.95175, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([  50,   85,   87,   92,  105,  116,  140,  184,  205,  214,  219,
        237,  261,  279,  290,  326,  339,  362,  365,  385,  407,  416,
        437,  439,  446,  472,  473,  474,  476,  534,  536,  539,  544,
        546,  551,  562,  599,  627,  634,  656,  657,  681,  692,  728,
        771,  781,  784,  809,  826,  886,  898,  957,  974,  997, 1029,
       1045, 1153, 1187, 1200, 1205, 1215, 1255, 1265, 1279, 1283, 1306,
       1352, 1353, 1384, 1387, 1401, 1413, 1424, 1427, 1434, 1484, 1485,
       1490, 1525, 1560, 1574, 1587, 1590, 1600, 1606, 1674, 1680, 1715,
       1717, 1728, 1762, 1804, 1814, 1844, 1860, 1881, 1903, 1915, 1916,
       1976, 1979, 1992, 2140, 2163, 2172, 2175, 2194, 2258, 2267, 2269,
       2270, 2280, 2299, 2303, 2354, 2364, 2413, 2418, 2423, 2430, 2453,
       2460, 2549, 2570, 2621, 2632, 2648, 2670, 2

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = trainer_gpt3.model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  1   7   9  17  19  24  26  31  40  44  50  59  62  69  75  80  81  89
  99 102 103 110 126 130 138 149 161 165 169 172 177 178 179 184 197 199
 203 205 207 210 221 233 234 238 248 255 262 270 273 274 276 278 279 280
 291 293 302 304 313 314 318 320 321 325 326 331 338 340 343 344 346 351
 359 361 365 376 378 379 380 389 398 399 400 404 407 411 412 417 418 423
 430 449 454 455 466 469 475 482 484 490 494 500 508 511 513 515 521 522
 523 529 531 535 541 545 549 551 562 569 575 579 583 587 590 603 605 612
 615 619 621 628 632 634 638 643 652 653 654 656 662 675 676 677 679 685
 689 691 693 694 697 698 700 702 706 710 716 726 728 739 742 750 752 760
 769 780 794 803 805 807 815 840 841 842 846 847 852 854 868 903 908 910
 911 917 919 921 924 927 928 931 941 948 949 953 958 959 962 976 979 986
 987 996 998]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorr

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis study investigates the prese...,1,0,chatgpt
1,"Abstract:\n\nIn this note, we explore the conn...",1,0,chatgpt
2,"Abstract:\n\nIn this paper, we study the expon...",1,0,chatgpt
3,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
4,"Abstract:\n\nIn this work, we investigate the ...",1,0,chatgpt
...,...,...,...,...
196,Abstract:\n\nThe heavy quark potential plays a...,1,0,chatgpt
197,Abstract:\n\nDoped SrTiO3 thin films have emer...,1,0,chatgpt
198,"Abstract:\n\nIn this study, we used mean squar...",1,0,chatgpt
199,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt


{'eval_loss': 0.7857471704483032, 'eval_accuracy': 0.799, 'eval_f1': 0.8882712618121179, 'eval_recall': 0.799, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  1,   7,   9,  17,  19,  24,  26,  31,  40,  44,  50,  59,  62,
        69,  75,  80,  81,  89,  99, 102, 103, 110, 126, 130, 138, 149,
       161, 165, 169, 172, 177, 178, 179, 184, 197, 199, 203, 205, 207,
       210, 221, 233, 234, 238, 248, 255, 262, 270, 273, 274, 276, 278,
       279, 280, 291, 293, 302, 304, 313, 314, 318, 320, 321, 325, 326,
       331, 338, 340, 343, 344, 346, 351, 359, 361, 365, 376, 378, 379,
       380, 389, 398, 399, 400, 404, 407, 411, 412, 417, 418, 423, 430,
       449, 454, 455, 466, 469, 475, 482, 484, 490, 494, 500, 508, 511,
       513, 515, 521, 522, 523, 529, 531, 535, 541, 545, 549, 551, 562,
       569, 575, 579, 583, 587, 590, 603, 605, 612, 615, 619, 621, 628,
       632, 634, 638, 643, 652, 653, 654, 656, 662, 675, 676, 677, 679,
       685, 689, 691, 693, 694, 697, 698, 700, 

In [None]:
model_imported = load_model("./results/train+gpt3")
eval_dataset(df = test_cc, model = model_imported)

cuda


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer is attempting to log a value of "[   0    1    2 ... 3997 3998 3999]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[9.9999607e-01 3.9150150e-06]
 [9.9998903e-01 1.1025603e-05]
 [9.9998236e-01 1.7586750e-05]
 ...
 [9.9995697e-01 4.3077140e-05]
 [9.9999440e-01 5.6141339e-06]
 [9.9999368e-01 6.3284829e-06]]" of type <class 'numpy.ndarray'> for key "eval/softmax_probs" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nRecent calculations have pointed ...,1,0,chatgpt-paraphrased
1,"Abstract:\n\nIn recent times, there has been s...",1,0,chatgpt-paraphrased
2,"Abstract:\n\nNowadays, with the booming develo...",1,0,chatgpt-paraphrased
3,Abstract:\n\nTaking into account the drivers' ...,1,0,chatgpt-paraphrased
4,"Abstract:\n\nIn this study, we employ the Wang...",1,0,chatgpt-paraphrased
...,...,...,...,...
3515,"Abstract:\n\nIn this paper, we propose a model...",1,0,chatgpt-paraphrased
3516,Abstract:\n\nIn the reaction of the antiproton...,1,0,chatgpt-paraphrased
3517,Abstract:\n\nWe investigate the dynamics of so...,1,0,chatgpt-paraphrased
3518,Abstract:\n\nNear Field Communication (NFC) st...,1,0,chatgpt-paraphrased


{'eval_loss': 9.91174030303955, 'eval_accuracy': 0.12, 'eval_f1': 0.21428571428571425, 'eval_recall': 0.12, 'eval_precision': 1.0, 'eval_wrongly_classified': array([   0,    1,    2, ..., 3997, 3998, 3999]), 'eval_softmax_probs': array([[9.9999607e-01, 3.9150150e-06],
       [9.9998903e-01, 1.1025603e-05],
       [9.9998236e-01, 1.7586750e-05],
       ...,
       [9.9995697e-01, 4.3077140e-05],
       [9.9999440e-01, 5.6141339e-06],
       [9.9999368e-01, 6.3284829e-06]], dtype=float32), 'eval_runtime': 780.7127, 'eval_samples_per_second': 5.124, 'eval_steps_per_second': 2.562}


(                                                   text  label  prediction  \
 0     Abstract:\n\nRecent calculations have pointed ...      1           0   
 1     Abstract:\n\nIn recent times, there has been s...      1           0   
 2     Abstract:\n\nNowadays, with the booming develo...      1           0   
 3     Abstract:\n\nTaking into account the drivers' ...      1           0   
 4     Abstract:\n\nIn this study, we employ the Wang...      1           0   
 ...                                                 ...    ...         ...   
 3515  Abstract:\n\nIn this paper, we propose a model...      1           0   
 3516  Abstract:\n\nIn the reaction of the antiproton...      1           0   
 3517  Abstract:\n\nWe investigate the dynamics of so...      1           0   
 3518  Abstract:\n\nNear Field Communication (NFC) st...      1           0   
 3519  Abstract:\n\nLi\'enard-type equations are used...      1           0   
 
                       src  
 0     chatgpt-paraph

## Out-of-distribution ChatGPT

In [None]:
# unvomment when using the already trained model.
# model_exper = load_model("results/84-noChatGPT")

first eval how good our trained model is in detecting chatgpt when trained on chatgpt data.

second eval how good our trained model is in detecting chatgpt when not trained on chatgpt data.

In [None]:
model = load_model(f"facebook/galactica-{model_params_size}")

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [None]:
train_no_chatgpt_df = train_df[train_df["src"] != "chatgpt"]
tokenized_train_no_chatgpt, tokenized_test_no_chatgpt = get_tokenized_dataset(train_no_chatgpt_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

In [None]:
os.environ["WANDB_DISABLED"] = "false"
wandb.init(project = "galactica_paper_classifier")
training_args_no_chatgpt = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,              # total number of training epochs
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    gradient_accumulation_steps = 4,
    weight_decay=0.01,
    warmup_steps=1000,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
    eval_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    save_total_limit = 1,
    # save_strategy = "no",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
    )



trainer_no_chatgpt = Trainer(
    model=model,
    args=training_args_no_chatgpt,
    train_dataset=tokenized_train_no_chatgpt,
    eval_dataset=tokenized_test_no_chatgpt,
    compute_metrics=compute_metrics
)

In [None]:
trainer_no_chatgpt.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1000,0.361,0.152097,0.9575,0.949725,0.928159,0.972318
2000,0.1938,0.374398,0.918214,0.913552,0.999174,0.841446
3000,0.117,0.071595,0.984643,0.982106,0.974401,0.989933
4000,0.0705,0.109002,0.977857,0.973906,0.955409,0.993133
5000,0.0307,0.06986,0.988571,0.98681,0.988439,0.985185


TrainOutput(global_step=5600, training_loss=0.14078683359282357, metrics={'train_runtime': 4370.8464, 'train_samples_per_second': 10.25, 'train_steps_per_second': 1.281, 'total_flos': 4.68243777060864e+16, 'train_loss': 0.14078683359282357, 'epoch': 4.0})

How well does the model perform when tested on OOD ChatGPT dataset.

In [None]:
save_model_locally(trainer_no_chatgpt.model, "results_train-cg")

In [None]:
class_table_no_chatgpt, output_dict_no_chatgpt, df_no_chatgpt_pred= eval_dataset(df = test_df[test_df["src"] == "chatgpt"], model = trainer_no_chatgpt.model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = start_of_text \


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  1   7   9  17  18  19  21  22  23  24  30  31  32  34  42  46  49  50
  53  54  60  61  62  64  67  68  69  70  75  80  81  83  87  89  90  91
  98  99 100 101 104 105 106 107 109 110 114 115 119 120 121 122 126 130
 131 135 137 139 140 141 143 149 151 156 157 159 161 162 172 177 179 191
 192 197 199 201 205 207 208 210 211 212 218 220 221 223 226 228 234 236
 238 240 241 242 244 247 248 262 263 264 265 267 268 269 270 272 273 274
 275 278 280 281 282 284 287 292 299 302 309 310 313 314 315 317 318 321
 325 326 329 334 336 338 343 344 346 349 356 357 359 361 365 372 377 379
 380 382 384 387 391 394 395 398 400 402 403 405 407 411 412 416 418 420
 422 424 430 432 445 447 448 449 450 452 454 457 459 460 462 465 466 467
 468 469 473 475 478 479 482 484 488 489 490 499 500 504 506 507 508 509
 511 512 513 515 519 521 522 523 528 530 531 532 534 535 536 540 541 542
 543 546 547 548 550 551 559 564 565 566 567 568 569 571 574 575 577 579
 580 583 5

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThis study investigates the prese...,1,0,chatgpt
1,"Abstract:\n\nIn this note, we explore the conn...",1,0,chatgpt
2,"Abstract:\n\nIn this paper, we study the expon...",1,0,chatgpt
3,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
4,"Abstract:\n\nIn this document, we will discuss...",1,0,chatgpt
...,...,...,...,...
382,"Abstract:\n\nIn this study, we used mean squar...",1,0,chatgpt
383,"Abstract:\n\nIn this study, we present a compr...",1,0,chatgpt
384,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt
385,Abstract:\n\nThe two-body problem in celestial...,1,0,chatgpt


{'eval_loss': 2.76407527923584, 'eval_accuracy': 0.613, 'eval_f1': 0.7600743955362678, 'eval_recall': 0.613, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  1,   7,   9,  17,  18,  19,  21,  22,  23,  24,  30,  31,  32,
        34,  42,  46,  49,  50,  53,  54,  60,  61,  62,  64,  67,  68,
        69,  70,  75,  80,  81,  83,  87,  89,  90,  91,  98,  99, 100,
       101, 104, 105, 106, 107, 109, 110, 114, 115, 119, 120, 121, 122,
       126, 130, 131, 135, 137, 139, 140, 141, 143, 149, 151, 156, 157,
       159, 161, 162, 172, 177, 179, 191, 192, 197, 199, 201, 205, 207,
       208, 210, 211, 212, 218, 220, 221, 223, 226, 228, 234, 236, 238,
       240, 241, 242, 244, 247, 248, 262, 263, 264, 265, 267, 268, 269,
       270, 272, 273, 274, 275, 278, 280, 281, 282, 284, 287, 292, 299,
       302, 309, 310, 313, 314, 315, 317, 318, 321, 325, 326, 329, 334,
       336, 338, 343, 344, 346, 349, 356, 357, 359, 361, 365, 372, 377,
       379, 380, 382, 384, 387, 391, 394, 395, 39

testing on other datasets

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = real2022nlp_df, model = trainer_no_chatgpt.model)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "[  12   22   87  140  184  192  279  289  359  365  376  440  445  472
  534  599  656  657  662  774  775  781  875  898  957  965 1119 1130
 1153 1172 1265 1273 1306 1316 1342 1370 1380 1401 1480 1525 1551 1560
 1567 1574 1608 1692 1715 1728 1795 1860 1900 1919 2015 2088 2172 2175
 2280 2364 2432 2448 2549 2610 2785 2786 2797 2820 2836 2885 2897 2910
 2914 2957 3003 3007 3031 3171 3226 3235 3240 3247 3250 3294 3410 3558
 3617 3639 3651 3757 3766 3768 3817 3917 3930 3978]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[9.9999809e-01 1.9183128e-06]
 [9.9999917e-01 7.8114829e-07]
 [9.9999964e-01 3.1438216e-07]
 ...
 [9.9999964e-01 3.3205370e-07]
 [9.9999952e-01 5.1630769e-07]
 [9.9999940e-01 6.4507009e-07]]" of type <class '

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe OGS for non-abelian groups is...,0,1,real2022nlp
1,Abstract:\n\nThis work is concerned with the d...,0,1,real2022nlp
2,Abstract:\n\nGraph Neural Networks (GNNs) have...,0,1,real2022nlp
3,Abstract:\n\nSolomon and Elkin [13] constructe...,0,1,real2022nlp
4,Abstract:\n\nLet G be a connected semisimple g...,0,1,real2022nlp
...,...,...,...,...
89,"Abstract:\n\nIn this article, we deal with the...",0,1,real2022nlp
90,Abstract:\n\nWe show that every indecomposable...,0,1,real2022nlp
91,Abstract:\n\nWe consider the long-time behavio...,0,1,real2022nlp
92,Abstract:\n\nWe study the GIT quotient of the ...,0,1,real2022nlp


{'eval_loss': 0.1478167176246643, 'eval_accuracy': 0.9765, 'eval_f1': 0.0, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_wrongly_classified': array([  12,   22,   87,  140,  184,  192,  279,  289,  359,  365,  376,
        440,  445,  472,  534,  599,  656,  657,  662,  774,  775,  781,
        875,  898,  957,  965, 1119, 1130, 1153, 1172, 1265, 1273, 1306,
       1316, 1342, 1370, 1380, 1401, 1480, 1525, 1551, 1560, 1567, 1574,
       1608, 1692, 1715, 1728, 1795, 1860, 1900, 1919, 2015, 2088, 2172,
       2175, 2280, 2364, 2432, 2448, 2549, 2610, 2785, 2786, 2797, 2820,
       2836, 2885, 2897, 2910, 2914, 2957, 3003, 3007, 3031, 3171, 3226,
       3235, 3240, 3247, 3250, 3294, 3410, 3558, 3617, 3639, 3651, 3757,
       3766, 3768, 3817, 3917, 3930, 3978]), 'eval_softmax_probs': array([[9.9999809e-01, 1.9183128e-06],
       [9.9999917e-01, 7.8114829e-07],
       [9.9999964e-01, 3.1438216e-07],
       ...,
       [9.9999964e-01, 3.3205370e-07],
       [9.9999952e-01, 5.1630769e-07

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = gpt3_test, model = trainer_no_chatgpt.model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  0   1   2   3   4   6   7   8  10  11  12  13  14  15  16  17  18  19
  21  22  23  24  25  26  27  28  29  30  31  34  35  36  37  38  39  40
  41  42  43  44  45  46  47  48  49  50  51  52  53  56  58  59  60  61
  62  64  67  68  69  71  72  75  77  78  79  81  83  84  85  86  88  89
  90  91  92  93  94  96  97  98  99 100 101 102 103 104 106 107 108 109
 110 112 113 114 117 118 119 121 122 123 124 125 126 127 128 129 130 131
 132 133 134 136 137 138 139 140 141 142 143 144 145 147 148 149 150 151
 152 153 155 156 158 159 160 162 163 165 166 167 168 170 171 172 174 175
 176 177 178 179 180 181 182 183 184 185 186 187 188 189 191 192 193 194
 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
 232 233 237 238 239 240 241 242 243 244 246 247 248 249 250 251 252 253
 254 255 256 257 258 259 260 261 262 264 265 266 267 268 269 270 272 273
 274 275 2

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nWe propose a method for self-supe...,1,0,gpt32022nlp
1,"Abstract:\n\nIn this work, we propose a new gr...",1,0,gpt32022nlp
2,Abstract:\n\nA major limitation of current net...,1,0,gpt32022nlp
3,"Abstract:\n\nIn this paper, we introduce Solo-...",1,0,gpt32022nlp
4,Abstract:\n\nWe address the problem of scalabl...,1,0,gpt32022nlp
...,...,...,...,...
871,Abstract:\n\nConventional image-text represent...,1,0,gpt32022nlp
872,"Abstract:\n\nIn this paper, we present a novel...",1,0,gpt32022nlp
873,Abstract:\n\nWe consider the problem of learni...,1,0,gpt32022nlp
874,"Abstract:\n\nIn this work, we propose to impro...",1,0,gpt32022nlp


{'eval_loss': 10.253606796264648, 'eval_accuracy': 0.124, 'eval_f1': 0.22064056939501778, 'eval_recall': 0.124, 'eval_precision': 1.0, 'eval_wrongly_classified': array([  0,   1,   2,   3,   4,   6,   7,   8,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  56,  58,  59,
        60,  61,  62,  64,  67,  68,  69,  71,  72,  75,  77,  78,  79,
        81,  83,  84,  85,  86,  88,  89,  90,  91,  92,  93,  94,  96,
        97,  98,  99, 100, 101, 102, 103, 104, 106, 107, 108, 109, 110,
       112, 113, 114, 117, 118, 119, 121, 122, 123, 124, 125, 126, 127,
       128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 147, 148, 149, 150, 151, 152, 153, 155, 156,
       158, 159, 160, 162, 163, 165, 166, 167, 168, 170, 171, 172, 174,
       175, 176, 177, 178, 179, 180, 181, 182,

In [None]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
class_table, output_dict, df_pred = eval_dataset(df = test_df, model = trainer_no_chatgpt.model)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[  20  121  162  175  177  184  211  213  231  275  286  329  353  375
  392  461  482  541  585  650  756  837  849  870  888  891  894  911
  916  930  986 1096 1101 1153 1196 1234 1274 1326 1370 1409 1523 1527
 1583 1593 1619 1672 1678 1704 1713 1756 1794 1801 1815 1818 1826 1834
 1880 1936 1971 1982 1990 2052 2097 2146 2164 2170 2176 2188 2191 2243
 2282 2285 2327 2393 2409 2458 2462 2474 2514 2547 2601 2618 2637 2663
 2684 2726 2734 2737 2761 2827 2846 2849 2861 2866 2881 2885 2886 3010
 3052 3058 3073 3175 3180 3227 3230 3274 3303 3327 3359 3368 3446 3459
 3562 3591 3608 3612 3635 3661 3695 3703 3718 3719 3752 3872 3918 4012
 4024 4061 4067 4081 4092 4093 4094 4186 4207 4246 4269 4290 4293 4299
 4310 4313 4319 4364 4382 4397 4436 4461 4468 4545 4564 4566 4624 4647
 4650 4694 4710 4711 4729 4747 4770 4794 4822 4912 4914 4932 4939 4955
 4967 4974 4976 4989 5002 5017 5052 5064 5066 5091 5097 5098 5127 5181
 5198 5206 5341 5349 5372 5416 5421 

Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nThe Cygnus Loop is a well-studied...,0,1,real
1,Abstract:\n\nMultiple-Input Multiple-Output (M...,1,0,galactica
2,Abstract:\n\nThe leptonic W boson production a...,1,0,galactica
3,"Abstract:\n\nIn this document, we describe the...",1,0,chatgpt
4,"Abstract:\n\nIn this document, we will discuss...",1,0,chatgpt
...,...,...,...,...
276,"Abstract:\n\nIn this paper, I examine what I r...",0,1,real
277,"Abstract:\n\nIn this study, we used mean squar...",1,0,chatgpt
278,Abstract:\n\nBy the time that the first phase ...,0,1,real
279,Abstract:\n\nThis paper discusses the properti...,1,0,chatgpt


{'eval_loss': 0.21259896457195282, 'eval_accuracy': 0.964875, 'eval_f1': 0.964181007010835, 'eval_recall': 0.9455, 'eval_precision': 0.9836150845253576, 'eval_wrongly_classified': array([  20,  121,  162,  175,  177,  184,  211,  213,  231,  275,  286,
        329,  353,  375,  392,  461,  482,  541,  585,  650,  756,  837,
        849,  870,  888,  891,  894,  911,  916,  930,  986, 1096, 1101,
       1153, 1196, 1234, 1274, 1326, 1370, 1409, 1523, 1527, 1583, 1593,
       1619, 1672, 1678, 1704, 1713, 1756, 1794, 1801, 1815, 1818, 1826,
       1834, 1880, 1936, 1971, 1982, 1990, 2052, 2097, 2146, 2164, 2170,
       2176, 2188, 2191, 2243, 2282, 2285, 2327, 2393, 2409, 2458, 2462,
       2474, 2514, 2547, 2601, 2618, 2637, 2663, 2684, 2726, 2734, 2737,
       2761, 2827, 2846, 2849, 2861, 2866, 2881, 2885, 2886, 3010, 3052,
       3058, 3073, 3175, 3180, 3227, 3230, 3274, 3303, 3327, 3359, 3368,
       3446, 3459, 3562, 3591, 3608, 3612, 3635, 3661, 3695, 3703, 3718,
       3719, 3752

In [None]:
eval_dataset(df = test_cc, model = trainer_no_chatgpt.model)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Trainer is attempting to log a value of "[   0    1    2 ... 3997 3998 3999]" of type <class 'numpy.ndarray'> for key "eval/wrongly_classified" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[9.9998236e-01 1.7640201e-05]
 [9.9999821e-01 1.8448237e-06]
 [9.9998629e-01 1.3738256e-05]
 ...
 [9.9976999e-01 2.3005706e-04]
 [9.9999964e-01 3.5071864e-07]
 [9.9999964e-01 3.1684789e-07]]" of type <class 'numpy.ndarray'> for key "eval/softmax_probs" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Unnamed: 0,text,label,prediction,src
0,Abstract:\n\nRecent calculations have pointed ...,1,0,chatgpt-paraphrased
1,"Abstract:\n\nIn recent times, there has been s...",1,0,chatgpt-paraphrased
2,"Abstract:\n\nNowadays, with the booming develo...",1,0,chatgpt-paraphrased
3,Abstract:\n\nTaking into account the drivers' ...,1,0,chatgpt-paraphrased
4,"Abstract:\n\nIn this study, we employ the Wang...",1,0,chatgpt-paraphrased
...,...,...,...,...
3897,Abstract:\n\nThe atoms of a regular language a...,1,0,chatgpt-paraphrased
3898,"Abstract:\n\nIn this paper, we propose a model...",1,0,chatgpt-paraphrased
3899,Abstract:\n\nWe investigate the dynamics of so...,1,0,chatgpt-paraphrased
3900,Abstract:\n\nNear Field Communication (NFC) st...,1,0,chatgpt-paraphrased


{'eval_loss': 13.148548126220703, 'eval_accuracy': 0.0245, 'eval_f1': 0.04782820888238165, 'eval_recall': 0.0245, 'eval_precision': 1.0, 'eval_wrongly_classified': array([   0,    1,    2, ..., 3997, 3998, 3999]), 'eval_softmax_probs': array([[9.9998236e-01, 1.7640201e-05],
       [9.9999821e-01, 1.8448237e-06],
       [9.9998629e-01, 1.3738256e-05],
       ...,
       [9.9976999e-01, 2.3005706e-04],
       [9.9999964e-01, 3.5071864e-07],
       [9.9999964e-01, 3.1684789e-07]], dtype=float32), 'eval_runtime': 110.5984, 'eval_samples_per_second': 36.167, 'eval_steps_per_second': 18.083}


(                                                   text  label  prediction  \
 0     Abstract:\n\nRecent calculations have pointed ...      1           0   
 1     Abstract:\n\nIn recent times, there has been s...      1           0   
 2     Abstract:\n\nNowadays, with the booming develo...      1           0   
 3     Abstract:\n\nTaking into account the drivers' ...      1           0   
 4     Abstract:\n\nIn this study, we employ the Wang...      1           0   
 ...                                                 ...    ...         ...   
 3897  Abstract:\n\nThe atoms of a regular language a...      1           0   
 3898  Abstract:\n\nIn this paper, we propose a model...      1           0   
 3899  Abstract:\n\nWe investigate the dynamics of so...      1           0   
 3900  Abstract:\n\nNear Field Communication (NFC) st...      1           0   
 3901  Abstract:\n\nLi\'enard-type equations are used...      1           0   
 
                       src  
 0     chatgpt-paraph