# Natural Language Processing using Transformers

... using the [HuggingFace][1] :hugs: library

[1]: https://huggingface.co/docs/transformers/installation

## Setup

In [None]:
from pathlib import Path
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)
import numpy as np

In [None]:
# ROOT_PATH is the parent of the current working directory; all data files
# used below live under <ROOT_PATH>/data/nlp.
ROOT_PATH = Path.cwd().parent
DATA_PATH = ROOT_PATH / "data"
# parents=True also creates DATA_PATH itself when it is missing; without it
# mkdir raises FileNotFoundError if "data" does not exist yet.
(DATA_PATH / "nlp").mkdir(parents=True, exist_ok=True)

## Load data

In [None]:
# Read the training split from <DATA_PATH>/nlp/train.csv.
train_df = pd.read_csv(DATA_PATH.joinpath("nlp", "train.csv"))

In [None]:
# Read the test split from <DATA_PATH>/nlp/test.csv.
test_df = pd.read_csv(DATA_PATH.joinpath("nlp", "test.csv"))

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Print the column names, dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Show the first rows of the training frame.
train_df.head()

## Process data

### Create input column

In [None]:
def make_input_col(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``input`` column that merges the three text fields of each row.

    The new column has the form
    ``"TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>"``.

    Args:
        df: Frame with ``context``, ``target`` and ``anchor`` columns.

    Returns:
        The same ``df`` object; it is modified in place, not copied.
    """
    context_part = "TEXT1: " + df["context"]
    target_part = "; TEXT2: " + df["target"]
    anchor_part = "; ANC1: " + df["anchor"]
    df["input"] = context_part + target_part + anchor_part
    return df

In [None]:
# Add the combined "input" text column to the training frame.
train_df = make_input_col(train_df)

In [None]:
# Add the combined "input" text column to the test frame.
test_df = make_input_col(test_df)

In [None]:
# Show the first few combined input strings.
train_df["input"].head()

### Tokenize

In [None]:
# Wrap the training frame in a `datasets.Dataset`.
train_ds = Dataset.from_pandas(df=train_df)

In [None]:
# Display the dataset summary.
train_ds

In [None]:
# Load the tokenizer published with the "microsoft/deberta-v3-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-small"
)

In [None]:
# Tokenize the "input" text of every training row, one row per call.
tok_train_ds = train_ds.map(function=lambda row: tokenizer(row["input"]))

In [None]:
# Inspect the first tokenized training example.
tok_train_ds[0]

In [None]:
# Rename the target column from "score" to "labels".
# presumably because the Trainer reads targets from "labels" - TODO confirm.
tok_train_ds = tok_train_ds.rename_column("score", "labels")

In [None]:
# Display the tokenized training dataset summary.
tok_train_ds

In [None]:
# Convert the test frame to a Dataset and tokenize its "input" text row by row.
tok_test_ds = Dataset.from_pandas(df=test_df).map(
    function=lambda row: tokenizer(row["input"])
)

In [None]:
# Display the tokenized test dataset summary.
tok_test_ds

In [None]:
# Hold out 25% of the labelled rows for evaluation; the seed is fixed.
tok_dds = tok_train_ds.train_test_split(test_size=0.25, seed=1337)

In [None]:
# Display the resulting train/test split.
tok_dds

## Train model

In [None]:
def calc_corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both inputs are flattened to 1-D before the correlation is computed, so
    a column vector of shape ``(n, 1)`` (for example single-output
    predictions) can be compared directly with a flat vector of shape
    ``(n,)``. Inputs that are already 1-D are unaffected.

    Args:
        x: Array-like of ``n`` values.
        y: Array-like of ``n`` values.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix of ``x`` and ``y``.
    """
    x_flat = np.ravel(x)
    y_flat = np.ravel(y)
    return np.corrcoef(x_flat, y_flat)[0, 1]

In [None]:
def get_corr_metric(pred):
    """Return the metrics dict passed back to the trainer.

    Args:
        pred: Iterable that unpacks into the two positional arguments of
            ``calc_corr``.

    Returns:
        A dict with the single key ``"pearson"``.
    """
    pearson = calc_corr(*pred)
    return {"pearson": pearson}

In [None]:
# Training hyperparameters: batch size, number of epochs, learning rate.
bs, epochs, lr = 128, 4, 8e-5

In [None]:
# Training configuration: cosine learning-rate schedule with a 10% warm-up,
# mixed-precision (fp16) training, and an evaluation pass after every epoch.
args = TrainingArguments(
    output_dir="outputs",
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    # Evaluation uses twice the training batch size.
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    # Weight decay must be non-negative; the previous value of -0.01 would
    # push weights away from zero (or be rejected by the optimizer).
    weight_decay=0.01,
    report_to="none",
)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head that has
# a single output (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="microsoft/deberta-v3-small",
    num_labels=1,
)

In [None]:
# Assemble the Trainer: the model and its training arguments, the tokenizer,
# the train/eval halves of the split, and the Pearson metric callback.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=get_corr_metric,
    eval_dataset=tok_dds["test"],
    train_dataset=tok_dds["train"],
)

In [None]:
# Run training with the configuration above.
trainer.train()

### Generate predictions

In [None]:
# Run the trained model over the tokenized test set.
preds = trainer.predict(tok_test_ds)

In [None]:
# Peek at the first ten raw predictions.
preds.predictions[:10]

## Submit to Kaggle

In [None]:
# Drop the raw text fields and the tokenizer outputs from the test dataset,
# then convert the remaining columns to a pandas DataFrame.
sub_df = tok_test_ds.remove_columns(
    [
        "anchor",
        "target",
        "context",
        "input",
        "input_ids",
        "token_type_ids",
        "attention_mask",
    ]
).to_pandas()

In [None]:
# Attach the model outputs as the "score" column.
# NOTE(review): predictions presumably has shape (n, 1) because the model has
# a single output - confirm the column ends up as one scalar per row.
sub_df["score"] = preds.predictions

In [None]:
# Clamp negative predictions to 0.
# NOTE(review): only the lower bound is enforced here; confirm whether scores
# above the valid maximum also need clamping.
sub_df["score"] = sub_df["score"].clip(lower=0)

In [None]:
# Write the submission file. index=False keeps the DataFrame's row index out
# of the CSV, so the file contains only the columns of sub_df.
sub_df.to_csv(DATA_PATH / "nlp" / "submission.csv", index=False)