In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

In [2]:
exp = "exp025"

path = Path(f"../outputs/{exp}")
path.mkdir(exist_ok=True)

In [3]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
clothing_master_df = pd.read_csv("../data/clothing_master.csv")
sample_submission_df = pd.read_csv("../data/sample_submission.csv")

# Preprocessing

In [4]:
train_column_names = {
    "Clothing ID": "clothing_id",
    "Age": "age",
    "Title": "title",
    "Review Text": "review_text",
    "Rating": "rating",
    "Recommended IND": "recommended",
    "Positive Feedback Count": "positive_feedback_count",
}

test_column_names = {
    "Clothing ID": "clothing_id",
    "Age": "age",
    "Title": "title",
    "Review Text": "review_text",
}

clothing_master_column_names = {
    "Clothing ID": "clothing_id",
    "Division Name": "division_name",
    "Department Name": "department_name",
    "Class Name": "class_name",
}

train_df = train_df.rename(columns=train_column_names)
test_df = test_df.rename(columns=test_column_names)
clothing_master_df = clothing_master_df.rename(columns=clothing_master_column_names)

train_df = pd.merge(train_df, clothing_master_df, on="clothing_id", how="left")
test_df = pd.merge(test_df, clothing_master_df, on="clothing_id", how="left")

# Dataset

In [6]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512, mode="train"):
        self.texts = df["text"]
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode

        if mode == "train":
            self.labels = df["recommended"]
            self.clothing_ids = df["clothing_id"]
            self.rating = df["rating"]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        if self.mode == "train":
            label = self.labels[idx]
            output = {
                "input_ids": encoding["input_ids"].flatten(),
                "attention_mask": encoding["attention_mask"].flatten(),
                "labels": torch.tensor(label, dtype=torch.long),
                "rating": torch.tensor(self.rating[idx], dtype=torch.float32),
            }
            return output
        else:
            output = {
                "input_ids": encoding["input_ids"].flatten(),
                "attention_mask": encoding["attention_mask"].flatten(),
            }
            return output

In [7]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, x, mask):
        return (x * mask.unsqueeze(-1)).sum(1) / mask.sum(1).unsqueeze(-1)


class BERTModel(torch.nn.Module):
    def __init__(self, model_name: str, num_labels: int, tokenizer=None):
        super(BERTModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.bert.resize_token_embeddings(len(tokenizer))

        self.pooling = MeanPooling()

        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.rating_head = nn.Linear(self.bert.config.hidden_size, 1)

        self.num_labels = num_labels
        self.loss_fn = nn.CrossEntropyLoss()
        self.mse = nn.MSELoss()

        self.freeze(4)

    def freeze(self, num_freeze_layers):
        """
        Freeze the BERT model up to num_freeze_layers
        """
        for layer in self.bert.encoder.layer[:num_freeze_layers]:
            for param in layer.parameters():
                param.requires_grad = False

    def forward(
        self,
        input_ids,
        attention_mask,
        labels=None,
        rating=None,
    ):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.pooling(outputs.last_hidden_state, attention_mask)

        logits = self.classifier(output)

        loss = None
        if labels is not None:
            rating_pred = self.rating_head(output).squeeze(dim=-1)
            ce_loss = self.loss_fn(logits, labels)
            rating_loss = self.mse(rating_pred, rating)

            loss = ce_loss + rating_loss

        return (loss, logits) if loss is not None else logits

In [8]:
def create_text_column(df: pd.DataFrame, sep_token: str) -> pd.DataFrame:
    text_df = df.copy()
    text_df["text"] = (
        "title: "
        + text_df["title"].fillna("no title").astype(str)
        + sep_token
        + "review_text: "
        + text_df["review_text"].fillna("no review").astype(str)
        + sep_token
        + "age: "
        + text_df["age"].fillna("nan").astype(str)
        + sep_token
        + "division_name: "
        + text_df["division_name"].fillna("nan").astype(str)
        + sep_token
        + "department_name: "
        + text_df["department_name"].fillna("nan").astype(str)
        + sep_token
        + "class_name: "
        + text_df["class_name"].fillna("nan").astype(str)
    )
    return text_df

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = torch.tensor(pred.predictions)
    preds = F.softmax(preds, dim=1)[:, 1].numpy()

    if len(set(labels)) > 1:
        auc = roc_auc_score(labels, preds)
        return {"auc": auc}
    else:
        return {}

# Add newline Token

In [10]:
model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
new_tokens = ["<newline>"]
tokenizer.add_tokens(new_tokens)



1

In [11]:
def replace_newlines_with_token(text, token="<newline>"):
    return text.replace("\n", token)

In [12]:
labels = train_df["recommended"].to_numpy()

train_text_df = create_text_column(train_df, sep_token=tokenizer.sep_token)
test_text_df = create_text_column(test_df, sep_token=tokenizer.sep_token)

train_text_df["text"] = train_text_df["text"].apply(replace_newlines_with_token)
test_text_df["text"] = test_text_df["text"].apply(replace_newlines_with_token)

test_dataset = TextDataset(test_text_df, tokenizer, max_len=192, mode="test")

# Train

In [14]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(train_df))
for fold, (train_idx, val_idx) in enumerate(
    cv.split(train_df, labels, groups=train_df["clothing_id"])
):
    _train_text_df = train_text_df.iloc[train_idx].reset_index(drop=True)
    _valid_text_df = train_text_df.iloc[val_idx].reset_index(drop=True)

    train_dataset = TextDataset(
        _train_text_df,
        tokenizer,
        max_len=192,
    )
    val_dataset = TextDataset(_valid_text_df, tokenizer, max_len=192)

    model = BERTModel(model_name, num_labels=2, tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=f"../outputs/{exp}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=1e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=f"../outputs/{exp}/logs",
        logging_steps=10,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="auc",
        greater_is_better=True,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        dataloader_num_workers=4,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()

    trainer.save_model(f"../outputs/{exp}/fold{fold}")

    valid_pred = trainer.predict(val_dataset)
    valid_pred = torch.tensor(valid_pred.predictions)
    valid_pred = F.softmax(valid_pred, dim=1)[:, 1].numpy()
    oof[val_idx] = valid_pred

    np.save(f"../outputs/{exp}/valid_pred_fold{fold}.npy", valid_pred)

np.save(f"../outputs/{exp}/oof.npy", oof)
overall_auc = roc_auc_score(labels, oof)
print(f"Overall AUC: {overall_auc}")

  return self.fget.__get__(instance, owner)()
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshu421[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Auc
100,1.346,0.862409,0.949706
200,0.5383,0.787335,0.963363
300,0.7861,0.606786,0.963181
400,0.6319,0.588965,0.966034
500,0.5905,0.645538,0.973004
600,0.3955,0.713999,0.970397
700,0.4854,0.60011,0.971741
800,0.5306,0.539679,0.974131
900,0.4155,0.525679,0.973574
1000,0.4364,0.504025,0.974868




Step,Training Loss,Validation Loss,Auc
100,0.7283,0.788369,0.942808
200,0.8095,1.058834,0.969434
300,0.7299,0.596365,0.970071
400,0.6853,0.572534,0.969119
500,0.6512,0.596622,0.971782
600,0.4745,0.585214,0.969578
700,0.4887,0.550899,0.974594
800,0.5691,0.501674,0.973937
900,0.3159,0.514106,0.972905
1000,0.3188,0.578184,0.972568




Step,Training Loss,Validation Loss,Auc
100,0.8304,0.94729,0.930074
200,0.9247,0.614818,0.960961
300,0.6627,0.622003,0.967142
400,0.4853,0.553373,0.969574
500,0.5282,0.583163,0.97244
600,0.4086,0.521775,0.975846
700,0.4555,0.565306,0.971364
800,0.4251,0.583154,0.972972
900,0.4392,0.59906,0.973822
1000,0.3575,0.567931,0.973707




Step,Training Loss,Validation Loss,Auc
100,0.9208,1.414712,0.913845
200,0.6955,0.721839,0.946867
300,0.4773,0.663318,0.958243
400,0.6318,0.576784,0.962774
500,0.5288,0.594078,0.965556
600,0.3641,0.656586,0.966049
700,0.3568,0.688003,0.963138
800,0.3002,0.531404,0.966284
900,0.5537,0.512427,0.968852
1000,0.6058,0.522218,0.969206




Step,Training Loss,Validation Loss,Auc
100,0.942,0.914464,0.904659
200,0.6668,0.654985,0.965749
300,0.6533,0.812767,0.968573
400,0.5173,0.548471,0.969716
500,0.5663,0.508103,0.969769
600,0.4961,0.590014,0.969637
700,0.3937,0.570586,0.971595
800,0.513,0.55918,0.972527
900,0.4843,0.532066,0.970782
1000,0.5419,0.534174,0.971262


Overall AUC: 0.9725615635677455


# Inference

In [16]:
import numpy as np
import torch
import torch.nn.functional as F
from safetensors.torch import load_file
from transformers import Trainer, TrainingArguments


def load_model_and_predict(model_path, test_dataset, model_class, model_name):
    model = model_class(model_name, num_labels=2, tokenizer=tokenizer)
    state_dict = load_file(f"{model_path}/model.safetensors")
    model.load_state_dict(state_dict)

    training_args = TrainingArguments(
        output_dir="./outputs",
        per_device_eval_batch_size=16,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
    )

    predictions = trainer.predict(test_dataset)
    predictions = torch.tensor(predictions.predictions)
    predictions = F.softmax(predictions, dim=1)[:, 1].numpy()

    return predictions


test_text_df = create_text_column(test_df, sep_token=tokenizer.sep_token)
test_dataset = TextDataset(test_text_df, tokenizer, max_len=256, mode="test")

test_predictions = np.zeros((len(test_text_df), 5))

for fold in range(5):
    model_path = f"../outputs/{exp}/fold{fold}"
    test_predictions[:, fold] = load_model_and_predict(
        model_path, test_dataset, BERTModel, model_name
    )

np.save(f"../outputs/{exp}/test_predictions_{exp}.npy", test_predictions)
final_predictions = test_predictions.mean(axis=1)

[0.99894567 0.47369413 0.99834218 ... 0.99875703 0.99887633 0.99915528]


In [17]:
sample_submission_df["target"] = final_predictions
sample_submission_df.to_csv(f"../outputs/{exp}/submission_{exp}.csv", index=False)
sample_submission_df

Unnamed: 0,target
0,0.998946
1,0.473694
2,0.998342
3,0.037251
4,0.998061
...,...
11150,0.998645
11151,0.998808
11152,0.998757
11153,0.998876
