# Combinalion of linguistic features and fine-tuned `roberta-base`

In [1]:
import json
import os
import pickle
from collections import Counter

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from scipy.sparse import hstack, load_npz
from tqdm.auto import tqdm

In [7]:
DIRECTORY = "./processed_data"
DIRECTORY_MAIN = "."

## Data

In [9]:
def jsonl_read(path):
    """Read jsonl data from file.

    Args:
    path: path to the jsonl file.

    Returns:
    list of json objects.
    """

    with open(os.path.join(DIRECTORY_MAIN, path), "r") as f:
        data = [json.loads(line) for line in f]
    return data

def jsonl_write(path, data):
    """Write jsonl data to file.

    Args:
    path: path to the output jsonl file.
    data: list of json objects.
    """

    with open(os.path.join(DIRECTORY_MAIN, path), "w") as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")

def get_data():
    train = jsonl_read("SubtaskA/subtaskA_train_monolingual.jsonl")
    dev = jsonl_read("SubtaskA/subtaskA_dev_monolingual.jsonl")
    return train, dev

To get train and dev datasets:

In [10]:
train, test = get_data()
y_train = [entry["label"] for entry in train]
y_test = [entry["label"] for entry in test]

### Features: Stylometry

In [11]:
def get_stylometry_features(have_svd=True):
    train_pos_tf = load_npz(os.path.join(DIRECTORY, "pos_tf.npz"))
    train_shape_tf = load_npz(os.path.join(DIRECTORY, "shape_tf.npz"))
    style_train = hstack([train_pos_tf, train_shape_tf])

    dev_pos_tf = load_npz(os.path.join(DIRECTORY, "pos_tf_dev.npz"))
    dev_shape_tf = load_npz(os.path.join(DIRECTORY, "shape_tf_dev.npz"))
    style_test = hstack([dev_pos_tf, dev_shape_tf])

    transformer = MaxAbsScaler().fit(style_train)
    style_train = transformer.transform(style_train)
    style_test = transformer.transform(style_test)

    if not have_svd:
        svd = TruncatedSVD(n_components=768)
        svd.fit(style_train)
        with open(os.path.join(DIRECTORY, "svd_768.pkl"), "wb") as file:
            pickle.dump(svd, file)

    with open(os.path.join(DIRECTORY, "svd_768.pkl"), "rb") as file:
        svd = pickle.load(file)
    style_train = svd.transform(style_train)
    style_test = svd.transform(style_test)
    return style_train, style_test

To get dense stylometry vectors:

In [13]:
style_train, style_test = get_stylometry_features()

## Features: Others

In [14]:
def combine_data(file_name_div: str,
                 file_name_read: str,
                 file_name_rst: str,
                 file_name_ent: str) -> pd.DataFrame:
    data_div = pd.read_json(os.path.join(DIRECTORY, file_name_div), lines=True)
    data_div.rename(columns={col: f"DIV_{col}" for col in data_div.columns[4:]},
                    inplace=True)

    data_read = pd.read_json(os.path.join(DIRECTORY, file_name_read), lines=True)
    if "text" in data_read:
        data_read.drop(columns=["text"], inplace=True)
    data_read.dropna(subset=["id"], inplace=True)
    data_read["id"] = data_read["id"].astype("int")
    data_read.reset_index(inplace=True)
    data_read.rename(columns={col: f"READ_{col}" for col in data_read.columns[5:]},
                    inplace=True)

    data_rst = pd.read_csv(os.path.join(DIRECTORY, file_name_rst), index_col=0)
    data_rst["id"] = data_rst["id"].astype("int")
    data_rst.rename(columns={col: f"RST_{col}" for col in data_rst.columns[3:]},
                    inplace=True)

    data_ent = pd.read_csv(os.path.join(DIRECTORY, file_name_ent), index_col=0)
    data_ent.sort_values(by="id", inplace=True)
    data_ent["id"] = data_ent["id"].astype("int")
    data_ent.rename(columns={col: f"ENT_{col}" for col in data_ent.columns[1:]},
                    inplace=True)

    data = pd.concat([data_div,
                    data_read.iloc[:, 5:],
                    data_rst.iloc[:, 3:],
                    data_ent.iloc[:, 1:]], axis=1)
    data.dropna(inplace=True)
    return data

To get `pd.DataFrame` with other features:

In [15]:
features_train_ = combine_data("train_diversities.jsonl",
                              "readability_train.jsonl",
                              "rst_train.csv",
                              "entity_grid_train.csv").iloc[:, 4:]
features_test_ = combine_data("dev_diversities.jsonl",
                              "readability_dev.jsonl",
                              "rst_dev.csv",
                              "entity_grid_dev.csv").iloc[:, 4:]

## Fine-tuned `roberta-base`

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

tokenizer = AutoTokenizer.from_pretrained(os.path.join(DIRECTORY_MAIN, "best_roberta"))
model = AutoModel.from_pretrained(os.path.join(DIRECTORY_MAIN, "best_roberta"),
                                  num_labels=len(label2id),
                                  id2label=id2label,
                                  label2id=label2id,
                                  torch_dtype=torch.float16).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/data_semeval/task8/best_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def get_text_embeddings(texts: list) -> torch.Tensor:
    batch_size = 16
    model.eval()
    results = []
    for i in tqdm(range(0, len(texts), batch_size)):
        encoded_inputs = tokenizer(texts[i:i+batch_size], padding=True, truncation=True, return_tensors="pt")
        batch_inputs = {k: v.to(device) for k, v in encoded_inputs.items()}
        with torch.no_grad():
            outputs = model(**batch_inputs)
        pooled_output = outputs[1]
        results.extend(list(pooled_output.cpu().numpy()))
    return results

To get embeddings:

In [20]:
train_texts = [entry["text"] for entry in train]
train_embeddings = np.array(get_text_embeddings(train_texts))

test_texts = [entry["text"] for entry in test]
test_embeddings = np.array(get_text_embeddings(test_texts))

  0%|          | 0/7485 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

## Configuration

Here you may configure which features will be passed to the feed-forward neural network.

In [39]:
reduce_train = True

emb = True
sty = False
feats_all = False
feat = "div" # "read", "rst", "ent"; None if do not need any of these features

In [51]:
features_train = []
features_test = []

if emb:
    features_train.append(train_embeddings)
    features_test.append(test_embeddings)
if sty:
    features_train.append(style_train)
    features_test.append(style_test)
if feats_all:
    features_train.append(features_train_.to_numpy())
    features_test.append(features_test_.to_numpy())
elif feat:
    prefix = f"{feat.upper()}_"
    features_train.append(features_train_.filter(like=prefix, axis=1).to_numpy())
    features_test.append(features_test_.filter(like=prefix, axis=1).to_numpy())

if reduce_train:
    X_train_main = np.concatenate(tuple(features_train), axis=1)[:70000]
    y_train_main = y_train[:70000]
else:
    X_train_main = np.concatenate(tuple(features_train), axis=1)
    y_train_main = y_train
X_test_main = np.concatenate(tuple(features_test), axis=1)
y_test_main = y_test

input_dim = X_train_main.shape[1]

In [53]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

To prepare data for the feed-forward neural network:

In [69]:
feats_train = torch.tensor(X_train_main, dtype=torch.float).to(device)
labels_train = torch.tensor(y_train_main, dtype=torch.long).to(device)
dataset_train = TextDataset(feats_train, labels_train)
dataloader_train = DataLoader(dataset_train, batch_size=128, shuffle=True)

feats_test = torch.tensor(X_test_main, dtype=torch.float).to(device)
labels_test = torch.tensor(y_test_main, dtype=torch.long).to(device)
dataset_test = TextDataset(feats_test, labels_test)
dataloader_test = DataLoader(dataset_test, batch_size=128, shuffle=False)

## Feed-forward neural network

In [70]:
class CombinedModel(nn.Module):
    def __init__(self, input_dim):
        super(CombinedModel, self).__init__()
        self.network = nn.Sequential(nn.Linear(input_dim, 512),
                                     nn.BatchNorm1d(512),
                                     nn.ReLU(),
                                     nn.Dropout(0.5),

                                     nn.Linear(512, 64),
                                     nn.BatchNorm1d(64),
                                     nn.ReLU(),
                                     nn.Dropout(0.5),

                                     nn.Linear(64, 2))

    def forward(self, combined_input):
        logits = self.network(combined_input)
        return logits

Hyperparameters:

In [82]:
epochs = 25
learning_rate = 0.00005
weight_decay = 0.01

early_stopping = True
patience = 10
patience_counter = 0
best_metric = 0

In [83]:
model = CombinedModel(input_dim=input_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay)

To run training:

In [84]:
for epoch in range(epochs):
    for texts, labels in dataloader_train:
        model.train()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if early_stopping:
        model.eval()
        y_preds = []
        y_golds = []
        with torch.no_grad():
            for texts, labels in dataloader_test:
                y_pred = model(texts)
                y_pred = torch.argmax(nn.functional.softmax(y_pred, dim=1), dim=1)
                y_preds.extend(y_pred.cpu().numpy())
                y_golds.extend(labels.cpu().numpy())
            val_metric = accuracy_score(y_golds, y_preds)
            if val_metric > best_metric:
                best_metric = val_metric
                patience_counter = 0
                torch.save(model.state_dict(), os.path.join(DIRECTORY, "best_model_combination.pt"))
            else:
                patience_counter += 1
            print(f"Epoch {epoch}, Loss: {loss.item()}")
            if patience_counter >= patience:
                print(f"Stopping early at epoch {epoch}")
                print("Best metric:", round(best_metric, 2))
                break

if early_stopping:
    model.load_state_dict(torch.load(os.path.join(DIRECTORY, "best_model_combination.pt")))

Epoch 0, Loss: 0.12067081034183502
Epoch 1, Loss: 0.07534424960613251
Epoch 2, Loss: 0.02977200411260128
Epoch 3, Loss: 0.023252204060554504
Epoch 4, Loss: 0.01755269058048725
Epoch 5, Loss: 0.016290143132209778
Epoch 6, Loss: 0.010352612473070621
Epoch 7, Loss: 0.010338732041418552
Epoch 8, Loss: 0.01187317818403244
Epoch 9, Loss: 0.009315061382949352
Epoch 10, Loss: 0.010957881808280945
Epoch 11, Loss: 0.014286325313150883
Epoch 12, Loss: 0.00576710095629096
Epoch 13, Loss: 0.006135058123618364
Epoch 14, Loss: 0.00753409881144762
Epoch 15, Loss: 0.00587966525927186
Epoch 16, Loss: 0.006713004317134619
Epoch 17, Loss: 0.010569528676569462
Epoch 18, Loss: 0.007149464916437864
Epoch 19, Loss: 0.008431333117187023
Epoch 20, Loss: 0.008094169199466705
Epoch 21, Loss: 0.007831797003746033
Epoch 22, Loss: 0.009384943172335625
Epoch 23, Loss: 0.009126202203333378
Epoch 24, Loss: 0.008481038734316826
Stopping early at epoch 24
Best metric: 0.95


### Evaluation and predictions

In [85]:
model.eval()
y_preds = []
y_golds = []

with torch.no_grad():
    for texts, labels in dataloader_test:
        y_pred = model(texts)
        y_pred = torch.argmax(nn.functional.softmax(y_pred, dim=1), dim=1)
        y_preds.extend(y_pred.cpu().numpy())
        y_golds.extend(labels.cpu().numpy())
    print(f"Accuracy: {round(accuracy_score(y_golds, y_preds), 2)}, F1: {round(f1_score(y_golds, y_preds), 2)}")

Accuracy: 0.95, F1: 0.95


In [87]:
result = []
for i, pred in enumerate(y_preds):
    entry = dict()
    entry["id"] = int(test[i]["id"])
    entry["label"] = int(pred)
    result.append(entry)

jsonl_write(os.path.join(DIRECTORY, "best_predictions.jsonl"), result)