Load the CSV file (a dataset of 6,000 sentences labelled with one of 3 politeness levels) into a pandas DataFrame.

In [98]:
import pandas as pd
import torch

# Read the full dataset; later cells split this frame into train/dev/test.
df = pd.read_csv('all_sents.csv')

# Column views: "本文" holds the sentence text, "Level" holds the gold label.
sentences, gold_labels = df["本文"], df["Level"]

Split the dataset into train (80%), dev (10%), and test (10%) sets.

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 80% of the rows for training, the remaining 20% held out.
train, holdout = train_test_split(df, train_size=0.8, random_state=42)

# Second cut: halve the hold-out into the test and dev sets (10% of the data each).
test, dev = train_test_split(holdout, train_size=0.5, random_state=42)

In [None]:
!pip install transformers
!pip install unidic_lite
!pip install sentencepiece
!pip install fugashi

The maximum length used for padding is set to the 95th percentile of the tokenized sentence lengths over the whole dataset.

In [None]:
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Only the tokenizer is needed to measure sentence lengths, so no model is loaded
# here; the classification model is created in the cell that builds the encodings.
tokenizer = AutoTokenizer.from_pretrained("line-corporation/line-distilbert-base-japanese", trust_remote_code=True)

# Number of tokens in every non-missing sentence of the full dataset.
# NOTE(review): tokenize() presumably does not count special tokens such as
# [CLS]/[SEP], while max_length in the tokenizer call does - confirm whether the
# padding length should be this percentile plus the special tokens.
sentence_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences.dropna()]

# 95th percentile of the token counts, truncated to an integer.
max_length = int(np.percentile(sentence_lengths, 95))

print(max_length)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("line-corporation/line-distilbert-base-japanese", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("line-corporation/line-distilbert-base-japanese", num_labels=3)

# Padding/truncation length shared by all three splits.
MAX_LENGTH = 68


def _encode_split(split_df):
    """Tokenize one split and build its label tensor from the same rows.

    Rows with a missing sentence or label are dropped *before* both the text and
    the labels are extracted, so encoding i always corresponds to label i.

    Args:
        split_df: DataFrame with a "本文" (sentence) and a "Level" (label) column.

    Returns:
        (encodings, labels): the tokenizer output padded/truncated to MAX_LENGTH
        as PyTorch tensors, and a torch.long tensor of the raw labels.
    """
    rows = split_df.dropna(subset=["本文", "Level"])
    encodings = tokenizer(
        list(rows["本文"]),
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    labels = torch.tensor(list(rows["Level"]), dtype=torch.long)
    return encodings, labels


train_encodings, train_labels = _encode_split(train)
dev_encodings, dev_labels = _encode_split(dev)
test_encodings, test_labels = _encode_split(test)

# Every encoded sentence must have exactly one label.
for _enc, _lab in (
    (train_encodings, train_labels),
    (dev_encodings, dev_labels),
    (test_encodings, test_labels),
):
    assert _enc["input_ids"].shape[0] == _lab.shape[0], "encodings and labels are misaligned"

# Show the shapes rather than dumping the full tensors.
train_encodings["input_ids"].shape, train_labels.shape

In [None]:
# Sanity check: show the container types produced by the tokenization cell.
tuple(type(obj) for obj in (train_encodings, train_labels))

In [None]:
from torch.utils.data import Dataset, DataLoader

class PolitenessDataset(Dataset):
    """Dataset that pairs tokenizer encodings with zero-based class labels.

    Args:
        encodings: mapping from field name to a tensor indexable by sample
            position (every value is indexed with the same sample index).
        labels: tensor of labels; 1 is subtracted from every entry on
            construction.
    """

    def __init__(self, encodings, labels):
        # The source labels run from 1 to 3, whereas a model created with
        # num_labels=3 expects class ids 0 to 2, hence the shift by one.
        self.labels = labels - 1
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of sample `idx` plus its label under "labels"."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item["labels"] = self.labels[idx]
        return item

# Number of samples per batch, shared by all three loaders.
BATCH_SIZE = 16

# Wrap each split's encodings and labels in a dataset.
train_dataset = PolitenessDataset(train_encodings, train_labels)
dev_dataset = PolitenessDataset(dev_encodings, dev_labels)
test_dataset = PolitenessDataset(test_encodings, test_labels)

# Only the training data is shuffled; the evaluation loaders keep a fixed order
# so that dev and test runs iterate the samples identically every time.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Number of training samples and number of training batches per epoch.
len(train_dataset), len(train_dataloader)

In [None]:
# transformers.AdamW is deprecated and no longer available in recent transformers
# releases, so the PyTorch implementation is used instead (torch is imported in
# the first cell). eps and weight_decay are set explicitly to the defaults of
# the former transformers implementation so the optimisation is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler
from tqdm import tqdm

# Number of passes over the training data; also determines the scheduler length.
epochs = 3

# The linear schedule decays the learning rate over every optimizer step of the
# whole run, so the step count is derived from `epochs` instead of a literal.
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# NOTE: not used by the loop below - each batch carries "labels", and the loss
# is read from the model output (outputs.loss). Kept in case other cells use it.
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc="Per Batch", unit="batch"):
        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; the batch includes "labels", so the output carries a loss
        outputs = model(**batch)
        loss = outputs.loss

        # Backpropagation
        loss.backward()

        # Update the parameters, then advance the learning-rate schedule
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    # Mean per-batch loss over this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_loss:.4f}")

**Development**

In [None]:
from sklearn.metrics import accuracy_score

# Switch the model to evaluation mode before scoring the dev set
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for batch in tqdm(dev_dataloader, desc="Per Batch", unit="batch"):
        outputs = model(**batch)

        # Predicted class = index of the largest logit; converted to numpy
        # arrays because scikit-learn consumes those
        preds = outputs.logits.argmax(dim=1).numpy()
        labels = batch["labels"].numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Accuracy over the whole dev set (only accuracy is computed here)
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
        

**Test**

In [None]:
# Switch the model to evaluation mode before scoring the test set
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Per Batch", unit="batch"):
        outputs = model(**batch)

        # Predicted class = index of the largest logit, as numpy for scikit-learn
        preds = outputs.logits.argmax(dim=1).numpy()
        labels = batch["labels"].numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Accuracy over the whole test set (only accuracy is computed here)
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

**Classify the Politeness of Given Input Sentences**

In [None]:
def classify_politeness(text):
    """Predict the politeness label of a single sentence.

    Args:
        text: the sentence to classify (one string).

    Returns:
        One of "Polite", "Neutral" or "Impolite".
    """
    # Do not rely on an earlier cell having left the model in evaluation mode:
    # training-only behaviour such as dropout would make predictions unstable.
    model.eval()

    # Tokenize input text (truncated to the same length used for the datasets)
    inputs = tokenizer(text, padding=True, truncation=True, max_length=68, return_tensors="pt")

    # Get model predictions without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index of the largest logit; .item() requires exactly one input sentence
    predicted_class = torch.argmax(logits, dim=1).item()

    # Mapping class index to politeness level.
    # NOTE(review): class ids are the dataset's "Level" values minus one; this
    # assumes level 1 is the most polite - confirm against the dataset.
    label_map = {0: "Polite", 1: "Neutral", 2: "Impolite"}

    return label_map[predicted_class]


classify_politeness("")