https://www.kaggle.com/code/taranmarley/distilbert-and-eda-tutorial

https://www.kaggle.com/code/butorinvasiliy/notebook-1-nlp-disaster-tweets

## Importing some libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Core packages for text processing.

import string
import re

# Libraries for text preprocessing.

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading some sklearn packaces for modelling.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score

# Some packages for word clouds and NER.

from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
from PIL import Image
import spacy

# Setting some options for general use.

stop = set(stopwords.words('english'))
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.5)
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/sophot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sophot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sophot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/sophot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the data from the CSV file into a DataFrame
data = pd.read_csv('preprocessed_train.csv')

# Preview the first few rows of the data
data.head(2)

Unnamed: 0,id,label,lemma_str
0,TRAIN_00000,3,israel parliament start winter session jerusal...
1,TRAIN_00001,2,twothirds business owner say prepared outbreak...


### Basic Data Exploration

In [3]:
print(data.shape)

(47399, 3)


In [4]:
print(data.dtypes)

id           object
label         int64
lemma_str    object
dtype: object


In [5]:
print(data['label'].value_counts())

0    14146
1    10961
2     9379
3     8946
4     2461
5     1022
6      278
7      206
Name: label, dtype: int64


In [6]:
print(data.isnull().sum())

id           0
label        0
lemma_str    0
dtype: int64


In [7]:
num_classes = data['label'].nunique()
print("Number of classes: ", num_classes)

Number of classes:  8


## Preprocessing
Some common preprocessing steps for news articles include:

**Tokenization**: breaking the text into individual words or tokens <br />
**Lowercasing**: converting all text to lowercase to avoid duplicate words due to capitalization <br />
**Removing stop words**: common words like "the", "and", "is", etc. that do not provide much meaning. <br />
**Stemming or lemmatization**: reducing words to their base form to group together similar words <br />
**Removing special characters, numbers, and punctuation** <br />

In [3]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Applying helper functions

data['text_clean'] = data['text'].apply(lambda x: remove_URL(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_emoji(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_html(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_punct(x))

In [4]:
data.head(2)

Unnamed: 0,id,text,text_clean
0,TEST_00000,"According to the regional office, the foreign ...",According to the regional office the foreign m...
1,TEST_00001,"According to a defense reporter, Foreign Minis...",According to a defense reporter Foreign Minist...


In [5]:
data['tokenized'] = data['text_clean'].apply(word_tokenize)
data.drop(['text_clean', 'text'], axis=1, inplace=True)
data.head(2)

Unnamed: 0,id,tokenized
0,TEST_00000,"[According, to, the, regional, office, the, fo..."
1,TEST_00001,"[According, to, a, defense, reporter, Foreign,..."


In [7]:
# Lower casing clean text.

data['lower'] = data['tokenized'].apply(
    lambda x: [word.lower() for word in x])

data.drop(['tokenized'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,lower
0,TEST_00000,"[according, to, the, regional, office, the, fo..."
1,TEST_00001,"[according, to, a, defense, reporter, foreign,..."


In [8]:
# Removing stopwords.

data['stopwords_removed'] = data['lower'].apply(
    lambda x: [word for word in x if word not in stop])

data.drop(['lower'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,stopwords_removed
0,TEST_00000,"[according, regional, office, foreign, ministe..."
1,TEST_00001,"[according, defense, reporter, foreign, minist..."


In [9]:
# Applying part of speech tags.

data['pos_tags'] = data['stopwords_removed'].apply(nltk.tag.pos_tag)

data.drop(['stopwords_removed'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,pos_tags
0,TEST_00000,"[(according, VBG), (regional, JJ), (office, NN..."
1,TEST_00001,"[(according, VBG), (defense, NN), (reporter, N..."


In [10]:
# Converting part of speeches to wordnet format.

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


data['wordnet_pos'] = data['pos_tags'].apply(
    lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

data.drop(['pos_tags'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,wordnet_pos
0,TEST_00000,"[(according, v), (regional, a), (office, n), (..."
1,TEST_00001,"[(according, v), (defense, n), (reporter, n), ..."


In [12]:
# Applying word lemmatizer.

wnl = WordNetLemmatizer()

data['lemmatized'] = data['wordnet_pos'].apply(
    lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

data.drop(['wordnet_pos'], axis=1, inplace=True)

data['lemmatized'] = data['lemmatized'].apply(
    lambda x: [word for word in x if word not in stop])

data['lemma_str'] = [' '.join(map(str, l)) for l in data['lemmatized']]

data.drop(['lemmatized'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,lemma_str
0,TEST_00000,accord regional office foreign minister member...
1,TEST_00001,accord defense reporter foreign minister moham...


In [13]:
# Save preprocessed data to file
data.to_csv('preprocessed_test.csv', index=False)

## Above codes are preprocessing to remove unnecessaries (ex: hashtag)

### DistilRobert-a

In [3]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AdamW

MODEL_NAME = "microsoft/deberta-v3-base"

# Define the hyperparameters
max_length = 512

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Load the pretrained DistilRoberta tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize the text data and convert it to PyTorch tensors
train_encodings = tokenizer(train_data['lemma_str'].tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_data['lemma_str'].tolist(), truncation=True, padding=True, max_length=max_length)

train_labels = train_data['label'].tolist()
val_labels = val_data['label'].tolist()

Downloading (…)okenizer_config.json: 100%|█████████████████████████| 52.0/52.0 [00:00<00:00, 4.94kB/s]
Downloading (…)lve/main/config.json: 100%|███████████████████████████| 579/579 [00:00<00:00, 62.1kB/s]
Downloading spm.model: 100%|█████████████████████████████████████| 2.46M/2.46M [00:00<00:00, 11.2MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # calculate focal loss
        alpha = [0.139, 0.179, 0.212, 0.221, 0.815, 1.891, 6.016, 8.694]
        alpha = torch.tensor(alpha).to(self.args.device)
        gamma = 2.0

        ce_loss = torch.nn.functional.cross_entropy(logits, labels, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = alpha[labels] * (1-pt)**gamma * ce_loss

        loss = torch.mean(focal_loss)

        return (loss, outputs) if return_outputs else loss

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)

# Load the AutoModel
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=data['label'].nunique())


# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Downloading pytorch_model.bin: 100%|███████████████████████████████| 371M/371M [00:40<00:00, 9.22MB/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a m

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [6]:
learning_rate = 5e-5
epochs = 7

# Define a function to compute the F1 score
def compute_f1(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    return {"f1": f1}


# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=epochs,         # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,              # strength of weight decay
    save_total_limit=1,             # limit the total number of checkpoints to save
    gradient_accumulation_steps=4,  # number of gradient accumulation steps
    learning_rate=learning_rate,    # learning rate
    metric_for_best_model='f1',     # use F1 score to determine the best model
    greater_is_better=True,          # maximize the F1 score
    load_best_model_at_end=True,
    fp16=True,
    lr_scheduler_type='cosine',
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Set up the Trainer class
trainer = CustomTrainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    data_collator=lambda data: {'input_ids': torch.stack([item['input_ids'] for item in data]),
                                'attention_mask': torch.stack([item['attention_mask'] for item in data]),
                                'labels': torch.tensor([item['labels'] for item in data])},
    compute_metrics=compute_f1           # evaluation metric function
)

Using cuda_amp half precision backend


## Training

In [None]:
# Train the Deberta-v3 model
trainer.train()   # Batch 8, 7 Epochs

***** Running training *****
  Num examples = 42659
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 9331
  Number of trainable parameters = 184428296


Epoch,Training Loss,Validation Loss


In [14]:
# Train the model
trainer.train()   # Batch 8, 7 Epochs

***** Running training *****
  Num examples = 42659
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 9331
  Number of trainable parameters = 82124552


Epoch,Training Loss,Validation Loss,F1
0,0.0723,0.046667,0.895573
1,0.0331,0.034288,0.912785
2,0.0233,0.028082,0.93044
3,0.0118,0.028225,0.936845
4,0.0066,0.034382,0.927745
5,0.0034,0.039941,0.934975
6,0.0022,0.043164,0.937116


***** Running Evaluation *****
  Num examples = 4740
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1333
Configuration saved in ./results/checkpoint-1333/config.json
Model weights saved in ./results/checkpoint-1333/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4740
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2666
Configuration saved in ./results/checkpoint-2666/config.json
Model weights saved in ./results/checkpoint-2666/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4740
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-3999
Configuration saved in ./results/checkpoint-3999/config.json
Model weights saved in ./results/checkpoint-3999/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1333] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4740
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-5332
Configuration saved in ./results/checkpoint

TrainOutput(global_step=9331, training_loss=0.02902296196672135, metrics={'train_runtime': 8205.7641, 'train_samples_per_second': 36.391, 'train_steps_per_second': 1.137, 'total_flos': 3.956032247291904e+16, 'train_loss': 0.02902296196672135, 'epoch': 7.0})

## Save & Load Model

In [15]:
# specify the directory where you want to save the model
model_dir = "./deberta-v3-focalloss"

# save the trained model
model.save_pretrained(model_dir)

Configuration saved in ./distilroberta-focalloss/config.json
Model weights saved in ./distilroberta-focalloss/pytorch_model.bin


In [7]:
# # load the saved model
model_dir = ""
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

## Validation

In [16]:
# initialize the true and predicted labels
true_labels = []
predicted_labels = []

# iterate over the validation set and make predictions
with torch.no_grad():
    for idx in tqdm(range(len(val_dataset))):
        # get the input features and labels
        inputs = val_dataset[idx]
        labels = inputs.pop("labels")
        
        # move the inputs and labels to the device
        input_ids = torch.tensor(inputs["input_ids"]).to(device)
        attention_mask = torch.tensor(inputs["attention_mask"]).to(device)

        labels = labels.to(device)
        
        # make predictions
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        logits = outputs.logits
        
        predictions = torch.argmax(logits, dim=1)
        
        # append the true and predicted labels to the lists
        true_labels.append(labels.cpu().numpy())
        predicted_labels.append(predictions.cpu().numpy())

# compute the F1 score using macro-average
f1 = f1_score(true_labels, predicted_labels, average="macro")
print(f"Macro-Average F1 Score: {f1:.4f}")

100%|███████████████████████████████████████████████████████████| 4740/4740 [00:54<00:00, 86.27it/s]

Macro-Average F1 Score: 0.9371





## Test

In [17]:
# Load the data from the CSV file into a DataFrame
test_data = pd.read_csv('preprocessed_test.csv')
print(test_data.shape)

(83334, 2)


In [18]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
import torch.nn as nn

In [20]:
# initialize the true and predicted labels
predicted_proba = []

# iterate over the test set and make predictions
with torch.no_grad():
    for idx in tqdm(range(len(test_data))):
        # get the input features and labels
        test_encodings = tokenizer(test_data.iloc[idx]['lemma_str'], truncation=True, padding=True, max_length=max_length)
        
        # move the inputs and labels to the device
        input_ids = torch.tensor(test_encodings["input_ids"]).to(device)
        attention_mask = torch.tensor(test_encodings["attention_mask"]).to(device)
        
        # make predictions
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        
        logits = outputs.logits
        
        predicted_proba.append(nn.Softmax(dim=1)(logits).squeeze(0).cpu().numpy())


np.savetxt('deberta_predictions_proba.txt', np.array(predicted_proba))


100%|████████████████████████████████████████████████████████| 83334/83334 [13:12<00:00, 105.19it/s]


In [21]:
predicted_labels = np.argmax(predicted_proba, axis=1)

# Make predictions on the test data and save to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'label': predicted_labels})
submission_df.to_csv('submission.csv', index=False)