https://www.kaggle.com/code/taranmarley/distilbert-and-eda-tutorial

https://www.kaggle.com/code/butorinvasiliy/notebook-1-nlp-disaster-tweets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Core packages for text processing.

import string
import re

# Libraries for text preprocessing.

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading some sklearn packaces for modelling.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score

# Some packages for word clouds and NER.

from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
from PIL import Image
import spacy

# Setting some options for general use.

stop = set(stopwords.words('english'))
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.5)
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/sophot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sophot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sophot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/sophot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the data from the CSV file into a DataFrame
data = pd.read_csv('preprocessed_train.csv')

# Preview the first few rows of the data
data.head(2)

Unnamed: 0,id,label,lemma_str
0,TRAIN_00000,3,israel parliament start winter session jerusal...
1,TRAIN_00001,2,twothirds business owner say prepared outbreak...


### Basic Data Exploration

In [3]:
print(data.shape)

(47399, 3)


In [4]:
print(data.dtypes)

id       object
text     object
label     int64
dtype: object


In [5]:
print(data['label'].value_counts())

0    14146
1    10961
2     9379
3     8946
4     2461
5     1022
6      278
7      206
Name: label, dtype: int64


In [6]:
print(data.isnull().sum())

id       0
text     0
label    0
dtype: int64


In [7]:
num_classes = data['label'].nunique()
print("Number of classes: ", num_classes)

Number of classes:  8


## Preprocessing
Some common preprocessing steps for news articles include:

**Tokenization**: breaking the text into individual words or tokens <br />
**Lowercasing**: converting all text to lowercase to avoid duplicate words due to capitalization <br />
**Removing stop words**: common words like "the", "and", "is", etc. that do not provide much meaning. <br />
**Stemming or lemmatization**: reducing words to their base form to group together similar words <br />
**Removing special characters, numbers, and punctuation** <br />

In [3]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

# Applying helper functions

data['text_clean'] = data['text'].apply(lambda x: remove_URL(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_emoji(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_html(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_punct(x))

In [4]:
data.head(2)

Unnamed: 0,id,text,text_clean
0,TEST_00000,"According to the regional office, the foreign ...",According to the regional office the foreign m...
1,TEST_00001,"According to a defense reporter, Foreign Minis...",According to a defense reporter Foreign Minist...


In [5]:
data['tokenized'] = data['text_clean'].apply(word_tokenize)
data.drop(['text_clean', 'text'], axis=1, inplace=True)
data.head(2)

Unnamed: 0,id,tokenized
0,TEST_00000,"[According, to, the, regional, office, the, fo..."
1,TEST_00001,"[According, to, a, defense, reporter, Foreign,..."


In [7]:
# Lower casing clean text.

data['lower'] = data['tokenized'].apply(
    lambda x: [word.lower() for word in x])

data.drop(['tokenized'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,lower
0,TEST_00000,"[according, to, the, regional, office, the, fo..."
1,TEST_00001,"[according, to, a, defense, reporter, foreign,..."


In [8]:
# Removing stopwords.

data['stopwords_removed'] = data['lower'].apply(
    lambda x: [word for word in x if word not in stop])

data.drop(['lower'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,stopwords_removed
0,TEST_00000,"[according, regional, office, foreign, ministe..."
1,TEST_00001,"[according, defense, reporter, foreign, minist..."


In [9]:
# Applying part of speech tags.

data['pos_tags'] = data['stopwords_removed'].apply(nltk.tag.pos_tag)

data.drop(['stopwords_removed'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,pos_tags
0,TEST_00000,"[(according, VBG), (regional, JJ), (office, NN..."
1,TEST_00001,"[(according, VBG), (defense, NN), (reporter, N..."


In [10]:
# Converting part of speeches to wordnet format.

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


data['wordnet_pos'] = data['pos_tags'].apply(
    lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

data.drop(['pos_tags'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,wordnet_pos
0,TEST_00000,"[(according, v), (regional, a), (office, n), (..."
1,TEST_00001,"[(according, v), (defense, n), (reporter, n), ..."


In [12]:
# Applying word lemmatizer.

wnl = WordNetLemmatizer()

data['lemmatized'] = data['wordnet_pos'].apply(
    lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

data.drop(['wordnet_pos'], axis=1, inplace=True)

data['lemmatized'] = data['lemmatized'].apply(
    lambda x: [word for word in x if word not in stop])

data['lemma_str'] = [' '.join(map(str, l)) for l in data['lemmatized']]

data.drop(['lemmatized'], axis=1, inplace=True)

data.head(2)

Unnamed: 0,id,lemma_str
0,TEST_00000,accord regional office foreign minister member...
1,TEST_00001,accord defense reporter foreign minister moham...


In [13]:
data.to_csv('preprocessed_test.csv', index=False)

### DistilRobert-a

In [3]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AdamW


# Define the hyperparameters
max_length = 512

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
# train_data = data.sample(frac=1, random_state=42)

# Load the pretrained DistilRoberta tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

# Tokenize the text data and convert it to PyTorch tensors
train_encodings = tokenizer(train_data['lemma_str'].tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_data['lemma_str'].tolist(), truncation=True, padding=True, max_length=max_length)

train_labels = train_data['label'].tolist()
val_labels = val_data['label'].tolist()

In [4]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)

# Load the AutoModel
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=data['label'].nunique())


# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [17]:
model_dir = "./distilroberta-b8"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.train()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [19]:
learning_rate = 1e-5
epochs = 7

# Define a function to compute the F1 score
def compute_f1(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    return {"f1": f1}


# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=epochs,         # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,              # strength of weight decay
    save_total_limit=1,             # limit the total number of checkpoints to save
    gradient_accumulation_steps=1,  # number of gradient accumulation steps
    learning_rate=learning_rate,    # learning rate
    metric_for_best_model='f1',     # use F1 score to determine the best model
    greater_is_better=True,          # maximize the F1 score
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Set up the Trainer class
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    data_collator=lambda data: {'input_ids': torch.stack([item['input_ids'] for item in data]),
                                'attention_mask': torch.stack([item['attention_mask'] for item in data]),
                                'labels': torch.tensor([item['labels'] for item in data])},
    compute_metrics=compute_f1           # evaluation metric function
)

## Training

In [11]:
# Train the model
trainer.train()   # Batch 8, 5 Epochs

Epoch,Training Loss,Validation Loss,F1
1,0.301,0.289892,0.905679
2,0.2251,0.293112,0.901241
3,0.1622,0.305452,0.930455
4,0.1265,0.345614,0.931539
5,0.0635,0.367131,0.933267


TrainOutput(global_step=26665, training_loss=0.1832344667472961, metrics={'train_runtime': 6604.138, 'train_samples_per_second': 32.297, 'train_steps_per_second': 4.038, 'total_flos': 2.825765708402688e+16, 'train_loss': 0.1832344667472961, 'epoch': 5.0})

In [20]:
# Train the model
trainer.train()   # Batch 8, 7 More Epochs

Epoch,Training Loss,Validation Loss,F1
1,0.0874,0.40161,0.929664
2,0.0485,0.471587,0.926474
3,0.0357,0.493743,0.927241
4,0.0223,0.540775,0.932634
5,0.027,0.604059,0.93272
6,0.0222,0.59171,0.93295
7,0.0068,0.593348,0.934793


TrainOutput(global_step=37331, training_loss=0.03548563080741394, metrics={'train_runtime': 9261.6863, 'train_samples_per_second': 32.242, 'train_steps_per_second': 4.031, 'total_flos': 3.956071991763763e+16, 'train_loss': 0.03548563080741394, 'epoch': 7.0})

In [14]:
# Train the model
trainer.train()   # Batch 16, 12 Epochs

Epoch,Training Loss,Validation Loss,F1
1,0.3064,0.304685,0.874865
2,0.2231,0.24997,0.911973
3,0.1863,0.23943,0.917664
4,0.1465,0.269311,0.934357
5,0.1179,0.28969,0.92413
6,0.1065,0.327523,0.926621
7,0.0847,0.35081,0.930388
8,0.0571,0.377904,0.922655
9,0.0476,0.436992,0.923027
10,0.0328,0.438376,0.926796


TrainOutput(global_step=32004, training_loss=0.13328417001759807, metrics={'train_runtime': 15220.909, 'train_samples_per_second': 33.632, 'train_steps_per_second': 2.103, 'total_flos': 6.781837700166451e+16, 'train_loss': 0.13328417001759807, 'epoch': 12.0})

## Save & Load Model

In [22]:
# specify the directory where you want to save the model
model_dir = "./distilroberta-b8"

# save the trained model

model.save_pretrained(model_dir)
# torch.save({'model_state_dict': model.state_dict(),}, f'./results/roberta_model.pt')

In [7]:
# # load the saved model
model_dir = "./distilroberta-b16"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

## Validation

In [28]:
# initialize the true and predicted labels
true_labels = []
predicted_labels = []

# iterate over the validation set and make predictions
with torch.no_grad():
    for idx in tqdm(range(len(val_dataset))):
        # get the input features and labels
        inputs = val_dataset[idx]
        labels = inputs.pop("labels")
        
        # move the inputs and labels to the device
        input_ids = torch.tensor(inputs["input_ids"]).to(device)
        attention_mask = torch.tensor(inputs["attention_mask"]).to(device)

        labels = labels.to(device)
        
        # make predictions
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        logits = outputs.logits
        
        predictions = torch.argmax(logits, dim=1)
        
        # append the true and predicted labels to the lists
        true_labels.append(labels.cpu().numpy())
        predicted_labels.append(predictions.cpu().numpy())

# compute the F1 score using macro-average
f1 = f1_score(true_labels, predicted_labels, average="macro")
print(f"Macro-Average F1 Score: {f1:.4f}")

100%|██████████████████████████████████████████████████████████████| 4740/4740 [00:54<00:00, 87.59it/s]

Macro-Average F1 Score: 0.9344





## Test

In [26]:
# Load the data from the CSV file into a DataFrame
test_data = pd.read_csv('preprocessed_test.csv')
print(test_data.shape)

(83334, 2)


In [24]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
import torch.nn as nn

In [26]:
# initialize the true and predicted labels
predicted_proba = []

# iterate over the test set and make predictions
with torch.no_grad():
    for idx in tqdm(range(len(test_data))):
        # get the input features and labels
        test_encodings = tokenizer(test_data.iloc[idx]['lemma_str'], truncation=True, padding=True, max_length=max_length)
        
        # move the inputs and labels to the device
        input_ids = torch.tensor(test_encodings["input_ids"]).to(device)
        attention_mask = torch.tensor(test_encodings["attention_mask"]).to(device)
        
        # make predictions
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        
        logits = outputs.logits
        
        predicted_proba.append(nn.Softmax(dim=1)(logits).squeeze(0).cpu().numpy())


np.savetxt('predictions_proba-b8.txt', np.array(predicted_proba))


100%|███████████████████████████████████████████████████████████| 83334/83334 [13:08<00:00, 105.63it/s]


In [28]:
predicted_labels = np.argmax(predicted_proba, axis=1)

# Make predictions on the test data and save to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'label': predicted_labels})
submission_df.to_csv('submission.csv', index=False)

In [12]:
b8 = np.loadtxt('predictions_proba-b8.txt')
b16  = np.loadtxt('predictions_proba.txt')

In [23]:
predicted_labels = np.argmax((b8 + b16) / 2, axis=1)

In [25]:
predictied_labels = predicted_labels.tolist()

In [31]:
np.argmax(b8, axis=1).tolist()

[3,
 2,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 2,
 3,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 1,
 2,
 2,
 2,
 0,
 0,
 3,
 2,
 0,
 0,
 3,
 0,
 0,
 2,
 0,
 3,
 0,
 2,
 0,
 0,
 2,
 1,
 0,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 2,
 1,
 0,
 2,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 2,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 6,
 0,
 2,
 0,
 1,
 1,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 3,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 3,
 1,
 2,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 3,
 0,
 0,
 0,
 3,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 3,
 5,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 2,
 0,
 0,
 0,
 3,
 0,
 0,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 3,
 0,
 2,
 0,
 4,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 2,
 3,
 0,
 0,
 7,
 3,
 0,
 0,
 6,
 0,
 2,
 0,
 2,
 0,
 2,
 2,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 2,
 3,
 0,
 2,
