In [3]:
!pip install torchtext



In [1]:
import spacy
from torchtext.data.utils import get_tokenizer

nlp = spacy.load('en_core_web_sm')
tokenizer = get_tokenizer('spacy')

def preprocess(text):
    # Tokenize the text
    tokens = tokenizer(text)
    
    # Convert everything to lowercase
    tokens = [t.lower() for t in tokens]
    
    # Remove stopwords
    tokens = [t for t in tokens if not nlp.vocab[t].is_stop]
    
    return tokens

ModuleNotFoundError: No module named 'torchtext'

In [None]:
from torchtext.data import Field

TEXT = Field(sequential=True, tokenize=preprocess)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float32)

In [None]:
from torchtext.data import TabularDataset

train_data = TabularDataset(
    path='train.csv',
    format='csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)]
)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data from the CSV file into a DataFrame
data = pd.read_csv('train.csv')

# Preview the first few rows of the data
data.head(2)

Unnamed: 0,id,text,label
0,TRAIN_00000,Israel Parliament to Start Winter Session JERU...,3
1,TRAIN_00001,Two-thirds of business owners say they are pre...,2


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device");

Using cuda device


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AdamW


# Define the hyperparameters
max_length = 512
learning_rate = 1e-5
epochs = 3

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1)
# train_data = data.sample(frac=1, random_state=42)

# Load the pretrained DistilRoberta tokenizer
MODEL_NAME = 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                                                           num_labels=data['label'].nunique(),
                                                           output_attentions = False, # Whether the model returns attentions weights.
                                                           output_hidden_states = False
                                                          ).to(device)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [5]:
# Tokenize the text data and convert it to PyTorch tensors
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_data['text'].tolist(), truncation=True, padding=True, max_length=max_length)

train_labels = train_data['label'].tolist()
val_labels = val_data['label'].tolist()

In [6]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)

In [6]:
# from torch.utils.data import Dataset
# from torch.utils.data import DataLoader

# class MyDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_len):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]

#         encoding = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             return_attention_mask=True,
#             return_tensors='pt',
#             truncation=True
#         )

#         input_ids = encoding['input_ids'].squeeze(0)
#         attention_mask = encoding['attention_mask'].squeeze(0)

#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'label': torch.tensor(label, dtype=torch.long)
#         }


# train_labels = train_data['label'].tolist()
# val_labels = val_data['label'].tolist()
# # assuming X_train and y_train are your preprocessed data and labels
# train_dataset = MyDataset(train_data['text'].tolist(), train_labels, tokenizer, max_len=max_length)
# val_dataset = MyDataset(val_data['text'].tolist(), val_labels, tokenizer, max_len=max_length)


In [7]:
# Define a function to compute the F1 score
def compute_f1(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='macro')
    return {"f1": f1}

# assuming you have created a transformer model called "model" and defined a "train_dataset" and a "eval_dataset"
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=epochs,              # total number of training epochs
    per_device_train_batch_size=3,  # batch size per device during training
    per_device_eval_batch_size=3,   # batch size for evaluation
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    save_total_limit=1,
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    learning_rate=learning_rate,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,                         # the PyTorch transformer model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset, defined above
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item['input_ids'] for item in data]),
                                'attention_mask': torch.stack([item['attention_mask'] for item in data]),
                                'labels': torch.tensor([item['labels'] for item in data])},
    compute_metrics=compute_f1
)

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3184,0.268035,0.924514
2,0.2118,0.260191,0.932608
3,0.0832,0.309674,0.940714


TrainOutput(global_step=42660, training_loss=0.2383115242637495, metrics={'train_runtime': 25862.3236, 'train_samples_per_second': 4.948, 'train_steps_per_second': 1.65, 'total_flos': 1.1926819792849306e+17, 'train_loss': 0.2383115242637495, 'epoch': 3.0})

In [9]:
torch.save({'model_state_dict': model.state_dict(),}, f'./results/bert_large_model.pt')

In [9]:
checkpoint = torch.load('./results/bert_large_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

## Test

In [10]:
# Load the data from the CSV file into a DataFrame
test_data = pd.read_csv('test.csv')
print(test_data.shape)

(83334, 2)


In [11]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# initialize the true and predicted labels
predicted_labels = []

# iterate over the test set and make predictions
with torch.no_grad():
    for idx in tqdm(range(len(test_data))):
        # get the input features and labels
        test_encodings = tokenizer(test_data.iloc[idx]['text'], truncation=True, padding=True, max_length=max_length)
        
        # move the inputs and labels to the device
        input_ids = torch.tensor(test_encodings["input_ids"]).to(device)
        attention_mask = torch.tensor(test_encodings["attention_mask"]).to(device)
        
        # make predictions
        outputs = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        logits = outputs.logits
        
        predictions = torch.argmax(logits, dim=1)
        
        # append the predicted labels to the lists

        predicted_labels.append(predictions.squeeze(0).cpu().numpy())


100%|██████████████████████████████████████████████████████████| 83334/83334 [1:13:40<00:00, 18.85it/s]


In [13]:
# Make predictions on the test data and save to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'label': predicted_labels})
submission_df.to_csv('submission-bert-large-uncased.csv', index=False)