
References

https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

In [4]:
# Location and schema of the raw corpus. The file is read without a header
# row, so the three column names are attached explicitly afterwards.
DATA_PATH = "../input/dataset_ita_eng_cleaned.csv"
COLUMN_NAMES = ['ID', 'text', 'label']

df = pd.read_csv(DATA_PATH, header=None)
df.columns = COLUMN_NAMES

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,ID,text,label
0,UsNB_SOM2_DOC1_0012,Refresh tecnologico della SOM che prevede:o ...,1
1,UsNB_SOM2_DOC1_0013,Realizzazione di una seconda sala server pre...,1
2,UsNB_SOM2_DOC1_0014,Realizzazione di una seconda Sala Operativa ...,1
3,UsNB_SOM2_DOC1_0015,Porting del sw Alstom sulla nuova Infrastrut...,1
4,UsNB_SOM2_DOC1_0016,Spostamento o sostituzione di software (ad e...,1


In [5]:
import re
import unicodedata


def clean_text(text):
    """Normalise one raw document into lowercase, letter-only text.

    Steps, in order:
      1. Missing values (``None`` / NaN) become an empty string; any other
         non-string value is converted with ``str()``.
      2. The text is lowercased.
      3. Accented letters are folded to their base letter ("qualità" ->
         "qualita") so they survive the ASCII-only filter below instead of
         being turned into a space in the middle of a word.
         NOTE(review): presumably this matches the accent stripping done by
         the uncased BERT tokenizer used for training — confirm.
      4. Every run of characters other than ASCII letters and ``? . ! , ¿``
         is collapsed into a single space.
      5. The characters listed in ``punctuation`` are deleted (after step 4
         only ``!``, ``?`` and ``.`` can still be present); commas and ``¿``
         are kept.

    Args:
        text: The raw document; normally a ``str``, possibly ``None``/NaN.

    Returns:
        The cleaned string ('' for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()

    # Canonical decomposition splits "à" into "a" + a combining accent mark;
    # dropping the combining marks leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    punctuation = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    # Delete every listed character in a single pass.
    return text.translate(str.maketrans('', '', punctuation))

# Run the cleaner over every document; the function is passed directly
# rather than through a wrapper lambda.
df['text'] = df['text'].apply(clean_text)

# Preview the cleaned rows as the cell output.
df.head()

Unnamed: 0,ID,text,label
0,UsNB_SOM2_DOC1_0012,refresh tecnologico della som che prevede o d...,1
1,UsNB_SOM2_DOC1_0013,realizzazione di una seconda sala server pres...,1
2,UsNB_SOM2_DOC1_0014,realizzazione di una seconda sala operativa d...,1
3,UsNB_SOM2_DOC1_0015,porting del sw alstom sulla nuova infrastrutt...,1
4,UsNB_SOM2_DOC1_0016,spostamento o sostituzione di software ad es ...,1


In [8]:
# Pull the documents and their class labels out of the frame as plain
# Python lists, in row order.
texts, labels = (df[column].tolist() for column in ("text", "label"))

In [9]:
# Seed PyTorch's global RNG before any stochastic step. The classification
# head is newly initialised (not loaded from the checkpoint) and the training
# DataLoader shuffles, so without a seed two runs of this notebook start from
# different weights and see the batches in a different order.
# (GPU kernels may still introduce small run-to-run differences.)
SEED = 42
torch.manual_seed(SEED)

# Split the data into training (80%) and validation (20%) sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

# Load the pre-trained BERT model and tokenizer.
# NOTE(review): the dataset file name and the previewed rows suggest mixed
# Italian/English text, while this checkpoint is the English uncased model —
# confirm this is intended (a multilingual checkpoint may fit better).
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# num_labels=2 assumes binary classification; change it to match the label set
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize and encode the training and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

def _as_tensor_dataset(encodings, targets):
    """Pack token ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(targets),
    )


# Tensor versions of the two encoded splits
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)

# Mini-batch iterators: training batches are reshuffled every epoch,
# validation batches keep their original order.
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Training configuration
num_epochs = 3  # adjust to the dataset size and how quickly the model converges
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# AdamW over every model parameter with a small fine-tuning learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
# Kept for reference: the training loop in this cell reads the loss from the
# model output (`outputs.loss`) and never calls this criterion.
criterion = torch.nn.CrossEntropyLoss()

# Move the model onto the selected device before training starts
model.to(device)

# Fine-tune for `num_epochs` passes over the training set, measuring accuracy
# on the validation set after every pass.
for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # The per-batch label tensor is called `batch_labels`, not `labels`:
        # `labels` is the full label list built in an earlier cell and passed
        # to train_test_split, so overwriting it here would break a re-run of
        # this cell.
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- Validation pass: no gradients, collect predictions ----
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Validation'):
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    # Calculate validation accuracy
    val_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{num_epochs} - Validation Accuracy: {val_accuracy:.4f}')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1/3 - Validation Accuracy: 0.8316


Epoch 2/3: 100%|██████████████████████████| 1221/1221 [2:53:01<00:00,  8.50s/it]
Epoch 2/3 - Validation: 100%|█████████████████| 306/306 [12:49<00:00,  2.51s/it]


Epoch 2/3 - Validation Accuracy: 0.8439


Epoch 3/3: 100%|██████████████████████████| 1221/1221 [2:51:23<00:00,  8.42s/it]
Epoch 3/3 - Validation: 100%|█████████████████| 306/306 [12:39<00:00,  2.48s/it]

Epoch 3/3 - Validation Accuracy: 0.8583





In [10]:
# Persist the fine-tuned weights (parameters only, not the model object)
MODEL_WEIGHTS_PATH = 'bert_classifier.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, MODEL_WEIGHTS_PATH)