In [None]:
!pip install transformers

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
train_data = pd.read_csv('/content/drive/MyDrive/Data/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Data/test.csv')

In [6]:
texts_train_BERT = train_data['text'].tolist()
classes_train_BERT = train_data['Class'].tolist()

texts_test_BERT = test_data['text'].tolist()
classes_test_BERT = test_data['Class'].tolist()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
print('model_loaded______')
train_encodings = tokenizer(texts_train_BERT, truncation=True, padding=True, max_length=100)
test_encodings = tokenizer(texts_test_BERT, truncation=True, padding=True, max_length=100)
print('Tokenazation done______')
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = TextDataset(train_encodings, classes_train_BERT)
val_dataset = TextDataset(test_encodings, classes_test_BERT)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
print('Data_loader done______')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print('Start training______')
for epoch in range(6):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    predictions, true_labels = [], []
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(batch['labels'].tolist())

    val_accuracy = accuracy_score(true_labels, predictions)
    print(f'Epoch {epoch}: Validation Accuracy: {val_accuracy}')


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model_loaded______
Tokenazation done______
Data_loader done______


Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training______
Epoch 0: Validation Accuracy: 0.8712624584717608
Epoch 1: Validation Accuracy: 0.909468438538206
Epoch 2: Validation Accuracy: 0.8995016611295681
Epoch 3: Validation Accuracy: 0.8995016611295681
Epoch 4: Validation Accuracy: 0.8837209302325582
Epoch 5: Validation Accuracy: 0.9028239202657807


In [7]:
model.save_pretrained('/content/drive/MyDrive/Models/BERT_base_model')
tokenizer.save_pretrained('/content/drive/MyDrive/Models/BERT_base_model')

('/content/drive/MyDrive/Models/BERT_base_model/tokenizer_config.json',
 '/content/drive/MyDrive/Models/BERT_base_model/special_tokens_map.json',
 '/content/drive/MyDrive/Models/BERT_base_model/vocab.txt',
 '/content/drive/MyDrive/Models/BERT_base_model/added_tokens.json')

In [8]:
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Models/BERT_base_model')
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Models/BERT_base_model')

In [9]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=100, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs.logits.softmax(dim=-1)
    print(probs)
    predicted_class = torch.argmax(probs, dim=-1)
    return predicted_class

predicted_class = predict(texts_test_BERT, model, tokenizer)

tensor([[2.5546e-03, 9.9745e-01],
        [9.9863e-01, 1.3664e-03],
        [9.9918e-01, 8.1832e-04],
        ...,
        [9.1243e-02, 9.0876e-01],
        [9.9909e-01, 9.1377e-04],
        [9.9906e-01, 9.3598e-04]])


In [10]:
f1_score(classes_test_BERT, predicted_class)
accuracy_score(classes_test_BERT, predicted_class)

0.9028239202657807