In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Read the train and test splits from the mounted Drive folder (train first, then test).
train_data, test_data = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/BMI 550/Final_project/train.csv',
        '/content/drive/MyDrive/BMI 550/Final_project/test.csv',
    ),
)

In [None]:
# Pull the raw text and the label column out of each split as plain Python lists.
texts_train_BERT, classes_train_BERT = (
    train_data[column].tolist() for column in ('text', 'Class')
)
texts_test_BERT, classes_test_BERT = (
    test_data[column].tolist() for column in ('text', 'Class')
)

# Only the tokenizer is loaded at this point; the model itself is loaded later.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
print('model_loaded______')

# Tokenize the train split first, then the test split, with identical settings:
# truncate to at most 100 tokens and pad every sequence in a split to a common length.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=100)
    for texts in (texts_train_BERT, texts_test_BERT)
)
print('Tokenazation done______')
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (anything exposing ``.items()``).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Training split: wrapped in a dataset and served in shuffled mini-batches of 8.
train_dataset = TextDataset(train_encodings, classes_train_BERT)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Evaluation split: built from the *test* encodings and iterated in file order,
# so predictions line up with the original rows.
val_dataset = TextDataset(test_encodings, classes_test_BERT)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

print('Data_loader done______')
# Seed PyTorch's global RNG before any randomness is consumed. The new
# classification head is randomly initialised and the shuffled training loader
# draws its ordering from the global generator, so without a seed the reported
# validation accuracy changes from run to run. (GPU kernels may still introduce
# small nondeterminism; this removes the dominant sources of variance.)
torch.manual_seed(42)

# Pretrained BERT-large encoder with a 2-way sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=2)

# Use PyTorch's AdamW rather than the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers optimizer used
# by default, so the swap does not silently change regularisation
# (torch.optim.AdamW would otherwise default to weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is available; fall back to CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print('Start training______')
# Two passes over the training data, each followed by an accuracy check on val_loader.
for epoch in range(2):
    # ---- optimisation pass ----
    model.train()
    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch contains 'labels', so the model output carries the loss directly.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients right after the update so the next batch starts clean.
        optimizer.zero_grad()

    # ---- evaluation pass (no gradient tracking) ----
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            # The highest-scoring class index is the prediction for each example.
            predictions += logits.argmax(dim=-1).tolist()
            true_labels += batch['labels'].tolist()

    val_accuracy = accuracy_score(true_labels, predictions)
    print(f'Epoch {epoch}: Validation Accuracy: {val_accuracy}')


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# Drive folder so the pair can be reloaded together later.
model.save_pretrained(
    save_directory='/content/drive/MyDrive/BMI 550/Final_project/BERT_large_model'
)
tokenizer.save_pretrained(
    save_directory='/content/drive/MyDrive/BMI 550/Final_project/BERT_large_model'
)

In [None]:
# Reload the saved model and tokenizer from the Drive folder they were written to.
# This rebinds the global `model` and `tokenizer` names used by the cells below.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='/content/drive/MyDrive/BMI 550/Final_project/BERT_large_model'
)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='/content/drive/MyDrive/BMI 550/Final_project/BERT_large_model'
)

In [None]:
def predict(text, model, tokenizer, batch_size=32):
    """Predict a class index for one text or a collection of texts.

    The texts are tokenized and scored in mini-batches, so a large corpus does
    not have to fit through the model in a single forward pass. Inputs are
    moved to whatever device the model's parameters live on, so the function
    works for both CPU and GPU models.

    Args:
        text: A single string or an iterable of strings.
        model: A ``torch.nn.Module`` whose forward pass accepts the tokenizer
            output as keyword arguments and returns an object with a
            ``logits`` attribute of shape (batch, num_classes).
        tokenizer: Callable invoked as ``tokenizer(list_of_str, padding=True,
            truncation=True, max_length=100, return_tensors="pt")`` that
            returns a mapping of tensors.
        batch_size: Number of texts scored per forward pass (must be >= 1).

    Returns:
        A 1-D CPU ``torch.Tensor`` of predicted class indices, one per input
        text (empty when no texts are given).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    # A bare string is treated as a one-element batch, as the tokenizer would.
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        return torch.empty(0, dtype=torch.long)

    device = next(model.parameters()).device

    # Inference must run with dropout disabled; remember the caller's mode so
    # it can be restored even if a forward pass fails.
    was_training = model.training
    model.eval()
    batch_probs = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    max_length=100,
                    return_tensors="pt",
                )
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                outputs = model(**inputs)
                # Move results to the CPU right away so GPU memory is not held
                # for the whole corpus and the return value is usable by NumPy.
                batch_probs.append(outputs.logits.softmax(dim=-1).cpu())
    finally:
        model.train(was_training)

    probs = torch.cat(batch_probs)
    print(probs)
    predicted_class = torch.argmax(probs, dim=-1)
    return predicted_class

# Score every test text with the reloaded model; the result holds one class index per text.
predicted_class = predict(texts_test_BERT, model=model, tokenizer=tokenizer)

In [None]:
# Show both test-set metrics. Previously the F1 value was computed and then
# discarded, because a notebook cell only displays its final expression; a
# single dict as the last expression surfaces both numbers.
{
    'f1': f1_score(classes_test_BERT, predicted_class),
    'accuracy': accuracy_score(classes_test_BERT, predicted_class),
}