In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the story corpus and keep only the two columns the classifier uses,
# renamed to the generic names 'text' and 'label'.
df = pd.read_csv("/content/seq_final.csv")
df = df.rename(columns={'story': 'text', 'story_type': 'label'})
# .copy() makes the two-column selection an independent frame, so the label
# assignment below cannot trigger pandas' chained-assignment warning.
df = df[['text', 'label']].copy()

# Encode the string labels as integers. LabelEncoder stores its classes in
# sorted order, so 'imagined' -> 0 and 'recalled' -> 1. Any other value in the
# column makes transform() raise ValueError.
le = LabelEncoder()
le.fit(['recalled', 'imagined'])
df['label'] = le.transform(df['label'])

# Hold out 20% of the rows for testing, then 10% of the remaining rows for
# validation. Only the frame itself is passed: the label column travels with
# each row, and the shuffle depends only on the number of rows and
# random_state, so the selected rows are the same as when the label column is
# passed as a second array.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, valid = train_test_split(train, test_size=0.1, random_state=42)

test

Unnamed: 0,text,label
2629,A family member needed help doing a birthday p...,0
1235,The trip to the hospital seemed to take foreve...,0
2181,About a month ago I went on a vacation to my c...,0
230,"Three months ago, I went to the funeral of a f...",0
4829,My very first time performing at a show was su...,1
...,...,...
3828,"Around six months ago, on my birthday, my frie...",1
1319,I'll never forget today as long as I live. I ...,0
4081,Last month I was notified by Pacific Gas & Ele...,1
70,"Dear Diary,I still can't believe it worked! I...",0


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str``.
        labels: Indexable collection of labels aligned with ``texts``; each
            item is converted with ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids' and 'attention_mask' tensors
        and a scalar long 'label' tensor for the item at ``idx``."""
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to one dimension per field.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Expose the splits from the previous cell under the *_df names used below.
train_df = train
valid_df = valid
test_df = test

# Pretrained uncased BERT: the tokenizer and the encoder weights come from the
# same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Mark every encoder parameter as trainable, so the encoder itself is
# fine-tuned rather than frozen.
for param in bert_model.parameters():
    param.requires_grad = True


class CustomBERTClassifier(torch.nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` and ``attention_mask`` keyword
            arguments, return an object with a ``pooler_output`` attribute.
        num_labels: Number of output logits per example.
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        hidden_size = bert_model.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)


# Wrap each split's text and label columns (as numpy arrays) in a tokenizing dataset.
train_dataset = CustomDataset(train_df['text'].values, train_df['label'].values, tokenizer)
val_dataset = CustomDataset(valid_df['text'].values, valid_df['label'].values, tokenizer)
test_dataset = CustomDataset(test_df['text'].values, test_df['label'].values, tokenizer)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Seed torch's global RNG before anything random happens, so the classifier
# head initialisation and the training shuffle order are repeatable across
# runs. NOTE(review): GPU kernels may still introduce small run-to-run
# differences; confirm if bit-exact reproducibility is required.
torch.manual_seed(42)

model = CustomBERTClassifier(bert_model, num_labels=2)
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. weight_decay=0.0 and eps=1e-6 are set
# explicitly to keep the defaults of the transformers class this replaces
# (torch's own defaults are 0.01 and 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)
num_epochs = 5

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune for num_epochs passes over the training data; after each pass,
# score the current weights on the validation set.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_correct_train = total_samples_train = 0
    all_predictions_train, all_labels_train = [], []

    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}, Training')

    for batch in train_loader_tqdm:
        labels = batch['label'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # One optimisation step on this batch.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()

        # Predictions come from the logits computed before the weight update.
        predictions = logits.argmax(dim=1)
        all_labels_train.extend(labels.cpu().numpy())
        all_predictions_train.extend(predictions.cpu().numpy())

        total_samples_train += labels.size(0)
        total_correct_train += (predictions == labels).sum().item()

        # Show the last batch loss and the running accuracy on the progress bar.
        train_loader_tqdm.set_postfix({'Loss': loss.item(), 'Accuracy': total_correct_train / total_samples_train})

    f1_train = f1_score(all_labels_train, all_predictions_train, average='weighted')
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Weighted F1 Score: {f1_train}')

    # ---- validation pass (eval mode, no gradient tracking) ----
    model.eval()
    with torch.no_grad():
        total_correct_val = total_samples_val = 0
        all_predictions_val, all_labels_val = [], []

        val_loader_tqdm = tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs}, Validation')

        for batch in val_loader_tqdm:
            labels = batch['label'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask)
            predictions = logits.argmax(dim=1)

            all_labels_val.extend(labels.cpu().numpy())
            all_predictions_val.extend(predictions.cpu().numpy())

            total_samples_val += labels.size(0)
            total_correct_val += (predictions == labels).sum().item()

            val_loader_tqdm.set_postfix({'Accuracy': total_correct_val / total_samples_val})

        f1_val = f1_score(all_labels_val, all_predictions_val, average='weighted')
        print(f'Epoch {epoch + 1}/{num_epochs}, Validation Weighted F1 Score: {f1_val}')


# Final evaluation on the held-out test split, using the weights left by the
# last training epoch. Eval mode and no_grad: no gradients are tracked here.
model.eval()
with torch.no_grad():
    total_correct_test = total_samples_test = 0
    all_predictions_test, all_labels_test = [], []

    test_loader_tqdm = tqdm(test_loader, desc='Testing')

    for batch in test_loader_tqdm:
        labels = batch['label'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        predictions = logits.argmax(dim=1)

        all_labels_test.extend(labels.cpu().numpy())
        all_predictions_test.extend(predictions.cpu().numpy())

        total_samples_test += labels.size(0)
        total_correct_test += (predictions == labels).sum().item()

        # Running accuracy over the batches seen so far.
        test_loader_tqdm.set_postfix({'Accuracy': total_correct_test / total_samples_test})

    f1_test = f1_score(all_labels_test, all_predictions_test, average='weighted')
    print(f'Test Weighted F1 Score: {f1_test}')


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/5, Training: 100%|██████████| 499/499 [06:33<00:00,  1.27it/s, Loss=0.396, Accuracy=0.63]


Epoch 1/5, Training Weighted F1 Score: 0.6299857136218493


Epoch 1/5, Validation: 100%|██████████| 56/56 [00:16<00:00,  3.33it/s, Accuracy=0.727]


Epoch 1/5, Validation Weighted F1 Score: 0.7242515769227582


Epoch 2/5, Training: 100%|██████████| 499/499 [06:34<00:00,  1.27it/s, Loss=1.04, Accuracy=0.786]


Epoch 2/5, Training Weighted F1 Score: 0.7861991858519966


Epoch 2/5, Validation: 100%|██████████| 56/56 [00:16<00:00,  3.42it/s, Accuracy=0.767]


Epoch 2/5, Validation Weighted F1 Score: 0.7671671260553223


Epoch 3/5, Training: 100%|██████████| 499/499 [06:33<00:00,  1.27it/s, Loss=0.00761, Accuracy=0.87]


Epoch 3/5, Training Weighted F1 Score: 0.8699880659173843


Epoch 3/5, Validation: 100%|██████████| 56/56 [00:16<00:00,  3.37it/s, Accuracy=0.691]


Epoch 3/5, Validation Weighted F1 Score: 0.6700145858824953


Epoch 4/5, Training: 100%|██████████| 499/499 [06:33<00:00,  1.27it/s, Loss=0.0112, Accuracy=0.946]


Epoch 4/5, Training Weighted F1 Score: 0.9460429488543833


Epoch 4/5, Validation: 100%|██████████| 56/56 [00:16<00:00,  3.41it/s, Accuracy=0.786]


Epoch 4/5, Validation Weighted F1 Score: 0.7853559544831534


Epoch 5/5, Training: 100%|██████████| 499/499 [06:33<00:00,  1.27it/s, Loss=0.00687, Accuracy=0.975]


Epoch 5/5, Training Weighted F1 Score: 0.974653979245706


Epoch 5/5, Validation: 100%|██████████| 56/56 [00:16<00:00,  3.43it/s, Accuracy=0.77]


Epoch 5/5, Validation Weighted F1 Score: 0.7697634255196333


Testing: 100%|██████████| 139/139 [00:41<00:00,  3.37it/s, Accuracy=0.778]

Test Weighted F1 Score: 0.7777244447927415



