In [1]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import wandb

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5EncoderModel,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\z910l567\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
!nvidia-smi

Thu Nov 23 13:09:51 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.67                 Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000             WDDM  | 00000000:51:00.0 Off |                  Off |
| 37%   56C    P8              24W / 300W |    548MiB / 49140MiB |     27%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Initialize wandb for experiment tracking
wandb.init(project="t5_encoder_classifier")

[34m[1mwandb[0m: Currently logged in as: [33mjinzhaot[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [5]:
# Configuration settings
class Config:
    model_name = "T5-small-fce-s_10_epoch_11_15"
    num_labels = 2  # Adjust based on your dataset
    batch_size = 512
    learning_rate = 1e-4
    num_epochs = 5
    max_length = 128  # Adjust as needed
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
train_df_0 = pd.read_csv('classification_data/train_real.csv')

In [7]:
train_df_1 = pd.read_csv('classification_data/c4_2.csv')

In [8]:
train_df_2 = pd.read_csv('classification_data/syth_0.csv')

In [9]:
train_df = pd.concat([train_df_0, train_df_1, train_df_2], ignore_index=True)

In [10]:
train_df = train_df.dropna()

In [11]:
del train_df_0
del train_df_1
del train_df_2

In [12]:
test_df = pd.read_csv('classification_data/test_real.csv')

In [13]:
test_df = test_df.dropna()

In [14]:
# Dataset class for text classification
class TextClassificationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset.iloc[idx]
        text = data['text']
        label = data['label']
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [15]:
# Define the classifier model
class T5Classifier(nn.Module):
    def __init__(self, t5_model, num_labels):
        super().__init__()
        self.t5 = t5_model
        hidden_size = t5_model.config.d_model
        # Additional layers for a deeper classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.t5(input_ids=input_ids, attention_mask=attention_mask)
        first_token_tensor = outputs.last_hidden_state[:, 0]
        logits = self.classifier(first_token_tensor)
        return logits

In [16]:
# Load the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(Config.model_name)
model = T5EncoderModel.from_pretrained(Config.model_name)
classifier_model = T5Classifier(model, Config.num_labels)
classifier_model.load_state_dict(torch.load('t5_encoder_classifier_11_21.pth'))
classifier_model.to(Config.device)

# Freeze the T5 encoder
# for param in classifier_model.t5.parameters():
#     param.requires_grad = False


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5Classifier(
  (t5): T5EncoderModel(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=2048, out_fea

In [17]:
train_dataset = TextClassificationDataset(train_df, tokenizer, Config.max_length)
val_dataset = TextClassificationDataset(test_df, tokenizer, Config.max_length)

In [18]:
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Config.batch_size)

In [19]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(classifier_model.parameters(), lr=Config.learning_rate)



In [20]:
# Function to calculate accuracy
def calculate_accuracy(outputs, labels):
    _, predicted = torch.max(outputs, dim=1)
    correct = (predicted == labels).float()
    return correct.sum() / len(correct)


In [21]:
# Training and validation loop with tqdm
for epoch in range(Config.num_epochs):
    classifier_model.train()
    total_loss = 0
    total_accuracy = 0
    train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{Config.num_epochs} Training", leave=False)

    for batch in train_progress_bar:
        inputs = batch['input_ids'].to(Config.device)
        attention_mask = batch['attention_mask'].to(Config.device)
        labels = batch['labels'].to(Config.device)

        optimizer.zero_grad()
        outputs = classifier_model(inputs, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        accuracy = calculate_accuracy(outputs, labels)
        total_accuracy += accuracy
        train_progress_bar.set_postfix({'loss': f"{loss.item():.4f}", 'accuracy': f"{accuracy.item():.4f}"})

    avg_train_loss = total_loss / len(train_loader)
    avg_train_accuracy = total_accuracy / len(train_loader)
    wandb.log({"Train Loss": avg_train_loss, "Train Accuracy": avg_train_accuracy})

    classifier_model.eval()
    total_eval_loss = 0
    total_eval_accuracy = 0
    val_progress_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{Config.num_epochs} Validation", leave=False)
    
    with torch.no_grad():
        for batch in val_progress_bar:
            inputs = batch['input_ids'].to(Config.device)
            attention_mask = batch['attention_mask'].to(Config.device)
            labels = batch['labels'].to(Config.device)

            outputs = classifier_model(inputs, attention_mask)
            loss = criterion(outputs, labels)
            total_eval_loss += loss.item()
            accuracy = calculate_accuracy(outputs, labels)
            total_eval_accuracy += accuracy
            val_progress_bar.set_postfix({'loss': f"{loss.item():.4f}", 'accuracy': f"{accuracy.item():.4f}"})

    avg_val_loss = total_eval_loss / len(val_loader)
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    wandb.log({"Validation Loss": avg_val_loss, "Validation Accuracy": avg_val_accuracy})

    print(f"Epoch {epoch+1}/{Config.num_epochs} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")

                                                                                                                       

Epoch 1/5 - Train Loss: 0.2201, Train Accuracy: 0.9111, Validation Loss: 0.5290, Validation Accuracy: 0.7591


                                                                                                                       

Epoch 2/5 - Train Loss: 0.1910, Train Accuracy: 0.9239, Validation Loss: 0.5306, Validation Accuracy: 0.7607


                                                                                                                       

Epoch 3/5 - Train Loss: 0.1810, Train Accuracy: 0.9282, Validation Loss: 0.5261, Validation Accuracy: 0.7611


                                                                                                                       

Epoch 4/5 - Train Loss: 0.1745, Train Accuracy: 0.9310, Validation Loss: 0.5287, Validation Accuracy: 0.7618


                                                                                                                       

Epoch 5/5 - Train Loss: 0.1697, Train Accuracy: 0.9330, Validation Loss: 0.5276, Validation Accuracy: 0.7618




In [22]:
# Save the model
torch.save(classifier_model.state_dict(), "t5_encoder_classifier_11_23.pth")

In [23]:
def classify(input_text):
  batch = tokenizer(
            text,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        ).to(Config.device)
  result = classifier_model(**batch)
  return result

In [26]:
tokenizer(text,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [49]:
text = "Jeff runs a mile and drops his keys."
print(classify(text))

tensor([[ 1.2851, -1.1959]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [54]:
text = "because they spent time on unmeaningful subjects."
print(classify(text))

tensor([[ 1.3362, -1.2404]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [56]:
text = "My husband engineer."
print(classify(text))

tensor([[-1.0737,  1.0736]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [34]:
text = "Although I've known him for a while, I still can't believe how stubborn he is."
print(classify(text))

tensor([[ 1.7015, -1.5682]], device='cuda:0', grad_fn=<AddmmBackward0>)
