In [6]:
# Mount Google Drive at /content/drive; the checkpoint paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
! pip install sentencepiece



In [45]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5EncoderModel,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Notebook-wide settings, collected in one namespace
class Config:
    """Static settings read by the cells of this notebook.

    All values are plain class attributes; the class is never instantiated,
    callers read ``Config.<name>`` directly.
    """

    # Checkpoint directory passed to ``from_pretrained`` for tokenizer and encoder.
    model_name = "/content/drive/MyDrive/11611/project/checkpoint-128000_11_17"

    # Output width of the classification head (change to match the dataset).
    num_labels = 2

    # Sequence-length setting (tune as required).
    max_length = 128

    # Optimisation-related values.
    batch_size = 512
    learning_rate = 1e-4
    num_epochs = 5

    # Use the GPU when one is visible to torch, otherwise stay on the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Sequence classifier: wrapped encoder followed by a small MLP head
class T5Classifier(nn.Module):
    """MLP classification head applied to an encoder's first-position hidden state.

    Args:
        t5_model: Module that exposes ``config.d_model`` and, when called with
            ``input_ids`` and ``attention_mask`` keyword arguments, returns an
            object carrying ``last_hidden_state``.
        num_labels: Number of output units of the final linear layer.
    """

    def __init__(self, t5_model, num_labels):
        super().__init__()
        self.t5 = t5_model
        width = t5_model.config.d_model

        # The head widens to 2x the encoder width, narrows back, then projects
        # to the label space. Layers are created in this exact order so that
        # the Sequential indices holding weights (0, 3 and 6) stay the same.
        head_layers = [
            nn.Linear(width, 2 * width),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(2 * width, width),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(width, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for each sequence in the batch."""
        encoded = self.t5(input_ids=input_ids, attention_mask=attention_mask)
        # Only the hidden state at sequence position 0 is fed to the head.
        pooled = encoded.last_hidden_state[:, 0]
        return self.classifier(pooled)

In [6]:
# Restore the encoder, its tokenizer, and the trained classification weights
model = T5EncoderModel.from_pretrained(Config.model_name)
tokenizer = T5Tokenizer.from_pretrained(Config.model_name)

# Wrap the encoder with the classification head, then overwrite all parameters
# with the saved state dict. The tensors are mapped to the CPU on load; the
# whole module is moved to Config.device afterwards.
classifier_model = T5Classifier(t5_model=model, num_labels=Config.num_labels)
classifier_model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/11611/project/T5_encoder_only_classification/t5_encoder_classifier_11_26.pth',
        map_location='cpu',
    )
)

# Encoder parameters are left trainable here: the loop that would set
# requires_grad = False on classifier_model.t5.parameters() is disabled.
classifier_model.to(Config.device)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5Classifier(
  (t5): T5EncoderModel(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=2048, out_fea

In [113]:
classifier_model.eval()
def classify(input_text, max_length=64):
    """Run the classifier on one piece of text.

    Args:
        input_text: Text to classify. This argument is what gets tokenized;
            the function does not read any notebook-level ``text`` variable.
        max_length: Length the tokenizer pads/truncates to. Defaults to 64,
            the value this notebook has always used for inference.

    Returns:
        A ``(logits, probs)`` tuple of Python lists for the first sequence in
        the batch: the raw head outputs and their softmax over the label axis.
    """
    batch = tokenizer(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    ).to(Config.device)
    # Inference only: skip autograd bookkeeping instead of detaching afterwards.
    with torch.no_grad():
        logits = classifier_model(**batch)
    probs = F.softmax(logits, dim=1)
    return logits[0].tolist(), probs[0].tolist()

In [114]:
# Probe: baseline sentence; "runs" and "drops" both carry the third-person -s.
text = "Jeff runs a mile and drops his keys."
print(classify(text))

([1.2119166851043701, -1.131719708442688], [0.9124270677566528, 0.08757290244102478])


In [115]:
# Probe: first verb changed to "run" (no third-person -s).
text = "Jeff run a mile and drops his keys."
print(classify(text))

([-0.7727712392807007, 0.7835224866867065], [0.17417912185192108, 0.8258209228515625])


In [82]:
# Probe: second verb changed to "drop" (no third-person -s).
text = "Jeff runs a mile and drop his keys."
print(classify(text))

([-0.5511027574539185, 0.5500047206878662], [0.24953244626522064, 0.7504675984382629])


In [83]:
# Probe: past-tense "ran" paired with bare "drop".
text = "Jeff ran a mile and drop his keys."
print(classify(text))

([-0.9128469228744507, 0.916192889213562], [0.13835270702838898, 0.8616473078727722])


In [84]:
# Probe: past-tense "ran" paired with present-tense "drops" (mixed tenses).
text = "Jeff ran a mile and drops his keys."
print(classify(text))

([-1.0093377828598022, 1.014397382736206], [0.11673333495855331, 0.8832666873931885])


In [85]:
# Probe: both verbs in the past tense ("ran", "dropped").
text = "Jeff ran a mile and dropped his keys."
print(classify(text))

([1.162501573562622, -1.0866374969482422], [0.904576301574707, 0.09542375802993774])


In [86]:
# Probe: no preposition between "time" and "unmeaningful subjects".
text = "because they spent time unmeaningful subjects."
print(classify(text))

([-0.2505480647087097, 0.2563762664794922], [0.3759148120880127, 0.6240851879119873])


In [87]:
# Probe: preposition "on" present; noun is singular ("subject") with no article.
text = "because they spent time on unmeaningful subject."
print(classify(text))

([-0.2504260540008545, 0.2553766965866089], [0.37617796659469604, 0.6238219738006592])


In [88]:
# Probe: article "a" placed after the adjective ("unmeaningful a subject").
text = "because they spent time on unmeaningful a subject."
print(classify(text))

([0.5543808341026306, -0.5103718042373657], [0.7435977458953857, 0.25640228390693665])


In [89]:
# Probe: preposition "on" with the plural noun "subjects".
text = "because they spent time on unmeaningful subjects."
print(classify(text))

([1.314034104347229, -1.2207186222076416], [0.9265424609184265, 0.0734575018286705])


In [90]:
# Probe: neither a verb nor an article between "husband" and "engineer".
text = "My husband engineer."
print(classify(text))

([-1.083757996559143, 1.0905647277832031], [0.10208014398813248, 0.8979198336601257])


In [91]:
# Probe: verb "is" present, article omitted.
text = "My husband is engineer."
print(classify(text))

([-0.7998210191726685, 0.7908667325973511], [0.16928717494010925, 0.8307128548622131])


In [92]:
# Probe: plural "engineers" after "is", article omitted.
text = "My husband is engineers."
print(classify(text))

([-0.28121137619018555, 0.2862236499786377], [0.36182889342308044, 0.6381711363792419])


In [93]:
# Probe: article "an" combined with plural "engineers".
text = "My husband is an engineers."
print(classify(text))

([-0.8522646427154541, 0.8426874876022339], [0.155125692486763, 0.8448742628097534])


In [94]:
# Probe: article "a" (rather than "an") before "engineer".
text = "My husband is a engineer."
print(classify(text))

([-0.24166958034038544, 0.25256747007369995], [0.37889590859413147, 0.6211040616035461])


In [95]:
# Probe: "an engineer" (article "an" with the singular noun).
text = "My husband is an engineer."
print(classify(text))

([2.432164430618286, -2.224632501602173], [0.9905925393104553, 0.00940749142318964])


In [96]:
# Probe: "how much stubborn" ("much" inserted before the adjective).
text = "Although I've known him for a while, I still can't believe how much stubborn he is."
print(classify(text))

([-0.17973913252353668, 0.18959887325763702], [0.4087010324001312, 0.5912990570068359])


In [97]:
# Probe: "how stubborn" (the same sentence without "much").
text = "Although I've known him for a while, I still can't believe how stubborn he is."
print(classify(text))

([1.7429393529891968, -1.6051613092422485], [0.9660426378250122, 0.03395741432905197])
