In [23]:
import torch
import transformers
import pandas as pd
import numpy as np

from sklearn import model_selection, metrics
import warnings
warnings.filterwarnings("ignore")

In [24]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [25]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [26]:
id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [27]:
config = {
    "max_length": 360,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    
    "output_dir": "./my-model",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 3,
    
    "debug": True,
}

In [28]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])

In [29]:
train, valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=df["label"]
)

In [30]:
class TextDataset:
    
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = config["max_length"]
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        enc = self.tokenizer(
            row["review"],  
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        return {
            "input_ids": enc["input_ids"].squeeze(0), 
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["label"], dtype=torch.long) 
        }


In [31]:
train_ds = TextDataset(train,tokenizer,config)
valid_ds = TextDataset(valid,tokenizer,config)

In [32]:
valid_ds[0]

{'input_ids': tensor([  101,  1045,  2387,  1996,  3185,  2044,  9361,  2049,  5790,  2006,
         10047, 18939,  1012,  2067,  2059,  1010,  2009,  2001,  2012,  1022,
          1012,  1014,  1998,  1045,  2245,  1010,  1000, 10166,   999,  2008,
          2442,  2022,  1037,  2204,  2028,  1000,  1012,  1045,  2245,  3308,
          1012,  1996,  2927,  1997,  1996,  3185,  2941,  7906,  2054,  1996,
          5436, 10659,  1010,  2021,  2059,  2009,  3632, 27258,  2135,  2091,
          7650,  2049,  8102,  1012,  1045,  2228,  2302,  1996,  2839,  1997,
         14411, 23330,  1010,  2009,  2453,  2031,  2042, 10303,  1011,  2348,
          2002,  2003,  1996,  3114,  1996,  2466,  2240,  3138,  1996,  2607,
          2009,  2515,  1012,  1996,  2839,  2003,  2074,  2205,  6034,  2005,
          2026, 16663,  1010,  1998,  2524,  2000, 18094,  1012,  2036,  1010,
          1996,  4990,  1996,  2364,  2839,  3138,  2013,  1996,  2927,  1997,
          1996,  2466,  6229,  2049,  1

In [33]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(config['model_path'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
def compute_metrics(eval_data):
    logits, labels = eval_data
    
    preds = np.argmax(logits, axis=-1)
    
    return {
        "f1": metrics.f1_score(labels, preds, average="weighted")
    }

In [35]:
training_args = transformers.TrainingArguments(
    output_dir=config["output_dir"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["valid_batch_size"],
    learning_rate=config["learning_rate"],
    num_train_epochs=config["epochs"],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

In [36]:
trainer = transformers.Trainer(
    model=model,  
    args=training_args, 
    train_dataset=train_ds, 
    eval_dataset=valid_ds,  
    tokenizer=tokenizer, 
    compute_metrics=compute_metrics,  
)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2515,0.237794,0.906351
2,0.2131,0.222992,0.914897
3,0.1739,0.222023,0.918799


TrainOutput(global_step=1875, training_loss=0.25393606185913087, metrics={'train_runtime': 829.0986, 'train_samples_per_second': 144.736, 'train_steps_per_second': 2.261, 'total_flos': 1245553977600000.0, 'train_loss': 0.25393606185913087, 'epoch': 3.0})

In [47]:
device = 0 if torch.cuda.is_available() else -1

pipe = transformers.pipeline(
    task='text-classification',
    model = model,
    tokenizer = tokenizer,
    batch_size = 4,
    device=device
)

In [51]:
pipe(["The movie was nice"])

[{'label': 'LABEL_1', 'score': 0.9889182448387146}]

Separate implementation

In [52]:
class TextDataset:
    
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        enc = tokenizer(
            row["text"],
            max_length=10,
            truncation=True,
            padding="max_length"
        )
        
        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            # "label": torch.tensor(row["label"]),
        }

In [53]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv").rename(columns={"review": "text"})

id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,text,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [56]:
checkpoint_path = "/kaggle/working/my-model/checkpoint-1875"

tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_path)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [57]:
ds = TextDataset(df)

In [58]:
dl = torch.utils.data.DataLoader(
    ds,
    batch_size=2,
    shuffle=False,
    num_workers=2,
)

In [59]:
model.to("cuda")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-1

In [67]:
for idx, batch in enumerate(dl):
    print(batch)
    
    # Corrected line: Use 'k' and 'v' as defined in the loop
    batch = {k: v.to('cuda') for k, v in batch.items()}
    
    with torch.no_grad():
        out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    
    if idx == 5:
        break


{'input_ids': tensor([[  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,   102],
        [  101,  1037,  6919,  2210,  2537,  1012,  1026,  7987,  1013,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,  1045,  2245,  2023,  2001,  1037,  6919,  2126,  2000,   102],
        [  101, 10468,  2045,  1005,  1055,  1037,  2155,  2073,  1037,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[ 101, 9004, 3334, 4717, 7416, 1005, 1055, 1000, 2293,  102],
        [ 101, 2763, 2026, 2035, 1011, 2051, 5440, 3185, 1010,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,  1045,  2469,  2052,  2066,  2000,  2156,  1037, 15218,   102],
        [  101,  2023,  2265,  2001,  2019,  6429,  1010,  4840,  1004,   102]]), 'attention_mask'

In [68]:
batch

{'input_ids': tensor([[  101,  6316,  1996,  7344,  2003,  2028,  1997,  2216, 21864,   102],
         [  101,  1045,  2387,  2023,  3185,  2043,  1045,  2001,  2055,   102]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [69]:
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-1

In [70]:
out

SequenceClassifierOutput(loss=None, logits=tensor([[-0.9649,  0.8618],
        [-1.7717,  1.6175]], device='cuda:0'), hidden_states=None, attentions=None)

In [71]:
pipe = transformers.pipeline(
    "text-classification",
    model=checkpoint_path,
    batch_size=4,
    device = 0
)

In [72]:
pipe(["I hated how good the movie was."] * 10)

[{'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414},
 {'label': 'LABEL_0', 'score': 0.9623117446899414}]