In [None]:
import torch
import transformers
import pandas as pd
import numpy as np

from sklearn import model_selection, metrics

In [None]:
class TextDataset:
    """Map-style dataset that tokenizes the ``"text"`` column of a DataFrame on access.

    Each item is a dict of tensors holding the ``input_ids`` and ``attention_mask``
    produced by the tokenizer for one row, truncated/padded to ``max_length``.

    Args:
        data: DataFrame with a ``"text"`` column (and a ``"label"`` column when
            ``include_label`` is true).
        tokenizer: Callable used to encode a single text. When ``None`` (default),
            the notebook-level ``tokenizer`` variable is looked up at access time,
            which preserves the original behaviour.
        max_length: Length every encoded sequence is truncated/padded to.
        include_label: When true, each item also carries a ``"label"`` tensor
            built from the row's ``"label"`` value.
    """

    def __init__(self, data, tokenizer=None, max_length=10, include_label=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.include_label = include_label

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return its tensors as a dict."""
        row = self.data.iloc[idx]

        # Resolve lazily: the global `tokenizer` may be created after this dataset is built.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        enc = encode(
            row["text"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )

        item = {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
        }
        if self.include_label:
            # int() normalises numpy integer scalars coming out of the DataFrame row.
            item["label"] = torch.tensor(int(row["label"]))
        return item

In [None]:
# Lookup tables between class index and sentiment name (and back).
id2label = {0: "negative", 1: "positive"}
label2id = dict(zip(id2label.values(), id2label.keys()))

# Read the reviews CSV and expose the review body under the "text" column.
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df = df.rename(columns={"review": "text"})

# Integer class id derived from the "sentiment" string column.
df["label"] = df["sentiment"].map(label2id)

print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,text,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [None]:
# The tokenizer and the classifier are both loaded from the same saved directory.
model_dir = "/kaggle/input/lecture-5-imdb-review-clf/my-model/"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)

In [None]:
# Wrap the full reviews frame so rows are tokenized on access.
ds = TextDataset(data=df)

In [None]:
# Peek at the first two encoded examples.
[ds[i] for i in range(2)]

[{'input_ids': tensor([  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,   102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
 {'input_ids': tensor([ 101, 1037, 6919, 2210, 2537, 1012, 1026, 7987, 1013,  102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}]

In [None]:
# Iterate the dataset in order, two examples per batch, with two loader workers.
dl = torch.utils.data.DataLoader(dataset=ds, batch_size=2, num_workers=2, shuffle=False)

In [None]:
# Move the model to the GPU when one is available; otherwise keep it on the CPU so the
# notebook still runs on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Smoke-test inference on the first six batches (idx 0..5).
# Use whatever device the model currently lives on, so the batch tensors and the model
# weights always end up on the same device.
target_device = model.device

for idx, batch in enumerate(dl):
    print(batch)

    # Move every tensor of the batch to the model's device.
    # (The original comprehension unpacked `k, v` but used `key`/`value` -> NameError.)
    batch = {key: value.to(target_device) for key, value in batch.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

    if idx == 5:
        break

{'input_ids': tensor([[  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,   102],
        [  101,  1037,  6919,  2210,  2537,  1012,  1026,  7987,  1013,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,  1045,  2245,  2023,  2001,  1037,  6919,  2126,  2000,   102],
        [  101, 10468,  2045,  1005,  1055,  1037,  2155,  2073,  1037,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[ 101, 9004, 3334, 4717, 7416, 1005, 1055, 1000, 2293,  102],
        [ 101, 2763, 2026, 2035, 1011, 2051, 5440, 3185, 1010,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,  1045,  2469,  2052,  2066,  2000,  2156,  1037, 15218,   102],
        [  101,  2023,  2265,  2001,  2019,  6429,  1010,  4840,  1004,   102]]), 'attention_mask'

In [None]:
# Display the `batch` variable left behind by the loop cell.
# NOTE(review): the saved output below shows CPU tensors identical to the first printed batch,
# not the batch from the iteration where the loop breaks — presumably stale output from an
# earlier run; confirm with Restart & Run All.
batch

{'input_ids': tensor([[  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,   102],
         [  101,  1037,  6919,  2210,  2537,  1012,  1026,  7987,  1013,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Switch the model to evaluation mode.
# NOTE(review): this cell is placed after the inference loop cell, so on a top-to-bottom run
# the loop executes before this call — confirm whether it should move ahead of the loop.
model.eval()



In [None]:
# Display the output of the last forward pass executed in the loop cell; the saved output
# shows a SequenceClassifierOutput whose `logits` have one row per example in the batch.
out

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0424,  0.9889],
        [-1.1137,  1.0554]]), hidden_states=None, attentions=None)

In [None]:
# Text-classification pipeline built from the saved model directory; inputs are processed
# in batches of four.
pipe = transformers.pipeline(task="text-classification", model="/kaggle/input/lecture-5-imdb-review-clf/my-model/", batch_size=4)

In [None]:
# Classify ten copies of the same sentence to exercise the pipeline's batching.
sample_reviews = ["I hated how good the movie was."] * 10
pipe(sample_reviews)

[{'label': 'negative', 'score': 0.8399201035499573},
 {'label': 'negative', 'score': 0.8399200439453125},
 {'label': 'negative', 'score': 0.8399201035499573},
 {'label': 'negative', 'score': 0.8399200439453125},
 {'label': 'negative', 'score': 0.8399201035499573},
 {'label': 'negative', 'score': 0.8399200439453125},
 {'label': 'negative', 'score': 0.8399201035499573},
 {'label': 'negative', 'score': 0.8399200439453125},
 {'label': 'negative', 'score': 0.839920163154602},
 {'label': 'negative', 'score': 0.8399200439453125}]