In [None]:
# Install Hugging Face transformers into the runtime.
# NOTE(review): the version is not pinned; the recorded output of this cell shows
# transformers-3.0.1 being installed, so later cells were only exercised against that release.
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/82/25/89050e69ed53c2a3b7f8c67844b3c8339c1192612ba89a172cf85b298948/transformers-3.0.1-py3-none-any.whl (757kB)
[K     |▍                               | 10kB 24.6MB/s eta 0:00:01[K     |▉                               | 20kB 5.7MB/s eta 0:00:01[K     |█▎                              | 30kB 5.5MB/s eta 0:00:01[K     |█▊                              | 40kB 6.3MB/s eta 0:00:01[K     |██▏                             | 51kB 6.2MB/s eta 0:00:01[K     |██▋                             | 61kB 6.9MB/s eta 0:00:01[K     |███                             | 71kB 7.2MB/s eta 0:00:01[K     |███▌                            | 81kB 7.0MB/s eta 0:00:01[K     |████                            | 92kB 7.1MB/s eta 0:00:01[K     |████▎                           | 102kB 7.4MB/s eta 0:00:01[K     |████▊                           | 112kB 7.4MB/s eta 0:00:01[K     |█████▏                          | 122kB 7.4M

In [None]:
# Copy kaggle.json (expected in the current working directory) into ~/.kaggle/,
# creating that directory first if it does not exist.
# NOTE(review): presumably this is where the kaggle CLI used below looks for its API token — confirm.
! mkdir -p ~/.kaggle/
! cp kaggle.json ~/.kaggle/

In [None]:
# Download the IMDB 50k movie-review dataset archive with the kaggle CLI
# (the recorded output shows the zip being written to /content).
! kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 35% 9.00M/25.7M [00:00<00:01, 12.2MB/s]
100% 25.7M/25.7M [00:00<00:00, 29.4MB/s]


In [None]:
# Extract the downloaded archive; per the recorded output it contains "IMDB Dataset.csv",
# the file name TRAINING_FILE is set to in the configuration cell.
! unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
import warnings
# Install a blanket "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## Configurations

In [None]:
import transformers
# Token length passed as max_length to the tokenizer in BERTDataset (padding and truncation are both enabled there).
MAX_LEN = 512
# Batch sizes handed to the training and validation DataLoaders in run().
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# File that run() writes the best-scoring model state_dict to.
MODEL_PATH = "model.bin"
# Number of train/evaluate rounds performed by run().
EPOCHS = 10
# CSV read by run(); its `review` and `sentiment` columns are used.
TRAINING_FILE = "IMDB Dataset.csv"
# Tokenizer instance that BERTDataset stores and calls encode_plus on.
# NOTE(review): from_pretrained presumably downloads the vocabulary on first use — confirm network access.
TOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

## Dataset

In [None]:
import torch
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        review: Indexable sequence of review texts (items are passed through ``str``).
        target: Indexable sequence of labels aligned with ``review``.
        tokenizer: Object exposing ``encode_plus``. Defaults to the
            module-level ``TOKENIZER`` when omitted.
        max_len: Length passed as ``max_length`` to the tokenizer (padding and
            truncation are both enabled). Defaults to the module-level ``MAX_LEN``.

    Each item is a dict of tensors with keys ``ids``, ``mask``,
    ``token_type_ids`` (all ``torch.long``) and ``targets`` (``torch.float``).
    """

    def __init__(self, review, target, tokenizer=None, max_len=None):
        self.review = review
        self.target = target
        # Fall back to the notebook-wide configuration only when the caller
        # did not inject an explicit tokenizer / length.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.review)

    def __getitem__(self, item):
        """Tokenize the review at position ``item`` and return it as tensors."""
        # Collapse any run of whitespace (newlines, tabs, ...) into single spaces.
        review = " ".join(str(self.review[item]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,  # no second segment: single-sentence input
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True` flag with the same effect.
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

## Engine

In [None]:
import torch.nn as nn
from tqdm import tqdm


def loss_fn(outputs, targets):
    """Binary cross-entropy computed directly on logits.

    ``targets`` is reshaped to a single column with ``view(-1, 1)`` before
    being compared against ``outputs``.
    """
    criterion = nn.BCEWithLogitsLoss()
    column_targets = targets.view(-1, 1)
    return criterion(outputs, column_targets)


def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one pass over ``data_loader`` in training mode.

    For each batch the tensors are moved to ``device``, the model is called,
    ``loss_fn`` is evaluated and back-propagated, and then both the optimizer
    and the scheduler are stepped. Nothing is returned.
    """
    model.train()

    progress = tqdm(enumerate(data_loader), total=len(data_loader))
    for _, batch in progress:
        # Move every field of the batch to the target device with its expected dtype.
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()

        # The scheduler is advanced once per batch, right after the optimizer.
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    """Score every batch of ``data_loader`` without tracking gradients.

    Returns:
        A pair ``(outputs, targets)`` of Python lists: the sigmoid of the
        model outputs and the corresponding targets, accumulated batch by batch.
    """
    model.eval()
    all_targets, all_probs = [], []

    with torch.no_grad():
        progress = tqdm(enumerate(data_loader), total=len(data_loader))
        for _, batch in progress:
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            probs = torch.sigmoid(logits)

            # Bring results back to host memory as plain Python lists.
            all_targets += targets.cpu().detach().numpy().tolist()
            all_probs += probs.cpu().detach().numpy().tolist()

    return all_probs, all_targets

## Model

In [None]:
class BERTBaseUncased(nn.Module):
    """``bert-base-uncased`` encoder followed by dropout and a one-logit linear head."""

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded encoder's config instead of a literal 768.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per example.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Tensor passed as ``attention_mask``.
            token_type_ids: Tensor passed as ``token_type_ids``.

        Returns:
            The output of the linear head applied to the dropped-out second
            element of the encoder output.
        """
        encoder_out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take element 1 by index rather than tuple-unpacking: this works both
        # when the encoder returns a plain tuple and when it returns a dict-like
        # output object (iterating/unpacking such an object yields its keys,
        # not its tensors).
        pooled = encoder_out[1]
        return self.out(self.bert_drop(pooled))

## Training

In [None]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np

from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


def run():
    """Fine-tune the BERT classifier on the review CSV and keep the best weights.

    Steps:
        1. Read ``TRAINING_FILE``, map ``sentiment`` to 1 ("positive") / 0 (anything else).
        2. Make a stratified 90/10 train/validation split (fixed ``random_state``).
        3. Build datasets and loaders, the model, a grouped-weight-decay AdamW
           optimizer and a linear schedule without warmup.
        4. For ``EPOCHS`` rounds: train, evaluate, print validation accuracy and
           save the state_dict to ``MODEL_PATH`` whenever accuracy improves.

    Returns nothing; results are printed and written to ``MODEL_PATH``.
    """
    dfx = pd.read_csv(TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(
        review=df_train.review.values, target=df_train.sentiment.values
    )

    # Reshuffle the training examples every epoch; without shuffle=True the
    # model sees the batches in the exact same order in every epoch.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
    )

    valid_dataset = BERTDataset(
        review=df_valid.review.values, target=df_valid.sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )

    # Use the GPU when present, otherwise fall back to CPU instead of crashing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # train_fn steps the scheduler once per batch, so the schedule length is
    # the real number of batches per epoch (including a final partial batch)
    # times the number of epochs.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    # model = nn.DataParallel(model) if you have multiple GPUs

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        # eval_fn returns probabilities; threshold at 0.5 to get class predictions.
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        # Only overwrite the checkpoint when validation accuracy improves.
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = accuracy



In [None]:
# Start fine-tuning: run() prints the validation accuracy after each epoch and
# saves the best-scoring weights to MODEL_PATH.
run();

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


 82%|████████▏ | 4586/5625 [33:32<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4587/5625 [33:32<07:34,  2.28it/s][A[A[A


 82%|████████▏ | 4588/5625 [33:33<07:33,  2.29it/s][A[A[A


 82%|████████▏ | 4589/5625 [33:33<07:33,  2.28it/s][A[A[A


 82%|████████▏ | 4590/5625 [33:34<07:32,  2.29it/s][A[A[A


 82%|████████▏ | 4591/5625 [33:34<07:32,  2.29it/s][A[A[A


 82%|████████▏ | 4592/5625 [33:34<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4593/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4594/5625 [33:35<07:33,  2.27it/s][A[A[A


 82%|████████▏ | 4595/5625 [33:36<07:33,  2.27it/s][A[A[A


 82%|████████▏ | 4596/5625 [33:36<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4597/5625 [33:37<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4598/5625 [33:37<07:30,  2.28it/s][A[A[A


 82%|████████▏ | 4599/5625 [33:38<07:30,  2.28it/s][A[A[A


 82%|████████▏ | 4600/5625 [33:38<07:29,  2.28it/s]

Accuracy Score = 0.9442


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


 82%|████████▏ | 4586/5625 [33:32<07:36,  2.28it/s][A[A[A


 82%|████████▏ | 4587/5625 [33:32<07:36,  2.27it/s][A[A[A


 82%|████████▏ | 4588/5625 [33:33<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4589/5625 [33:33<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4590/5625 [33:34<07:34,  2.28it/s][A[A[A


 82%|████████▏ | 4591/5625 [33:34<07:33,  2.28it/s][A[A[A


 82%|████████▏ | 4592/5625 [33:34<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4593/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4594/5625 [33:35<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4595/5625 [33:36<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4596/5625 [33:36<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4597/5625 [33:37<07:30,  2.28it/s][A[A[A


 82%|████████▏ | 4598/5625 [33:37<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4599/5625 [33:37<07:30,  2.28it/s][A[A[A


 82%|████████▏ | 4600/5625 [33:38<07:30,  2.28it/s]

Accuracy Score = 0.9458


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


 82%|████████▏ | 4586/5625 [33:32<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4587/5625 [33:33<07:36,  2.27it/s][A[A[A


 82%|████████▏ | 4588/5625 [33:33<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4589/5625 [33:34<07:34,  2.28it/s][A[A[A


 82%|████████▏ | 4590/5625 [33:34<07:33,  2.28it/s][A[A[A


 82%|████████▏ | 4591/5625 [33:34<07:33,  2.28it/s][A[A[A


 82%|████████▏ | 4592/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4593/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4594/5625 [33:36<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4595/5625 [33:36<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4596/5625 [33:37<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4597/5625 [33:37<07:31,  2.28it/s][A[A[A


 82%|████████▏ | 4598/5625 [33:37<07:30,  2.28it/s][A[A[A


 82%|████████▏ | 4599/5625 [33:38<07:29,  2.28it/s][A[A[A


 82%|████████▏ | 4600/5625 [33:38<07:28,  2.28it/s]

Accuracy Score = 0.9468


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 82%|████████▏ | 4585/5625 [33:32<07:34,  2.29it/s][A[A[A


 82%|████████▏ | 4586/5625 [33:33<07:35,  2.28it/s][A[A[A


 82%|████████▏ | 4587/5625 [33:33<07:33,  2.29it/s][A[A[A


 82%|████████▏ | 4588/5625 [33:33<07:33,  2.29it/s][A[A[A


 82%|████████▏ | 4589/5625 [33:34<07:32,  2.29it/s][A[A[A


 82%|████████▏ | 4590/5625 [33:34<07:32,  2.29it/s][A[A[A


 82%|████████▏ | 4591/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4592/5625 [33:35<07:32,  2.28it/s][A[A[A


 82%|████████▏ | 4593/5625 [33:36<07:31,  2.29it/s][A[A[A


 82%|████████▏ | 4594/5625 [33:36<07:30,  2.29it/s][A[A[A


 82%|████████▏ | 4595/5625 [33:37<07:30,  2.29it/s][A[A[A


 82%|████████▏ | 4596/5625 [33:37<07:30,  2.29it/s][A[A[A


 82%|████████▏ | 4597/5625 [33:37<07:29,  2.29it/s][A[A[A


 82%|████████▏ | 4598/5625 [33:38<07:29,  2.29it/s][A[A[A


 82%|████████▏ | 4599/5625 [33:38<07:29,  2.28it/s][

Accuracy Score = 0.9466



