Hello! This notebook is a spin-off of our other notebook, [Challenge - Botando pra quebrar](https://www.kaggle.com/andreispurim/challenge-botando-pra-quebrar). Please take a look at that one first.

In [None]:
!pip install pytorch-nlp
!pip install pytorch_pretrained_bert 

import string
import pandas
import random
import numpy
import torch
import time
import sys
import gc
import re
%matplotlib inline

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertModel
from sklearn.pipeline import make_pipeline
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from collections import Counter
from torch.optim import Adam
from torch import nn

# This notebook is heavily inspired by Atul Anand's BERT tutorial; check it out.
# The class below is the BERT binary classifier developed by Mr. Anand.
class BertBinaryClassifier(nn.Module):
    """Binary classification head on top of a pretrained BERT encoder.

    The pooled BERT output is passed through dropout, a single-unit linear
    layer and a sigmoid, so the forward pass yields one value in (0, 1) per
    input row.
    """

    def __init__(self, dropout=0.1):
        """Load ``bert-base-uncased`` and build the classification head.

        Args:
            dropout: Drop probability applied to the pooled BERT output.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # 768 inputs -> 1 output; presumably 768 is the width of the pooled
        # bert-base-uncased output.
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid score for each row of ``tokens``.

        Args:
            tokens: Token-id tensor handed to BERT unchanged.
            masks: Optional attention mask forwarded to BERT.

        Returns:
            The sigmoid of the linear layer's output (last dimension 1).
        """
        # Only the pooled output is used; the encoded layers are discarded.
        _encoded, pooled = self.bert(
            tokens, attention_mask=masks, output_all_encoded_layers=False
        )
        return self.sigmoid(self.linear(self.dropout(pooled)))

def Tokenize_and_mask_input(n_reviews: int = 5000, max_len: int = 512) -> tuple:
    """Load IMDB reviews and turn them into padded BERT inputs with masks.

    Reads the Kaggle IMDB CSV, cleans the first ``n_reviews`` reviews,
    tokenizes them with the ``bert-base-uncased`` tokenizer, splits them 80/20
    into train and test sets (``random_state=0``) and converts every review
    into a fixed-length row of token ids plus a matching attention mask.

    Args:
        n_reviews: How many reviews, counted from the top of the CSV, to use.
        max_len: Length of every id/mask row, including the ``[CLS]`` and
            ``[SEP]`` markers. NOTE(review): the notebook always used 512;
            larger values presumably exceed what bert-base-uncased accepts,
            confirm before raising it.

    Returns:
        A ``(train, test)`` pair of dicts, each with the keys ``'ids'``
        (padded token-id array produced by ``pad_sequences``), ``'masks'``
        (nested lists of floats, 1.0 where the id is greater than zero and
        0.0 elsewhere) and ``'y'`` (NumPy array holding 1 for ``'positive'``
        and 0 for anything else).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room
            for the ``[CLS]`` and ``[SEP]`` markers.
    """
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to fit [CLS] and [SEP]')

    # Built once and reused for every review: deletes all ASCII punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clear_sentence(sentence: str) -> str:
        """Replace ``<br />`` tags with spaces, drop punctuation, lowercase."""
        sentence = sentence.replace('<br />', ' ')
        sentence = sentence.translate(punctuation_table)
        return sentence.lower()

    def encode(token_lists):
        """Map token lists to ids and pad/truncate each row to ``max_len``."""
        ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
        return pad_sequences(ids, maxlen=max_len, truncating="post", padding="post", dtype="int")

    def attention_mask(token_ids):
        """Return 1.0 for every id greater than zero and 0.0 for the rest."""
        return (token_ids > 0).astype(float).tolist()

    # Free whatever earlier cells left behind before loading the data.
    clear_output()
    torch.cuda.empty_cache()
    gc.collect()
    Reviews = pandas.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    print('Clearing and Tokenizing sentences...')

    # Clean the text, then wrap every review in [CLS] ... [SEP]. The token
    # list is cut to max_len - 2 so the two markers always fit.
    reviews = Reviews['review'].iloc[:n_reviews].apply(clear_sentence)
    x = [
        ['[CLS]'] + tokenizer.tokenize(text)[:max_len - 2] + ['[SEP]']
        for text in reviews
    ]
    # Binary target: 1 for 'positive', 0 otherwise.
    sentiment = Reviews['sentiment'].iloc[:n_reviews]
    y = (sentiment == 'positive').values.astype('int64')

    # Separate
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    # Convert the tokens to ids and pad/truncate every row to max_len
    print('Applying masks...')
    train_token_ids = encode(X_train)
    test_token_ids = encode(X_test)

    # Bundle ids, masks and labels into dicts to make access easier
    train = {'ids': train_token_ids, 'masks': attention_mask(train_token_ids), 'y': y_train}
    test = {'ids': test_token_ids, 'masks': attention_mask(test_token_ids), 'y': y_test}
    return train, test

def BERT_make_model(data: dict) -> BertBinaryClassifier:
    """Build the classifier, move it to the available device and smoke-test it.

    Args:
        data: Dict with an ``'ids'`` entry of padded token ids; its first
            three rows are used for a single warm-up forward pass.

    Returns:
        The freshly constructed ``BertBinaryClassifier`` on the GPU when CUDA
        is available, otherwise on the CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # .to(device) instead of .cuda() so that CPU-only hosts do not crash.
    model = BertBinaryClassifier().to(device)

    # Warm-up on a few rows of the data that was passed in (not a global).
    # The outputs are discarded, so no autograd graph is needed.
    sample = torch.tensor(data['ids'][:3]).to(device)
    with torch.no_grad():
        model(sample)
    torch.cuda.empty_cache()
    return model

def Torch_make_dataloader(data: dict, batch_size: int = 4) -> torch.utils.data.dataloader.DataLoader:
    """Wrap token ids, masks and labels in a sequential ``DataLoader``.

    Args:
        data: Dict with the keys ``'ids'``, ``'masks'`` and ``'y'``. Labels
            may be any array-like; they are reshaped into a float column.
        batch_size: Number of rows per batch (defaults to 4).

    Returns:
        A ``DataLoader`` yielding ``(token_ids, masks, labels)`` batches in
        the original row order.
    """
    # Makes ids and masks tensors
    token_tensor = torch.tensor(data['ids'])
    masks_tensor = torch.tensor(data['masks'])
    # One label per row as a float column, so it lines up with the model's
    # single-unit output.
    y_tensor = torch.tensor(numpy.asarray(data['y']).reshape(-1, 1)).float()

    dataset = TensorDataset(token_tensor, masks_tensor, y_tensor)
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    
def BERT_train(
    model: BertBinaryClassifier,
    data: dict,
    epochs: int = 10,
    lr: float = 3e-6,
) -> BertBinaryClassifier:
    """Fine-tune ``model`` on ``data`` with Adam and binary cross-entropy.

    Args:
        model: Classifier to train; it is updated in place.
        data: Dict with the keys ``'ids'``, ``'masks'`` and ``'y'``.
        epochs: Number of full passes over the data (defaults to 10).
        lr: Adam learning rate (defaults to 3e-6).

    Returns:
        The same ``model`` instance after training.
    """
    # NOTE(review): the shared loader iterates in a fixed order; consider
    # shuffling the training batches.
    dataloader = Torch_make_dataloader(data)

    # Send batches wherever the model's weights already live.
    device = next(model.parameters()).device

    optimizer = Adam(model.parameters(), lr=lr)
    # The loss module is stateless, so one instance serves every step.
    loss_func = nn.BCELoss()

    for epoch_num in range(epochs):
        model.train()

        for batch_data in dataloader:
            token_ids, masks, labels = (t.to(device) for t in batch_data)

            # The model ends in a sigmoid, so these are probabilities.
            probabilities = model(token_ids, masks)
            batch_loss = loss_func(probabilities, labels)

            optimizer.zero_grad()
            batch_loss.backward()

            # Cap the gradient norm at 1.0 before the update.
            clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
            optimizer.step()

            clear_output(wait=True)
            print('Epoch: ', epoch_num + 1)

    clear_output()
    return model

def BERT_test(model: BertBinaryClassifier, data: dict):
    """Evaluate ``model`` on ``data`` and print a classification report.

    Args:
        model: Trained classifier; it is switched to evaluation mode.
        data: Dict with the keys ``'ids'``, ``'masks'`` and ``'y'``.

    Returns:
        None. The report from ``classification_report`` is printed.
    """
    dataloader = Torch_make_dataloader(data)
    # Send batches wherever the model's weights already live.
    device = next(model.parameters()).device
    model.eval()
    predicted = []

    with torch.no_grad():
        for token_ids, masks, _labels in dataloader:
            probabilities = model(token_ids.to(device), masks.to(device))
            # Threshold at 0.5 and store plain 0/1 ints so the predictions
            # have the same type as the labels in data['y'].
            predicted.extend((probabilities[:, 0] > 0.5).long().cpu().tolist())

    print(classification_report(data['y'], predicted))

# Run the whole pipeline end to end and report the wall-clock time it took.
start_time = time.time()

# Load, clean and tokenize the reviews into train/test dicts
train, test = Tokenize_and_mask_input()

# Build the classifier
model = BERT_make_model(train)

# Fine-tune it on the training split
model = BERT_train(model, train)

# Print the classification report for the held-out split
BERT_test(model, test)

print('time:', f'{round(time.time() - start_time)}s')