In [None]:
pip install pytorch-pretrained-bert

In [17]:
import re
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
from sklearn.model_selection import train_test_split
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import dataloader, Dataset, DataLoader


# Fixed token length per review: handed to MovieReviewDataset as `padding_size`,
# which stores it as the `max_length` used when encoding each phrase.
PADDING_SIZE: int = 300
# Number of samples per batch; passed to the DataLoaders built from the datasets.
batch_size: int = 32

def _parse_line(line):
    line = line.strip().lower()
    line = line.replace("&nbsp;", " ")
    line = re.sub(r'<br(\s\/)?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces to one
    return line


class MovieReviewDataset(Dataset):
    """Map-style dataset turning (phrase, sentiment) rows into BERT inputs.

    Each item is a tuple ``(input_ids, attention_mask, label)``; the first two
    are 1-D ``torch.long`` tensors of length ``padding_size`` and ``label`` is
    the matching entry of the ``"Sentiment"`` column.
    """

    def __init__(self, dataframe, padding_size: int, tokenizer=None):
        """Store the phrases, labels and tokenizer.

        Args:
            dataframe: frame with a ``'Phrase'`` text column and a
                ``"Sentiment"`` integer column.
            padding_size: exact length of every encoded sequence, including
                the ``[CLS]`` and ``[SEP]`` markers.
            tokenizer: optional object providing ``tokenize`` and
                ``convert_tokens_to_ids``. When omitted, the
                ``'bert-base-uncased'`` tokenizer is loaded. Passing one lets
                several datasets share a single vocabulary.

        Raises:
            ValueError: if ``padding_size`` cannot hold ``[CLS]`` and ``[SEP]``.
        """
        if padding_size < 2:
            raise ValueError("padding_size must be at least 2 to hold [CLS] and [SEP]")
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.tokenizer = tokenizer
        # Labels below are positional (a plain array). Renumber the phrases the
        # same way so item `i` always pairs phrase `i` with label `i`, even when
        # the frame comes from a shuffled split with a non-contiguous index.
        self.reviews = dataframe['Phrase'].reset_index(drop=True)
        self.labels = torch.as_tensor(np.array(dataframe["Sentiment"]), dtype=torch.long)
        self.max_length = padding_size
        self.pad_token_id = self.tokenizer.convert_tokens_to_ids(['[PAD]'])[0]

    def __len__(self):
        """Return the number of phrases."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode phrase ``item`` to fixed-length ids plus an attention mask.

        The encoding is built from ``tokenize`` / ``convert_tokens_to_ids``
        only, since ``encode_plus`` is not offered by the
        ``pytorch_pretrained_bert`` tokenizer this notebook imports.
        """
        # Leave room for the two special tokens, then truncate.
        word_pieces = self.tokenizer.tokenize(self.reviews.iloc[item])[:self.max_length - 2]
        token_ids = self.tokenizer.convert_tokens_to_ids(['[CLS]'] + word_pieces + ['[SEP]'])
        pad_len = self.max_length - len(token_ids)

        input_ids = torch.tensor(token_ids + [self.pad_token_id] * pad_len, dtype=torch.long)
        # 1 marks real tokens, 0 marks padding.
        attention_mask = torch.tensor([1] * len(token_ids) + [0] * pad_len, dtype=torch.long)
        return (input_ids, attention_mask, self.labels[item])
    
    

class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, hidden_size, eps=1e-12):
        """Create a unit scale and zero shift of size ``hidden_size``; ``eps`` guards the square root."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last axis, then apply ``weight`` and ``bias``."""
        centered = x - x.mean(-1, keepdim=True)
        # Biased variance: mean of squared deviations along the last axis.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias
        

class BertForSequenceClassification(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns class probabilities (softmax over the head's logits).
    """

    def __init__(self, num_labels=3):
        """Load ``'bert-base-uncased'`` and build a ``num_labels``-way head on top of it."""
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Take the hyper-parameters from the loaded encoder. The original code
        # read a free `config` name that is never defined in this notebook.
        config = self.bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return per-class probabilities of shape ``(batch, num_labels)``.

        Args:
            input_ids: token ids passed to the encoder.
            token_type_ids: optional segment ids passed to the encoder.
            attention_mask: optional mask passed to the encoder.
            labels: accepted for call-site compatibility; not used here.
        """
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # NOTE(review): probabilities, not logits, are returned. If the training
        # criterion is nn.CrossEntropyLoss it applies log-softmax again — confirm
        # which loss is paired with this model.
        return F.softmax(logits, dim=1)


def training(train_data, model, device, criterion, optimizer, epoch):
    """Run one training epoch and report the mean batch loss and the accuracy.

    Args:
        train_data: iterable yielding ``(data, mask, target)`` batches.
        model: module called as ``model(data, token_type_ids=None,
            attention_mask=mask, labels=target)``; its output is scored per
            class along dimension 1.
        device: device each batch is moved to.
        criterion: loss called as ``criterion(predictions, target)``.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch: epoch number, used only for the progress message.

    Returns:
        ``(mean_loss, accuracy_percent)`` for the epoch.

    Raises:
        ValueError: if ``train_data`` yields no samples.
    """
    correct: int = 0
    data_len: int = 0
    num_batches: int = 0
    total_loss: float = 0.0
    start_time = time.time()
    print('Training Epoch: {}'.format(epoch))
    model.train()
    for batch_idx, (data, mask, target) in enumerate(train_data):
        if batch_idx % 250 == 0:
            print('.', end='')  # lightweight progress marker

        data = data.to(device)
        mask = mask.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        target_predictions = model(data, token_type_ids=None, attention_mask=mask, labels=target)
        loss = criterion(target_predictions, target)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        predicted_target = target_predictions.argmax(dim=1, keepdim=True)
        correct += predicted_target.eq(target.view_as(predicted_target)).sum().item()
        data_len += len(data)
        num_batches += 1

    # An empty loader used to fail on an unbound loop variable / zero division.
    if num_batches == 0 or data_len == 0:
        raise ValueError("train_data yielded no samples; nothing to train on")

    total_loss /= num_batches
    acc = 100.00 * (correct / data_len)
    print("\nTrain Loss: {:.6f}     Train Accuracy: {:.2f}%        Training Time: {:.2f} min".format(total_loss, acc, (time.time()-start_time)/60.00))
    return total_loss, acc


In [11]:
SPLIT_SEED = 42  # fixed seed so the train/test partition is identical on every run

dataframe = pd.read_csv("train.tsv", sep="\t")
# Optional text clean-up, currently disabled:
# dataframe["Phrase_filter"] = dataframe.Phrase.apply(lambda rec: _parse_line(rec))

# The shuffled parts keep their original row labels; renumber them from 0 so
# integer lookups inside the dataset address rows by position.
train_frame, test_frame = (
    part.reset_index(drop=True)
    for part in train_test_split(dataframe, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
)

# Each stage gets its own name; only the final loaders keep the names
# `train_data` / `test_data` that the rest of the notebook uses.
train_dataset = MovieReviewDataset(train_frame, PADDING_SIZE)
test_dataset = MovieReviewDataset(test_frame, PADDING_SIZE)

train_data = DataLoader(train_dataset, batch_size, shuffle=True)
# Held-out batches are kept in a stable order; shuffling them adds nothing.
test_data = DataLoader(test_dataset, batch_size, shuffle=False)

In [13]:
test_data

<torch.utils.data.dataloader.DataLoader at 0x7f58e8783b50>

In [None]:
# NOTE(review): PAD is not referenced in this cell — presumably a padding length
# consumed elsewhere; confirm.
PAD = 40
VAL_SIZE = 30000  # rows held out of the training file for validation
VAL_SEED = 42     # fixed seed so the hold-out set is reproducible

Test = pd.read_csv(TEST_CSV_PATH, index_col = 0)
# Renumber rows 0..n-1 so the labels used to remove the hold-out rows are unique.
train_full = pd.read_csv(TRAIN_CSV_PATH, index_col = 0).reset_index(drop = True)

val_rows = train_full.sample(n = VAL_SIZE, random_state = VAL_SEED)
# Drop by the sampled labels BEFORE resetting them. Resetting first (as the
# original did) removed rows 0..VAL_SIZE-1 instead and left the validation
# rows inside the training set.
Train = train_full.drop(index = val_rows.index).reset_index(drop = True)
Val = val_rows.reset_index(drop = True)

train_data = SentimentDataset(Train)
val_data = SentimentDataset(Val)
test_data = SentimentDataset(Test)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig
import torch.nn.functional as F

class BertForSentiment(nn.Module):
    """Pretrained BERT encoder with dropout and a linear head; emits log-probabilities."""

    def __init__(self, vocab_file,
                 num_classes,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 hidden_dropout_prob=0.1):
        """Load the encoder and build the classification head.

        Args:
            vocab_file: value forwarded to ``BertModel.from_pretrained``.
            num_classes: number of output classes.
            hidden_size, num_hidden_layers, num_attention_heads: stored in
                ``self.config`` only; the loaded encoder is built from
                ``from_pretrained`` and is not given this config.
            hidden_dropout_prob: dropout rate applied to the pooled output
                (also stored in ``self.config``).
        """
        super().__init__()
        self.config = BertConfig(hidden_size=hidden_size,
                                 num_hidden_layers=num_hidden_layers,
                                 num_attention_heads=num_attention_heads,
                                 hidden_dropout_prob=hidden_dropout_prob)
        self.bert = BertModel.from_pretrained(vocab_file)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        # Size the head from the encoder that was actually loaded, so its input
        # width always matches the pooled output.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return log-probabilities over the classes for each input sequence.

        ``labels`` is accepted for call-site compatibility and is not used.
        """
        # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        # Explicit class axis: the implicit-dim form of log_softmax is deprecated.
        log_logits = F.log_softmax(logits, dim=-1)
        return log_logits