In [4]:
import re
import torch
import pandas as pd
from torch.utils.data import dataloader, Dataset
from pytorch_pretrained_bert import BertTokenizer

PAD: int = 40

def _parse_imdb_line(line):
    line = line.strip().lower()
    line = line.replace("&nbsp;", " ")
    line = re.sub(r'<br(\s\/)?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces to one
    return line


class SentimentDataset(Dataset):

    def __init__(self, dataframe):
        self.df = dataframe
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        label = torch.tensor(self.df['Sentiment'][idx], dtype = torch.uint8) 
        tokens = self.tokenizer.encode_plus(
                        text = self.df['Phrase_filter'][idx],
                        add_special_tokens = True, 
                        max_length = PAD, 
                        pad_to_max_length = True, 
                        return_attention_mask = True, 
                        return_tensors = 'pt', 
                        return_token_type_ids = False)
        ids = tokens['input_ids']
        attn_mask = tokens['attention_mask']

        return ids, label, attn_mask

In [5]:
df = pd.read_csv("train.tsv", sep="\t")
df.Phrase_filter = df.Phrase.apply(lambda rec: _parse_imdb_line(rec))
data_set = SentimentDataset(dataframe=df)

  df.Phrase_filter = df.Phrase.apply(lambda rec: _parse_imdb_line(rec))


In [None]:
PAD = 40
Test = pd.read_csv(TEST_CSV_PATH, index_col = 0)
Train = pd.read_csv(TRAIN_CSV_PATH, index_col = 0)
Val = Train.sample(n = 30000)
Val = Val.reset_index(drop = True)
Train = Train.drop(Val.index).reset_index(drop = True)

train_data = SentimentDataset(Train)
val_data = SentimentDataset(Val)
test_data = SentimentDataset(Test)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig
import torch.nn.functional as F

class BertForSentiment(nn.Module):

    def __init__(self, vocab_file,
                 num_classes,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 hidden_dropout_prob=0.1):
        super(BertForSentiment, self).__init__()
        self.config = BertConfig(hidden_size=hidden_size,
                                 num_hiddel_layers=num_hidden_layers,
                                 num_attention_heads=num_attention_heads,
                                 hidden_dropout_prob=hidden_dropout_prob)
        self.bert = BertModel.from_pretrained(vocab_file)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)num_classes
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        log_logits = F.log_softmax(logits)
        return log_logits