In [None]:
pip install pytorch-pretrained-bert

In [9]:
import re
import torch
import pandas as pd
import numpy as np
from torch.utils.data import dataloader, Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer
from sklearn.model_selection import train_test_split

# Length passed to MovieReviewDataset as `padding_size`; it becomes the
# tokenizer's `max_length` for every encoded review.
PADDING_SIZE: int = 300
# Number of examples per batch handed to the DataLoaders.
batch_size: int = 32

def _parse_line(line):
    line = line.strip().lower()
    line = line.replace("&nbsp;", " ")
    line = re.sub(r'<br(\s\/)?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces to one
    return line


class MovieReviewDataset(Dataset):
    """Dataset that pairs tokenized review phrases with sentiment labels.

    Texts are read from the ``Phrase`` column and labels from the
    ``Sentiment`` column of the given dataframe. Both are stored by position,
    so the dataframe may carry any index (for example the shuffled index left
    behind by ``train_test_split``).
    """

    def __init__(self, dataframe, padding_size: int, tokenizer=None):
        """Store the texts, labels and tokenizer.

        Args:
            dataframe: DataFrame with ``Phrase`` and ``Sentiment`` columns.
            padding_size: Value passed to the tokenizer as ``max_length``.
            tokenizer: Optional tokenizer object exposing ``encode_plus``.
                When omitted, ``BertTokenizer.from_pretrained`` is called for
                ``'bert-base-uncased'``; passing one lets several datasets
                share a single tokenizer instance.
        """
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.tokenizer = tokenizer
        # A plain list is indexed by position. Keeping the Series would make
        # `self.reviews[item]` a label lookup, which raises KeyError or returns
        # the wrong row once the dataframe index is no longer 0..len-1, and
        # would fall out of step with the positional `labels` tensor.
        self.reviews = dataframe['Phrase'].tolist()
        self.labels = torch.as_tensor(np.array(dataframe["Sentiment"]), dtype=torch.long)
        self.max_length = padding_size

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns:
            A tuple ``(input_ids, attention_mask, label)`` where the first two
            entries are the tokenizer outputs flattened to 1-D and ``label``
            is the matching element of the label tensor.
        """
        # NOTE(review): `encode_plus` is an API of the `transformers`
        # tokenizers; the BertTokenizer imported from `pytorch_pretrained_bert`
        # may not provide it - confirm which package should supply the tokenizer.
        encoded_dict = self.tokenizer.encode_plus(self.reviews[item],
                                                  add_special_tokens = True,
                                                  max_length = self.max_length,
                                                  pad_to_max_length = True,
                                                  return_attention_mask = True,
                                                  return_tensors = 'pt')
        # view(-1) flattens the tensors returned by the tokenizer to 1-D.
        return (encoded_dict['input_ids'].view(-1),
                encoded_dict['attention_mask'].view(-1),
                self.labels[item])
    
    

In [11]:
dataframe = pd.read_csv("train.tsv", sep="\t")
# Optional text clean-up step (currently disabled):
# dataframe["Phrase_filter"] = dataframe.Phrase.apply(lambda rec: _parse_line(rec))

# A fixed random_state makes the split identical on every Restart & Run All.
train_frame, test_frame = train_test_split(dataframe, test_size=0.2, shuffle=True, random_state=42)
# The shuffled split keeps the original row labels, but the dataset is asked
# for items 0..len-1, so give each split a fresh 0-based index.
train_frame = train_frame.reset_index(drop=True)
test_frame = test_frame.reset_index(drop=True)

# Distinct names per stage (frame -> dataset -> loader) keep the cell readable
# and safe to re-run; `train_data` / `test_data` remain the final DataLoaders.
train_set = MovieReviewDataset(train_frame, PADDING_SIZE)
test_set = MovieReviewDataset(test_frame, PADDING_SIZE)

train_data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# Evaluation batches do not need a random order.
test_data = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [13]:
# Quick sanity check: display the repr of the object bound to `test_data`.
test_data

<torch.utils.data.dataloader.DataLoader at 0x7f58e8783b50>

In [None]:
PAD = 40
Test = pd.read_csv(TEST_CSV_PATH, index_col = 0)
Train = pd.read_csv(TRAIN_CSV_PATH, index_col = 0)

# Hold out 30000 rows for validation. The sampled rows must be dropped from
# Train using their ORIGINAL index labels, i.e. before the validation frame is
# re-indexed; dropping after reset_index would remove labels 0..29999 instead
# of the sampled rows and leave the validation rows inside the training set.
# NOTE(review): assumes the index read from column 0 is unique - confirm.
held_out = Train.sample(n = 30000, random_state = 42)
Train = Train.drop(held_out.index).reset_index(drop = True)
Val = held_out.reset_index(drop = True)

train_data = SentimentDataset(Train)
val_data = SentimentDataset(Val)
test_data = SentimentDataset(Test)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig
import torch.nn.functional as F

class BertForSentiment(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, vocab_file,
                 num_classes,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 hidden_dropout_prob=0.1):
        """Build the encoder and the classification head.

        Args:
            vocab_file: Passed unchanged to ``BertModel.from_pretrained``.
            num_classes: Output size of the linear classifier.
            hidden_size: Input size of the linear classifier; also stored in
                ``self.config``.
            num_hidden_layers: Stored in ``self.config``.
            num_attention_heads: Stored in ``self.config``.
            hidden_dropout_prob: Dropout probability applied to the pooled
                output; also stored in ``self.config``.
        """
        super().__init__()
        # NOTE(review): this config is not passed to `from_pretrained` below,
        # so it does not shape the loaded encoder. `hidden_size` must match
        # the width of the loaded model's pooled output for the classifier to
        # accept it - confirm against the checkpoint being loaded.
        self.config = BertConfig(hidden_size=hidden_size,
                                 num_hidden_layers=num_hidden_layers,
                                 num_attention_heads=num_attention_heads,
                                 hidden_dropout_prob=hidden_dropout_prob)
        self.bert = BertModel.from_pretrained(vocab_file)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return log-softmax scores for the given inputs.

        Args:
            input_ids: Forwarded to the encoder.
            attention_mask: Forwarded to the encoder.
            token_type_ids: Forwarded to the encoder (optional).
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Log-softmax, taken over the last dimension, of the classifier
            output computed from the encoder's pooled output.
        """
        # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        # Name the class dimension explicitly; calling log_softmax without
        # `dim` relies on a deprecated implicit choice and emits a warning.
        log_logits = F.log_softmax(logits, dim=-1)
        return log_logits