In [1]:
import torch
import pandas as pd
import numpy as np
import transformers
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import BertTokenizer, AutoModel
from sklearn.metrics import classification_report

# from Preprocessing import preprocess_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = 'bert-base-uncased'

In [3]:
class BertDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = 256

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['data_cleaned']
        labels = self.data.iloc[index][['jid']].values.astype(int)
        encoding = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        # resize the tensors to the same size
        input_ids = nn.functional.pad(input_ids, (0, self.max_length - input_ids.shape[0]), value=0)
        attention_mask = nn.functional.pad(attention_mask, (0, self.max_length - attention_mask.shape[0]), value=0)
        return input_ids, attention_mask, torch.tensor(labels)

In [4]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_checkpoint)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, num_labels),
            nn.Softmax()
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs['last_hidden_state'][:, 0, :]
        x = self.classifier(x)
        return x

In [6]:
df = pd.read_csv('dataset/preprocessed_dataset.csv')

tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# test_dataset = BertDataset(df[:64], tokenizer)

batch_size = 32
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

model = BertClassifier(2)
model.load_state_dict(torch.load('model/model_bert.pt'))

model = model.to(device)
model.eval()


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [7]:
df

Unnamed: 0,review,sentiment,preprocessed_review
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,Basically theres a family where a little boy J...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love in the Time of Money is a ...
...,...,...,...
49577,I thought this movie did a down right good job...,1,I thought this movie did a down right good job...
49578,"Bad plot, bad dialogue, bad acting, idiotic di...",0,Bad plot bad dialogue bad acting idiotic direc...
49579,I am a Catholic taught in parochial elementary...,0,I am a Catholic taught in parochial elementary...
49580,I'm going to have to disagree with the previou...,0,Im going to have to disagree with the previous...


In [8]:
idx_split = 45000
data = df.preprocessed_review[idx_split:]
label = df.sentiment[idx_split:]
pred = []
for input in data:
    encoded_dict = tokenizer.encode_plus(
            input,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
    input_ids = encoded_dict['input_ids'].to('cpu')
    attention_mask = encoded_dict['attention_mask'].to('cpu')

    with torch.no_grad():
        output = model(input_ids, attention_mask = attention_mask)

    pred.append(np.argmax(output.squeeze()).item())


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  return self._call_impl(*args, **kwargs)


In [11]:
report = classification_report(pred, label)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      2253
           1       0.95      0.93      0.94      2329

    accuracy                           0.94      4582
   macro avg       0.94      0.94      0.94      4582
weighted avg       0.94      0.94      0.94      4582

