In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 in favour of
# 'seaborn-v0_8'; pick whichever one this Matplotlib version provides so the
# notebook still runs from a fresh kernel.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# Read the stratified Amazon review sample, using the first CSV column as the index.
df = pd.read_csv(
    '../data/amazon_large_stratified.csv',
    index_col=0,
)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,review,review_sentiments
3837169,Disappointing treatment of a great album. This...,0
2882480,Good read. I have read all the books in this s...,1
4587996,"NOPE.. Just my opinion obviously, but every ti...",0
196423,"Passing gas was the highlight, and I HATED th...",0
94421,not believable **SPOILERS**. I really like his...,0


In [6]:
# First split: half of the reviews go to training, the other half is held out
# (and split again into validation/test afterwards). Stratified on the
# sentiment label, with a fixed seed.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['review'].tolist(),
    df['review_sentiments'].tolist(),
    test_size=0.5,
    random_state=42,
    stratify=df['review_sentiments'].tolist(),
)

In [7]:
# Second split: carve 250000 test examples out of the held-out half; the rest
# becomes the validation set. Stratified on the label, with a fixed seed.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=250000,
    random_state=42,
    stratify=temp_labels,
)

In [8]:
# Tokenizer belonging to the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

In [9]:
# Tokenize the test reviews: truncate to at most 100 tokens and pad.
test_encodings = tokenizer(
    test_texts,
    max_length=100,
    truncation=True,
    padding=True,
)

In [10]:
# Report the size of the train / validation / test label lists on one line.
print(*(len(labels) for labels in (train_labels, val_labels, test_labels)))

500000 250000 250000


In [11]:
# Build a DistilBERT sequence classifier with a two-class head.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [12]:
# Load the weights of the best model checkpoint into `model`.
path = 'results/checkpoint-93750/pytorch_model.bin'
# map_location='cpu' lets a checkpoint that was saved from a GPU run be loaded
# on a CPU-only machine; load_state_dict copies the values into the model's
# existing parameters, so the result is the same either way.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location='cpu'))

<All keys matched successfully>

In [13]:
# Number of test examples pushed through the model per inference batch.
test_batch_size = 10_000

In [14]:
# Run the model over the tokenized test set in batches of `test_batch_size`
# and collect the softmax class probabilities in `pt_preds_proba`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
model = model.to(device)
# Evaluation mode disables dropout so predictions are deterministic.
model.eval()

# Derive the batch boundaries from the data instead of hard-coding the number
# of batches, so every example is scored even if the sizes change.
n_test = len(test_encodings['input_ids'])
batch_probas = []

for i, start in enumerate(range(0, n_test, test_batch_size)):

    print('test batch:', i + 1)

    stop = start + test_batch_size
    test_seq = torch.tensor(test_encodings['input_ids'][start:stop])
    test_mask = torch.tensor(test_encodings['attention_mask'][start:stop])

    # get predictions for test data
    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
        pt_preds = nn.functional.softmax(preds.logits, dim=-1)

    # Move to host memory before converting: a CUDA tensor cannot be turned
    # into a NumPy array directly.
    batch_probas.append(pt_preds.cpu().numpy())

# Stack once at the end rather than re-copying the accumulated array on
# every iteration; stays None when there was nothing to score.
pt_preds_proba = np.vstack(batch_probas) if batch_probas else None

Using cpu device
test batch: 1
test batch: 2
test batch: 3
test batch: 4
test batch: 5
test batch: 6
test batch: 7
test batch: 8
test batch: 9
test batch: 10
test batch: 11
test batch: 12
test batch: 13
test batch: 14
test batch: 15
test batch: 16
test batch: 17
test batch: 18
test batch: 19
test batch: 20
test batch: 21
test batch: 22
test batch: 23
test batch: 24
test batch: 25


In [15]:
# Predicted class = index of the highest probability in each row.
np_preds = pt_preds_proba.argmax(axis=1)
# Per-class precision / recall / F1 against the true test labels.
report = classification_report(np.asarray(test_labels), np_preds)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.83      0.85    125000
           1       0.84      0.87      0.86    125000

    accuracy                           0.85    250000
   macro avg       0.85      0.85      0.85    250000
weighted avg       0.85      0.85      0.85    250000



In [16]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
pd.crosstab(index=np.asarray(test_labels), columns=np_preds)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,103908,21092
1,15792,109208
