In [1]:
#!pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
#!pip install transformers

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

In [4]:
# Load the review dataset from CSV; later cells read its 'review' and
# 'review_sentiments' columns.
df = pd.read_csv(filepath_or_buffer='../data/amazon_large_stratified.csv')

In [5]:
# First split: half of the rows become the training set, the other half is
# held out ("temp") to be divided again later. Stratifying on the label keeps
# the class proportions the same in both halves; the fixed seed makes the
# split repeatable.
all_reviews = df['review'].tolist()
all_sentiments = df['review_sentiments'].tolist()
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_reviews,
    all_sentiments,
    test_size=0.5,
    random_state=42,
    stratify=all_sentiments,
)

In [6]:
# Second split: divide the held-out portion into validation and test sets.
# test_size is an absolute sample count here (250000 rows go to the test set),
# again stratified on the label and seeded for repeatability.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=250000,
    random_state=42,
    stratify=temp_labels,
)

In [7]:
# Sanity check: number of labels in the train / validation / test splits.
print(*(len(split) for split in (train_labels, val_labels, test_labels)))

500000 250000 250000


In [8]:
# Sanity check: number of texts in the train / validation / test splits
# (should line up with the label counts).
print(*(len(split) for split in (train_texts, val_texts, test_texts)))

500000 250000 250000


In [9]:
from transformers import DistilBertTokenizerFast

# Instantiate the fast DistilBERT tokenizer from the named pretrained checkpoint.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [10]:
# Only the test split is tokenized in this notebook; the train/validation
# tokenization calls are kept below, commented out.
# NOTE(review): the commented-out calls use max_length=200 while the test call
# uses max_length=100 -- confirm which length the loaded checkpoint expects.
#train_encodings = tokenizer(train_texts, truncation=True, 
#                            padding=True, max_length = 200)
#val_encodings = tokenizer(val_texts, truncation=True, 
#                          padding=True, max_length = 200)
test_tokenizer_options = dict(truncation=True, padding=True, max_length=100)
test_encodings = tokenizer(test_texts, **test_tokenizer_options)

In [11]:
# Re-check the split sizes after tokenization (labels are untouched by it).
print(*(len(split) for split in (train_labels, val_labels, test_labels)))

500000 250000 250000


In [12]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Build the sequence-classification model from the base checkpoint. The
# fine-tuned weights are loaded over it in a later cell.
base_checkpoint = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(base_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [13]:
# Load the weights of the best model checkpoint into `model`.
path = '../data/results/checkpoint-40000/pytorch_model.bin'

# map_location="cpu" makes the load work on machines without a GPU even if the
# checkpoint was saved from CUDA tensors; the model is moved to the selected
# device in a later cell.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
state_dict = torch.load(path, map_location="cpu")

# Last expression of the cell: displays the key-matching report.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [14]:
# Number of leading test examples that are evaluated in the cells below.
test_batch_size = 25_000

In [15]:
# Turn the first `test_batch_size` test examples into tensors: token ids,
# attention mask and the matching labels.
eval_rows = slice(0, test_batch_size)
test_seq = torch.tensor(test_encodings['input_ids'][eval_rows])
test_mask = torch.tensor(test_encodings['attention_mask'][eval_rows])
test_y = torch.tensor(test_labels[eval_rows])

In [16]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU,
# then move the model's parameters to that device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using {} device".format(device))
model = model.to(device)

Using cpu device


In [17]:
# get predictions for test data
#
# The evaluation rows are pushed through the model in small chunks instead of
# one giant forward pass: a single pass over every row has to hold all
# intermediate activations at once, which can exhaust memory (especially on a
# GPU). The per-chunk logits are gathered on the CPU and wrapped in the same
# output type the model returns, so `preds.logits` keeps working downstream.
from transformers.modeling_outputs import SequenceClassifierOutput

inference_chunk_size = 256  # rows per forward pass; lower it if memory is tight

model.eval()  # make sure dropout is disabled for inference
logit_chunks = []
with torch.no_grad():
    for start in range(0, len(test_seq), inference_chunk_size):
        stop = start + inference_chunk_size
        chunk_out = model(test_seq[start:stop].to(device),
                          attention_mask=test_mask[start:stop].to(device))
        # Keep only the logits, on the host, so device memory is released
        # between chunks.
        logit_chunks.append(chunk_out.logits.cpu())

preds = SequenceClassifierOutput(logits=torch.cat(logit_chunks, dim=0))

In [18]:
# Convert the raw logits into per-class probabilities (softmax over the last
# dimension).
pt_preds = torch.softmax(preds.logits, dim=-1)

In [19]:
# Predicted class = index of the largest probability in each row.
# The argmax is taken with torch and the result is explicitly detached, moved
# to host memory and converted, so `np_preds` is a genuine NumPy array. Calling
# np.argmax on the tensor directly leaves a torch tensor behind, which breaks
# the NumPy/sklearn conversion whenever the tensor lives on a CUDA device.
np_preds = pt_preds.detach().argmax(dim=1).cpu().numpy()

# Per-class precision / recall / F1 against the true labels of the same rows.
print(classification_report(np.array(test_labels[:test_batch_size]), np_preds))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90     12394
           1       0.92      0.88      0.90     12606

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



In [20]:
# Confusion table: rows are the true labels of the evaluated examples,
# columns are the predicted classes.
actual_labels = np.array(test_labels[:test_batch_size])
pd.crosstab(actual_labels, np_preds)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11369,1025
1,1515,11091
