In [1]:
#!pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
#!pip install transformers

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

In [4]:
# Load the CSV that provides the `review` text column and the
# `review_sentiments` label column used by the splits below.
reviews_csv = '../data/amazon_large_stratified.csv'
df = pd.read_csv(reviews_csv)

In [5]:
# First split: keep half of the rows for training and hold the other half
# back (it is divided into validation and test sets afterwards).
# Stratifying on the label keeps the class ratio equal in both halves.
all_reviews = df['review'].tolist()
all_sentiments = df['review_sentiments'].tolist()

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_reviews,
    all_sentiments,
    test_size=0.5,
    stratify=all_sentiments,
    random_state=42,
)

In [6]:
# Second split: divide the held-out rows into validation and test sets.
# `test_size` is an absolute row count here (250000 rows go to the test set),
# and the split is stratified on the held-out labels.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=250000,
    stratify=temp_labels,
    random_state=42,
)

In [7]:
# Sanity check: number of labels in the train / validation / test splits.
print(*map(len, (train_labels, val_labels, test_labels)))

500000 250000 250000


In [8]:
# Sanity check: number of texts in the train / validation / test splits
# (should line up with the label counts printed above).
print(*map(len, (train_texts, val_texts, test_texts)))

500000 250000 250000


In [9]:
from transformers import DistilBertTokenizerFast

# Fetch the fast tokenizer that belongs to the uncased DistilBERT base checkpoint.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [10]:
# Train/validation tokenisation is disabled in this notebook; only the test
# split is encoded because only inference is run below.
# NOTE(review): the disabled calls use max_length = 200 while the test call
# uses 100 - confirm this matches the length the checkpoint was trained with.
#train_encodings = tokenizer(train_texts, truncation=True, 
#                            padding=True, max_length = 200)
#val_encodings = tokenizer(val_texts, truncation=True, 
#                          padding=True, max_length = 200)

# Truncate every review to at most 100 tokens and pad the batch to a common length.
test_encodings = tokenizer(
    test_texts,
    max_length=100,
    padding=True,
    truncation=True,
)

In [11]:
# Re-print the label counts per split after tokenisation (same check as earlier).
print(*(len(split_labels) for split_labels in (train_labels, val_labels, test_labels)))

500000 250000 250000


In [12]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Build the sequence-classification model from the base DistilBERT checkpoint.
# The classifier layers are newly initialised at this point (see the warning
# printed below); the fine-tuned weights are loaded in the next cell.
base_checkpoint = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(base_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [13]:
# Load the weights of the best fine-tuned checkpoint into the model.
# map_location='cpu' allows a checkpoint that was saved from a GPU run to be
# restored on a CPU-only machine; the model is moved to the target device in a
# later cell, so loading onto the CPU first loses nothing.
path = 'results/checkpoint-93500/pytorch_model.bin'
state_dict = torch.load(path, map_location='cpu')
# Kept as the last expression so the key-matching report is displayed.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [14]:
# Number of test examples pushed through the model per forward pass at inference time.
test_batch_size = 25_000

In [None]:
# Run the fine-tuned model over the tokenised test set in batches of
# `test_batch_size` and collect the softmax class probabilities for every
# example in `pt_preds_proba` (a 2-D NumPy array, one row per test example).
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
model = model.to(device)
model.eval()  # make sure dropout is disabled while predicting

n_test = len(test_encodings['input_ids'])
batch_probas = []  # one probability array per batch; joined once after the loop

# Batch boundaries are derived from the data length rather than a hard-coded
# batch count, so every example is scored exactly once even when the test set
# size is not a multiple of the batch size (the last batch is simply shorter).
for i, start in enumerate(range(0, n_test, test_batch_size)):

    print('test batch:', i + 1)

    stop = start + test_batch_size
    test_seq = torch.tensor(test_encodings['input_ids'][start:stop])
    test_mask = torch.tensor(test_encodings['attention_mask'][start:stop])

    # get predictions for test data
    with torch.no_grad():
        preds = model(input_ids=test_seq.to(device),
                      attention_mask=test_mask.to(device))
        pt_preds = nn.functional.softmax(preds.logits, dim=-1)

    # Copy to host memory before converting: a CUDA tensor cannot be turned
    # into a NumPy array directly.
    batch_probas.append(pt_preds.cpu().numpy())

# Single concatenation instead of re-stacking the growing array every iteration.
pt_preds_proba = np.concatenate(batch_probas, axis=0) if batch_probas else None

Using cpu device
test batch: 1
test batch: 2
test batch: 3
test batch: 4
test batch: 5
test batch: 6


In [18]:
# Materialise the complete test set (token ids, attention mask, labels) as tensors.
test_seq, test_mask = (
    torch.tensor(test_encodings[field])
    for field in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_labels)

In [19]:
# Predicted class = index of the highest probability in each row.
np_preds = pt_preds_proba.argmax(axis=1)

# Per-class precision / recall / F1 of the predictions against the true test labels.
true_labels = np.array(test_labels)
report = classification_report(true_labels, np_preds)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90    125000
           1       0.90      0.92      0.91    125000

    accuracy                           0.90    250000
   macro avg       0.91      0.90      0.90    250000
weighted avg       0.91      0.90      0.90    250000



In [20]:
# Confusion table: rows are the true labels, columns are the predicted labels.
pd.crosstab(index=np.array(test_labels), columns=np_preds)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,111583,13417
1,10379,114621


In [21]:
# Display the shape of the prediction vector (one entry per test example).
np.shape(np_preds)

(250000,)

In [22]:
# Persist only the model's parameters (its state dict) to disk.
torch.save(obj=model.state_dict(), f='distillbert_large_saved_weights.pt')

In [26]:
# Persist the whole model object (not just its weights) to disk.
torch.save(obj=model, f='distillbert_large_model.pt')