In [5]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset, DatasetDict

In [2]:
# Read the data
train = pd.read_parquet('../data/raw/train.parquet')
test = pd.read_parquet('../data/raw/test.parquet')

In [6]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

ds = DatasetDict({
    'train': train, 
    'test': test
})

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 400000
    })
})

In [12]:
train

Dataset({
    features: ['text', 'label', 'label_name'],
    num_rows: 3600000
})

In [10]:
# Load tokenizer
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

## Understand how the tokenizer works

In [11]:
text = train.text[0]

AttributeError: 'Dataset' object has no attribute 'text'

In [7]:
encoded_text = tokenizer(text)
encoded_text

{'input_ids': [101, 24646, 5582, 2130, 2005, 1996, 2512, 1011, 27911, 1024, 2023, 2614, 2650, 2001, 3376, 999, 2009, 23262, 1996, 12411, 7301, 1999, 2115, 2568, 2061, 2092, 1045, 2052, 28667, 8462, 4859, 2009, 2130, 2000, 2111, 2040, 5223, 6819, 2094, 1012, 2208, 2189, 999, 1045, 2031, 2209, 1996, 2208, 10381, 4948, 2080, 2892, 2021, 2041, 1997, 2035, 1997, 1996, 2399, 1045, 2031, 2412, 2209, 2009, 2038, 1996, 2190, 2189, 999, 2009, 10457, 2185, 2013, 13587, 9019, 2075, 1998, 3138, 1037, 4840, 2121, 3357, 2007, 24665, 3686, 7334, 1998, 3969, 3993, 19505, 1012, 2009, 2052, 17894, 3087, 2040, 14977, 2000, 4952, 999, 1034, 1035, 1034, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 

In [8]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'stu', '##ning', 'even', 'for', 'the', 'non', '-', 'gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'sen', '##ery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'rec', '##ome', '##nd', 'it', 'even', 'to', 'people', 'who', 'hate', 'vi', '##d', '.', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'ch', '##ron', '##o', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboard', '##ing', 'and', 'takes', 'a', 'fresh', '##er', 'step', 'with', 'gr', '##ate', 'guitars', 'and', 'soul', '##ful', 'orchestras', '.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^', '_', '^', '[SEP]']


In [9]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] stuning even for the non - gamer : this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^ _ ^ [SEP]


In [10]:
print(tokenizer.vocab_size)
print(tokenizer.model_max_length)
print(tokenizer.model_input_names)

30522
512
['input_ids', 'token_type_ids', 'attention_mask']


### Tokenizing the whole dataset

In [11]:
def tokenize(data):
    return tokenizer(data.text.tolist(), padding=True, truncation=True)

In [12]:
print(tokenize(train.iloc[:2]))

{'input_ids': [[101, 24646, 5582, 2130, 2005, 1996, 2512, 1011, 27911, 1024, 2023, 2614, 2650, 2001, 3376, 999, 2009, 23262, 1996, 12411, 7301, 1999, 2115, 2568, 2061, 2092, 1045, 2052, 28667, 8462, 4859, 2009, 2130, 2000, 2111, 2040, 5223, 6819, 2094, 1012, 2208, 2189, 999, 1045, 2031, 2209, 1996, 2208, 10381, 4948, 2080, 2892, 2021, 2041, 1997, 2035, 1997, 1996, 2399, 1045, 2031, 2412, 2209, 2009, 2038, 1996, 2190, 2189, 999, 2009, 10457, 2185, 2013, 13587, 9019, 2075, 1998, 3138, 1037, 4840, 2121, 3357, 2007, 24665, 3686, 7334, 1998, 3969, 3993, 19505, 1012, 2009, 2052, 17894, 3087, 2040, 14977, 2000, 4952, 999, 1034, 1035, 1034, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1996, 2190, 6050, 2412, 2000, 2505, 1012, 1024, 1045, 1005, 1049, 3752, 1037, 2843, 1997, 4391, 3038, 2008, 2023, 2003, 1996, 2190, 1005, 2208, 6050, 1005, 1998, 1045, 6618, 2008, 1045, 1005, 1040, 4339, 1037, 3319, 2000, 21090, 1037, 2978, 1012, 2023, 1999, 2026, 6728, 5498, 3630, 2003

In [None]:
train_text_encoded = tokenize(train)

## Training a Text Classifier

There are two options to train a text classifier:

1. Use the model as the `feature extractor` and train a classifier on top of it.
2. Train the model end-to-end.

### 1. Transformer as Feature Extractor

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Extracting the last hidden states

In [24]:
# Demonstration
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor size: {inputs['input_ids'].size()}")

inputs = {k:v.to(device) for k,v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
# print(outputs)

print(outputs.last_hidden_state.size())

Input tensor size: torch.Size([1, 104])
torch.Size([1, 104, 768])


In [25]:
def extract_hidden_states(batch):
    inputs = {k: v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    
    return {
        "hidden_state": last_hidden_state[:, 0].cput().numpy()
    }