<a href="https://colab.research.google.com/github/ryuzakace/DataScience/blob/main/text_classification_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell if you're on Colab or Kaggle: it clones the companion
# `notebooks` repo, changes into it, and installs the chapter requirements.
!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks
from install import *
install_requirements(is_chapter2=True)

Cloning into 'notebooks'...
remote: Enumerating objects: 422, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 422 (delta 0), reused 5 (delta 0), pack-reused 416[K
Receiving objects: 100% (422/422), 24.97 MiB | 24.58 MiB/s, done.
Resolving deltas: 100% (190/190), done.
/content/notebooks
⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [None]:
from utils import *
setup_chapter()

Using transformers v4.13.0
Using datasets v1.16.1


In [None]:
from datasets import list_datasets

# Fetch the names of the datasets on the Hugging Face Hub and preview the first ten.
all_datasets = list_datasets()
print(f"There are {len(all_datasets)} datasets currently available on the Hub")
print(f"The first 10 are: {all_datasets[:10]}")

There are 7162 datasets currently available on the Hub
The first 10 are: ['acronym_identification', 'ade_corpus_v2', 'adversarial_qa',
'aeslc', 'afrikaans_ner_corpus', 'ag_news', 'ai2_arc', 'air_dialogue',
'ajgt_twitter_ar', 'allegro_reviews']


In [None]:
from datasets import load_dataset

# Load the "emotion" dataset; the result is a DatasetDict with
# train / validation / test splits (displayed in the next cell).
emotions = load_dataset("emotion")

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/207k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
emotions['train'][:5]

{'label': [0, 0, 3, 2, 3],
 'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy']}

In [None]:
# Convert the Hugging Face dataset to a pandas DataFrame

import pandas as pd

# Switch the output format of the DatasetDict to pandas, so that slicing a
# split returns a DataFrame instead of a dict of lists.
emotions.set_format(type="pandas")
df = emotions["train"][:]  # the whole training split as a DataFrame
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [None]:
emotions["train"].features['label']

ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None)

In [None]:
emotions["train"].features['label'].int2str(2)

'love'

In [None]:
def label_int2str(row):
    """Map an integer class id to its emotion name (e.g. 2 -> 'love')."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)


# Add a human-readable label column next to the integer one.
df["label_name"] = df["label"].apply(label_int2str)
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [None]:
# Load the pretrained tokenizer for the DistilBERT checkpoint

from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"  # checkpoint name; reused later when loading the model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
encoded_text = tokenizer('I am a buoy not boy yes buoy')
print(encoded_text)

{'input_ids': [101, 1045, 2572, 1037, 20934, 6977, 2025, 2879, 2748, 20934,
6977, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'i', 'am', 'a', 'bu', '##oy', 'not', 'boy', 'yes', 'bu', '##oy',
'[SEP]']


In [None]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] i am a buoy not boy yes buoy [SEP]


In [None]:
tokenizer.vocab_size

30522

In [None]:
tokenizer.model_max_length

512

In [None]:
tokenizer.model_input_names #name of the fields model expects in forward pass

['input_ids', 'attention_mask']

In [None]:
def tokenize(batch):
    """Tokenize a list of raw strings with padding and truncation enabled.

    NOTE: a later cell redefines ``tokenize`` to take a batch dict with a
    "text" key instead of a plain list of strings.
    """
    encoded = tokenizer(batch, padding=True, truncation=True)
    return encoded

In [None]:
# emotions.reset_format() #from pandas dataframe to huggingface dataset again so tokenizer can process

In [None]:
# `emotions` is still in pandas format here, so slicing a split yields a DataFrame.
print(tokenize(emotions["train"][:2]["text"].tolist()))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000,
2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300,
102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]]}


In [None]:
emotions.reset_format() # switch from pandas output back to the default Hugging Face format so Dataset methods such as map() can be used

In [None]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
def tokenize(batch):
    """Tokenize the "text" column of a batch with padding and truncation enabled."""
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)


# batch_size=None hands each split to `tokenize` as a single batch, so all
# examples within a split are padded together.
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
emotions_encoded['train'].column_names

['attention_mask', 'input_ids', 'label', 'text']

In [None]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text'],
        num_rows: 2000
    })
})

In [None]:
from transformers import TFAutoModelForSequenceClassification

In [None]:
# The parentheses only let the call span two lines (this is not a generator
# expression): the pretrained weights are fetched and the model is built here.
# num_labels=6 matches the six emotion classes of the dataset.
tf_model = (TFAutoModelForSequenceClassification
            .from_pretrained(model_ckpt, num_labels=6))

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

In [None]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [None]:
# Training split: shuffled mini-batches of 32 with the integer label as target.
tf_train_dataset = emotions_encoded["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    shuffle=True,
    batch_size=32,
)

# Validation split: same columns and batch size, kept in its original order.
tf_eval_dataset = emotions_encoded["validation"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    shuffle=False,
    batch_size=32,
)

In [None]:
import tensorflow as tf

In [None]:
# Configure training: Adam with a small learning rate, sparse cross-entropy on
# integer labels, and accuracy as the reported metric.
tf_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # from_logits=True because the model emits raw logits (see predicted['logits'] further down)
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy())

# Fine-tune for two epochs, using the validation split for evaluation.
tf_model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9d5adf8510>

In [None]:
# Test split as a tf.data.Dataset in its original order; no label_cols is
# passed, so only the two model-input columns are requested.
tf_test_dataset = emotions_encoded["test"].to_tf_dataset(
    columns=['input_ids', 'attention_mask'], shuffle=False,
    batch_size=32)

In [None]:
predicted = tf_model.predict(tf_test_dataset)

In [None]:
predicted['logits']

array([[ 6.1649704 , -1.7635723 , -2.5624204 ,  0.03112658, -2.549718  ,
        -3.070501  ],
       [ 6.325335  , -1.4941944 , -2.469223  , -0.33986786, -2.5057664 ,
        -3.235884  ],
       [ 6.5368385 , -1.8643963 , -2.4068623 , -0.71278805, -2.066765  ,
        -3.0545325 ],
       ...,
       [-2.0613785 ,  6.7133255 , -1.7561104 , -2.7938452 , -2.6320322 ,
        -1.3573337 ],
       [-1.6875514 ,  6.0902395 , -2.1871967 , -2.6562068 , -1.3783616 ,
        -1.8302153 ],
       [-1.7100382 , -2.096121  , -2.061326  , -2.6629362 ,  3.3651175 ,
         3.1184134 ]], dtype=float32)

In [None]:
# Take the arg-max over the class axis in one vectorized NumPy call instead of
# dispatching a TensorFlow op per row; list(...) keeps the result a Python list
# of integer class ids, one per test example.
predicted_labels = list(predicted['logits'].argmax(axis=-1))

In [None]:
len(predicted_labels)

2000

In [None]:
# Count the test examples whose predicted label equals the reference label.
cnt = sum(
    1
    for predicted_label, true_label in zip(predicted_labels, emotions['test']['label'])
    if predicted_label == true_label
)
        
        


In [None]:
# Test accuracy: divide by the actual number of predictions rather than a
# hard-coded 2000, so the cell stays correct if the split size changes.
cnt / len(predicted_labels)

0.934

In [None]:
text_i = 'I am only a crack in this castle of glass'

In [None]:
t_t = tokenizer(text_i)

In [None]:
t_t_df = pd.DataFrame([t_t])

In [None]:
from datasets import Dataset

In [None]:
dataset_p = Dataset.from_pandas(t_t_df)

In [None]:
# Wrap the single tokenized example as a tf.data.Dataset with batch size 1 so
# it can be passed to tf_model.predict().
p_f = dataset_p.to_tf_dataset(
    columns=['input_ids', 'attention_mask'], shuffle=False,
    batch_size=1)

In [None]:
tf_model.predict(p_f)['logits'][0]

array([-0.34012836,  1.2261602 , -1.9837424 ,  1.637978  , -0.71501297,
       -2.646723  ], dtype=float32)

In [None]:
# Translate the highest-scoring class index back into its emotion name.
emotions["train"].features['label'].int2str(
    int(tf.math.argmax(tf_model.predict(p_f)['logits'][0]))
)

'anger'

In [None]:
emotions["train"].features['label']

ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None)

In [None]:
emotions["train"].features['label'].names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [None]:
# NOTE(review): `numeric_dict_ds` is not assigned in any cell visible in this
# notebook; it presumably came from a since-deleted cell, so this likely raises
# NameError on a fresh kernel. TODO: define it or remove this cell.
numeric_dict_ds

<TensorSliceDataset element_spec={'input_ids': TensorSpec(shape=(5,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(5,), dtype=tf.int32, name=None)}>

In [None]:
# Keras predict() cannot consume the un-batched, plain-Python-list encoding held
# in `t_t` (it raises ValueError), so tokenize the text as a batch of one with
# NumPy arrays and pass it as a plain dict of model inputs.
tf_model.predict(dict(tokenizer([text_i], return_tensors="np")), batch_size=2)

ValueError: ignored