## - Import

In [1]:
!pip install transformers



In [2]:
!pip install sentencepiece



In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification

2022-11-06 12:43:37.844342: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-06 12:43:37.947805: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-06 12:43:37.947827: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-06 12:43:37.966016: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022

## - Load Data

In [4]:
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

In [5]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


## - Concat Text

In [7]:
df_train['text_combo'] = df_train['location'].astype(str) + " : " + df_train['keyword'].astype(str) + " : " + df_train['text'].astype(str)
df_test['text_combo'] = df_test['location'].astype(str) + " : " + df_test['keyword'].astype(str) + " : " + df_test['text'].astype(str)

In [8]:
df_train

Unnamed: 0,id,keyword,location,text,target,text_combo
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,nan : nan : Our Deeds are the Reason of this #...
1,4,,,Forest fire near La Ronge Sask. Canada,1,nan : nan : Forest fire near La Ronge Sask. Ca...
2,5,,,All residents asked to 'shelter in place' are ...,1,nan : nan : All residents asked to 'shelter in...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"nan : nan : 13,000 people receive #wildfires e..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,nan : nan : Just got sent this photo from Ruby...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,nan : nan : Two giant cranes holding a bridge ...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,nan : nan : @aria_ahrary @TheTawniest The out ...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,nan : nan : M1.94 [01:04 UTC]?5km S of Volcano...
7611,10872,,,Police investigating after an e-bike collided ...,1,nan : nan : Police investigating after an e-bi...


In [9]:
df_test

Unnamed: 0,id,keyword,location,text,text_combo
0,0,,,Just happened a terrible car crash,nan : nan : Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",nan : nan : Heard about #earthquake is differe...
2,3,,,"there is a forest fire at spot pond, geese are...",nan : nan : there is a forest fire at spot pon...
3,9,,,Apocalypse lighting. #Spokane #wildfires,nan : nan : Apocalypse lighting. #Spokane #wil...
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,nan : nan : Typhoon Soudelor kills 28 in China...
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,nan : nan : EARTHQUAKE SAFETY LOS ANGELES ÛÒ ...
3259,10865,,,Storm in RI worse than last hurricane. My city...,nan : nan : Storm in RI worse than last hurric...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,nan : nan : Green Line derailment in Chicago h...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,nan : nan : MEG issues Hazardous Weather Outlo...


## - Split train & test

In [10]:
df_train_subset = df_train[["text_combo", "target"]].copy()
df_train_subset.rename(columns = {'text_combo':'text'}, inplace = True)
X_train, X_test = train_test_split(df_train_subset, test_size=0.05, random_state=0, stratify=df_train["target"])

In [11]:
X_train.shape

(7232, 2)

In [12]:
X_test.shape

(381, 2)

## - DistilBERT Tokenizer

In [13]:
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [15]:
train_encodings = tokenizer(list(X_train["text"]),
                            truncation=True, 
                            padding=True)

test_encodings = tokenizer(list(X_test["text"]),
                           truncation=True, 
                           padding=True)

## - DistilBERT Tokenization Examples

In [16]:
X_train["text"].iloc[188]

'Hartford  London Hong Kong : casualty : Conning Builds Strong Case for Portfolio #Diversification for Property-Casualty Insurers http://t.co/33FbR25t1O'

In [17]:
input_ids = train_encodings["input_ids"][188]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(f"Tokenized tokens: {tokens}")
print(f"Tokenized text: {tokenizer.convert_tokens_to_string(tokens)}")

Tokenized tokens: ['[CLS]', 'hartford', 'london', 'hong', 'kong', ':', 'casualty', ':', 'con', '##ning', 'builds', 'strong', 'case', 'for', 'portfolio', '#', 'divers', '##ification', 'for', 'property', '-', 'casualty', 'ins', '##urer', '##s', 'http', ':', '/', '/', 't', '.', 'co', '/', '33', '##fb', '##r', '##25', '##t', '##1', '##o', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Tokenized text: [CLS] hartford london hong kong : casualty : conning builds strong case for portfolio # diversification for property - casualty insurers http : / / t . 

## - Dataset with DistilBERT Tokenizer

In [18]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                    list(X_train["target"].values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                    list(X_test["target"].values)))

2022-11-06 12:43:45.541846: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-06 12:43:45.600768: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-06 12:43:45.600823: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-06 12:43:45.601433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This Tensor

## - Load Pretrained DistilBERT

In [19]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


## - Fine-Tuning DistilBERT

In [20]:
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 1
N_EPOCHS = 3

In [21]:
optimizer = tf.keras.optimizers.Adam(learning_rate=18e-6)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [22]:
model.compile(optimizer=optimizer,
              loss=losss,
              metrics=['accuracy'])

In [23]:
checkpoint_filepath = 'checkpoints'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True)

In [24]:
model.fit(train_dataset.shuffle(len(X_train)).batch(TRAIN_BATCH_SIZE),
          epochs=N_EPOCHS,
          batch_size=TRAIN_BATCH_SIZE,
          callbacks=[model_checkpoint_callback],
          validation_data=test_dataset.shuffle(len(X_train)).batch(TEST_BATCH_SIZE))

Epoch 1/3
Epoch 1: val_loss improved from inf to 0.36901, saving model to checkpoints
Epoch 2/3
Epoch 2: val_loss did not improve from 0.36901
Epoch 3/3
Epoch 3: val_loss did not improve from 0.36901


<keras.callbacks.History at 0x7ff9e56b0590>

In [25]:
model.load_weights(checkpoint_filepath)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ff96c360d90>

## - DistilBERT Test

In [26]:
def predict_proba(text_list, model, tokenizer):
    encodings = tokenizer(text_list, 
                          truncation=True, 
                          padding=True)

    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    preds = model.predict(dataset.batch(TEST_BATCH_SIZE)).logits  
    
    res = tf.nn.softmax(preds, axis=1).numpy()      
    
    return res

In [27]:
test_texts = list(df_test["text_combo"])

In [28]:
preds = predict_proba(test_texts, model, tokenizer)

