# BERT - Climate Sentiment Multiclass Classification
## CS522 Project

**Dataset:**  
https://www.kaggle.com/code/luiskalckstein/climate-sentiment-multiclass-classification

**Imports**

In [5]:
# ! pip install tensorflow-addons
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
from transformers import logging as hf_logging
from Common.preprocessor import one_hot_encoding
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt
from Common.UtilFuncs import DataSize
from Common.DataCenter import data_center
from Common.UtilFuncs import print_evaluation, print_distribution
from Common.UtilFuncs import Evaluator, Lab
from Common.BERTModel import BERTModel
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime
    
hf_logging.set_verbosity_error()
warnings.filterwarnings('ignore')
# Experiment sizes are provided by the project's DataSize helper.
TrainSizeBaseLine = DataSize.GetTrainSizeBaseline()
TrainSizeWithNoisyData = DataSize.GetTrainSizeWithNoisyData()
TestDataSize = DataSize.GetTestDataSize()
NoiseDataSize = DataSize.GetNoiseDataSize()
ValidationDataSize = DataSize.GetValidationDataSize()

# Render matplotlib figures inline in the notebook.
%matplotlib inline

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 0 ns (started: 2022-04-21 09:46:36 +08:00)


**Detect GPU**

In [3]:
# Turn on memory growth for every visible GPU and report what TensorFlow sees.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('GPUs not detected')
else:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        print('Set memory autoincrement')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print('Physical GPUs: %d, Logical GPUs: %d' % (len(gpus), len(logical_gpus)))
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)

Set memory autoincrement
Physical GPUs: 1, Logical GPUs: 1
time: 0 ns (started: 2022-04-21 09:23:42 +08:00)


## 1. Loading Dataset

In [6]:
# Noise configuration: source name -> (sample count, label distribution)
noisy_set_sizes = dict(
    mislabeled=(8600, None),                      # max size: 15000
    irrelevant=(8600, [0.25, 0.25, 0.25, 0.25]),  # max size: 34259
    translated=(5000, "reserve_labels"),          # max size: 5000
)
lab = Lab(
    "twitter_sentiment_data_clean.csv",
    noisy_sources=noisy_set_sizes,
    total_train_size=20000,
    total_test_size=4000,
)

time: 562 ms (started: 2022-04-21 09:46:39 +08:00)


In [7]:
# Print the summary of the lab's data center (set sizes and distributions).
lab.dc.print_summary()


###################################### Data Summary #############################################
  Original set size: 40908
      sentiments ('Anti', 'Neutral', 'Pro', 'News'): 9.4%, 18.3%, 50.2%, 22.1%
  Training set size: 20000
  Test set size: 4000
  Noisy set size: 22200
  Validation set size: 1000
      noise sources ('mislabeled', 'irrelevant', 'translated'): 38.7%, 38.7%, 22.5%
##################################################################################################
time: 0 ns (started: 2022-04-21 09:46:40 +08:00)


Observe the data.

In [9]:
# Build a training frame that includes noisy samples and show its first 15 rows.
# NOTE(review): the arguments are presumably (total size, noisy size) — confirm against data_center.
train_df = lab.dc.get_train_with_noisy_df(15000,5000)
data_center.print_data(train_df.head(15))


Unnamed: 0,noise,noise_text,sentiment,origin(sentiment),tweetid...,message...
0,1,mislabeled,3,2,8108306943,Regional/Global seabird stress
1,0,none,2,-,9536174384,I have to write an essay over
2,1,mislabeled,1,3,8645926535,Barack Obama warns climate cha
3,0,none,2,-,8199736905,RT @mitskileaks: want to speci
4,0,none,2,-,8438476460,.@RepBrianFitz Thank you for a
5,0,none,2,-,8182648187,RT @billmckibben: Reading clim
6,0,none,2,-,9556340417,RT @LanreShaper: 'Africa contr
7,1,mislabeled,3,1,9587601354,Keilmuan itu politik. Hawong N
8,0,none,2,-,8401717188,RT @WRIClimate: @CNBC He shoul
9,0,none,0,-,8254569694,@magslol global warming is a C


time: 47 ms (started: 2022-04-21 09:47:47 +08:00)


In [None]:
def do_experiment(train_df, test_df):
    """Run one experiment without denoising and return its evaluation info.

    Depends on the notebook-level ``lab`` object (validation split) and the
    ``tokenize`` helper, both of which must be defined before this is called.

    Parameters:
        train_df: training DataFrame accepted by ``data_center.Xy``.
        test_df: test DataFrame accepted by ``data_center.Xy``.

    Returns:
        Whatever ``Evaluator.do_evaluate`` produces for the test predictions.
    """
    # Separate texts from labels for both frames.
    X_train, y_train = data_center.Xy(train_df)
    X_test, y_test = data_center.Xy(test_df)

    # The validation texts and labels come from the lab's data center.
    validation = lab.dc.get_validation()
    X_val, y_val = validation[0], validation[1]

    # Tokenize every split (train, test, then validation).
    X_train_token = tokenize(X_train)
    X_test_token = tokenize(X_test)
    X_val_token = tokenize(X_val)

    # One-hot encode the labels.
    y_train_vec, y_test_vec = one_hot_encoding(y_train, y_test)
    y_val_vec = one_hot_encoding(y_val)

    # Train on the training split, validate during training, predict on test.
    classifier = BERTModel()
    classifier.Init()
    classifier.Train(X_train_token, y_train_vec, X_val_token, y_val_vec)
    y_pred = classifier.Predict(X_test_token)

    # Print the evaluation, then hand back the evaluation info.
    print_evaluation(y_test_vec, y_pred, labels=[0,1,2,3])
    return Evaluator.do_evaluate(y_test_vec, y_pred)


In [3]:
def dc_format(D):
    """Wrap a (texts, labels) pair in a DataFrame.

    Parameters:
        D: indexable whose item 0 holds the messages and item 1 the sentiments.

    Returns:
        DataFrame with the columns 'message' and 'sentiment'.
    """
    return pd.DataFrame({'message': D[0], 'sentiment': D[1]})

time: 217 µs (started: 2022-04-20 22:57:55 +08:00)


In [4]:
# Number of samples to take from each noise source
noisy_set_sizes = dict(
    mislabeled=5000,   # max size: 15000
    irrelevant=5000,   # max size: 34259
    translated=5000,   # max size: 5000
)

# Load the database and split it into training, test, noisy and validation sets
dc = data_center(
    'twitter_sentiment_data_clean.csv',
    test_size=4000,
    validation_size=1000,
    noisy_size=noisy_set_sizes['mislabeled'],
)

test_df = dc_format(dc.get_test())
val_df = dc_format(dc.get_validation())

print(f"Test size: {test_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")

Test size: 4000
Validation size: 1000
time: 107 ms (started: 2022-04-20 22:57:55 +08:00)


### Prepare the noisy set.

In [5]:
# Track (source name, sample count) for every noise source in the noisy set.
# The noisy set initially holds only the 'mislabeled' samples requested at load time.
lstNoisyInfo = [("mislabeled",dc.get_noisy_len())]
print("Noisy set size is %d"                % dc.get_noisy_len())

# Add the external noisy data (irrelevant texts).
# Label distribution for the irrelevant noise; None would use the distribution of the original set.
irrelevant_noisy_distribution = [0.25, 0.25, 0.25, 0.25]    # None, if use the distribution of original set
added_size = dc.add_noisy(noisy_source="irrelevant", distribution = irrelevant_noisy_distribution,
                          size = noisy_set_sizes['irrelevant'])
print("%d noisy samples added" % added_size)
lstNoisyInfo.append(("irrelevant",added_size))

# Add the external noisy data (translated texts), keeping the labels of each noisy sample.
added_size = dc.add_noisy(noisy_source="translated", distribution = "reserve_labels", 
                          size = noisy_set_sizes['translated'])
print("%d noisy samples added" % added_size)
lstNoisyInfo.append(("translated",added_size))

print("Noisy set new size is %d"                % dc.get_noisy_len())

Noisy set size is 5000
5000 noisy samples added
5000 noisy samples added
Noisy set new size is 15000
time: 306 ms (started: 2022-04-20 22:57:55 +08:00)


In [6]:
# Display the baseline (noise-free) training set sizes.
TrainSizeBaseLine

[2000, 4000, 5000, 8000, 10000, 15000, 20000]

time: 1.54 ms (started: 2022-04-20 22:57:56 +08:00)


In [7]:
# Display the size pairs used for the training sets that include noisy data.
TrainSizeWithNoisyData

[(4000, 1000), (8000, 2000), (15000, 5000)]

time: 735 µs (started: 2022-04-20 22:57:56 +08:00)


In [8]:
# Distribution argument forwarded to dc.get_train / dc.get_train_with_noisy below.
# NOTE(review): None presumably means "use the original set's distribution" — confirm in data_center.
train_distribution = None

time: 115 µs (started: 2022-04-20 22:57:56 +08:00)


In [9]:
# Baseline training frames, one per entry of TrainSizeBaseLine
# (2000, 4000, 5000, 8000, 10000, 15000 and 20000 samples).
(train_df_2000, train_df_4000, train_df_5000, train_df_8000,
 train_df_10000, train_df_15000, train_df_20000) = [
    dc_format(dc.get_train(TrainSizeBaseLine[idx], train_distribution))
    for idx in range(7)
]

time: 51.1 ms (started: 2022-04-20 22:57:56 +08:00)


In [10]:
# Training frames with noisy data, one per pair in TrainSizeWithNoisyData:
# (4000, 1000), (8000, 2000) and (15000, 5000).
train_df_4000_1000, train_df_8000_2000, train_df_15000_5000 = [
    dc_format(
        dc.get_train_with_noisy(
            TrainSizeWithNoisyData[idx][0],
            TrainSizeWithNoisyData[idx][1],
            train_distribution,
        )
    )
    for idx in range(3)
]

time: 55.2 ms (started: 2022-04-20 22:57:56 +08:00)


### <font color='red'> Specified training set </font>

In [11]:
# Select the training set for this run. Work on a copy so that columns added
# later (e.g. 'sparse_label') do not mutate the prepared train_df_15000_5000 frame.
train_df = train_df_15000_5000.copy()

time: 159 µs (started: 2022-04-20 22:57:56 +08:00)


In [12]:
# Preview the first rows of the selected training set.
train_df.head()

Unnamed: 0,message,sentiment
0,RT @theblaze: ‘Bombshell’ climate-change study...,3
1,I have to write an essay over the psychologica...,2
2,@NASA @MatthewACherry Can we make it to one of...,1
3,RT @mitskileaks: want to specify that $ is don...,2
4,.@RepBrianFitz Thank you for acknowleding man'...,2


time: 3.48 ms (started: 2022-04-20 22:57:56 +08:00)


#### Label Encoding

In [2]:
# Fit the encoder on the training labels only, then apply the same mapping to
# the validation and test labels. Each frame gains a 'sparse_label' column.
le = LabelEncoder()
train_df['sparse_label'] = le.fit_transform(train_df['sentiment'])
val_df['sparse_label'] = le.transform(val_df['sentiment'])
test_df['sparse_label'] = le.transform(test_df['sentiment'])

NameError: name 'train_df' is not defined

time: 422 ms (started: 2022-04-21 09:30:44 +08:00)


In [14]:
# Map every encoded label (position after sorting by 'sparse_label') back to
# its original sentiment value.
label_pairs = train_df[['sentiment', 'sparse_label']].drop_duplicates()
label_pairs = label_pairs.sort_values(by='sparse_label').reset_index(drop=True)
label_dict = label_pairs['sentiment'].to_dict()

for encoded, sentiment in label_dict.items():
    print(encoded, sentiment)

0 0
1 1
2 2
3 3
time: 2.78 ms (started: 2022-04-20 22:57:56 +08:00)


#### Splitting dataset

In [15]:
# Texts and encoded labels for the training and validation splits
x_train, y_train = train_df['message'], train_df['sparse_label']
x_val, y_val = val_df['message'], val_df['sparse_label']

print(f"{len(x_train)} training samples\n{len(x_val)} validation samples")

20000 training samples
1000 validation samples
time: 428 µs (started: 2022-04-20 22:57:56 +08:00)


In [16]:
# Texts and encoded labels for the test split
x_test, y_test = test_df['message'], test_df['sparse_label']
print(f"{len(x_test)} testing samples")

4000 testing samples
time: 362 µs (started: 2022-04-20 22:57:56 +08:00)


## 2. Train BERT

In [17]:
# Pretrained checkpoint used for the tokenizer, the config and the encoder.
MODEL_NAME = 'distilbert-base-uncased'

time: 605 µs (started: 2022-04-20 22:57:56 +08:00)


In [18]:
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

time: 7.68 s (started: 2022-04-20 22:57:56 +08:00)


In [19]:
# Find the longest tokenized message across the training and validation sets.
max_len = 0
skipped = 0
for sentence in (x_train.tolist() + x_val.tolist()):
    try:
        sentence_token_len = len(tokenizer.tokenize(sentence))
    except (TypeError, ValueError, AttributeError):
        # Entries the tokenizer cannot handle (presumably non-string values)
        # are counted and reported instead of being dropped silently.
        skipped += 1
        continue
    max_len = max(max_len, sentence_token_len)

if skipped:
    print(f"Skipped {skipped} entries that could not be tokenized")
print(f"The maximum amount of tokens in the dataset is {max_len}")

The maximum amount of tokens in the dataset is 6282
time: 5.88 s (started: 2022-04-20 22:58:03 +08:00)


In [20]:
# Fixed sequence length: tokenize() pads/truncates to it and the model's Input layers use it as their shape.
MAX_LEN = 360

# Reload the tokenizer for the same checkpoint.
# NOTE(review): encoding options are given again per call in tokenize(); confirm whether
# passing them at construction time has any effect.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME,  
                                                add_special_tokens=True,
                                                max_length=MAX_LEN, 
                                                pad_to_max_length=True)

def tokenize(sentences):
    """Encode sentences into fixed-length inputs for the DistilBERT model.

    Every sentence is encoded with special tokens and then truncated or padded
    to exactly ``MAX_LEN`` tokens.

    Args:
        sentences: iterable of text strings.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of int32 NumPy arrays with one
        row per sentence.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        # Token type ids are not requested because they were never returned.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LEN,
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

time: 8.15 s (started: 2022-04-20 22:58:09 +08:00)


In [21]:
# Tokenize each split; every result is an (input_ids, attention_masks) pair of arrays.
X_train = tokenize(x_train)
X_val = tokenize(x_val)
X_test = tokenize(x_test)

100%|███████████████████████████████████| 20000/20000 [00:07<00:00, 2814.03it/s]
100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 3447.51it/s]
100%|█████████████████████████████████████| 4000/4000 [00:01<00:00, 3504.79it/s]

time: 9.07 s (started: 2022-04-20 22:58:17 +08:00)





#### Add custom layers after embedding model for classification

In [22]:
# Load the pretrained DistilBERT encoder; the config also asks for hidden states and attentions in its output.
bert_config = DistilBertConfig.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)
TFBert = TFDistilBertModel.from_pretrained(MODEL_NAME, config=bert_config)

# Two integer inputs of length MAX_LEN: token ids and the attention mask.
input_ids_layer = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_token', dtype='int32')
input_masks_layer = tf.keras.layers.Input(shape=(MAX_LEN,), name='masked_token', dtype='int32') 

# Element [0] of the encoder output is the last hidden state (one vector per token).
X = TFBert(input_ids = input_ids_layer, attention_mask = input_masks_layer)[0]
# X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))(X)
# X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(X)
# Classification head: dropout, a per-token dense layer with mish activation,
# flatten across tokens, then a 4-way softmax (one output per sentiment class).
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1024, activation=tfa.activations.mish)(X)
X = tf.keras.layers.Flatten()(X)
X = tf.keras.layers.Dense(4, activation=tf.nn.softmax)(X)

model = tf.keras.Model(inputs=[input_ids_layer, input_masks_layer], outputs = X)

# Mark the first three layers as trainable so the encoder is fine-tuned along with the head.
# NOTE(review): layers are presumably trainable already, making this a no-op — confirm.
for layer in model.layers[:3]:
    layer.trainable = True

2022-04-20 22:58:29.397312: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
time: 5.06 s (started: 2022-04-20 22:58:26 +08:00)


In [23]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 360)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 360)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_token[0][0]',            
 BertModel)                     ast_hidden_state=(N               'masked_token[0][0]']           
                                one, 360, 768),                                                   
                                 hidden_states=((No                                           

#### Model callbacks

In [24]:
# Directory for the model checkpoints. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate exists() test.
ckpt_dir = './ckpt'
os.makedirs(ckpt_dir, exist_ok=True)

time: 230 µs (started: 2022-04-20 22:58:32 +08:00)


In [25]:
# Save only the weights, and only when validation accuracy improves.
model_checkpoint = ModelCheckpoint(filepath=ckpt_dir + '/weights_val_best.hdf5',
                                   monitor='val_accuracy',
                                   save_weights_only=True,
                                   save_best_only=True,
                                   verbose=0)

# Stop after 3 epochs without a validation-accuracy improvement. The best weights
# are not restored here; they are loaded from the checkpoint file after training.
early_stopping = EarlyStopping(patience=3,
                               monitor='val_accuracy',
                               min_delta=0,
                               mode='max',
                               restore_best_weights=False,
                               verbose=1)

# Multiply the learning rate by 0.1 (down to 1e-6) when validation loss fails to improve for one epoch.
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              min_lr=0.000001,
                              patience=1,
                              mode='min',
                              factor=0.1,
                              min_delta=0.0001,
                              verbose=1)

time: 520 µs (started: 2022-04-20 22:58:32 +08:00)


In [26]:
# The sparse loss matches the integer-encoded labels ('sparse_label') used as targets.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tfa.optimizers.RectifiedAdam(0.0001),
              metrics=['accuracy'])

# Train for up to 50 epochs; the callbacks checkpoint the best weights,
# stop early and lower the learning rate on plateaus.
history = model.fit(X_train,
                    y_train,
                    epochs=50,
                    batch_size=8,
                    validation_data=(X_val, y_val),
                    callbacks=[model_checkpoint, early_stopping, reduce_lr])

Epoch 1/50


2022-04-20 22:58:32.130367: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




2022-04-20 22:58:34.198153: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


 130/2500 [>.............................] - ETA: 50:28 - loss: 1.2277 - accuracy: 0.4846

KeyboardInterrupt: 

time: 2min 55s (started: 2022-04-20 22:58:32 +08:00)


In [None]:
def plot_graphs(history, metric, title=''):
    """Plot a metric per epoch, with its validation counterpart when recorded.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (e.g. the return value of ``model.fit``).
        metric: key in ``history.history``, such as 'accuracy' or 'loss'.
        title: title shown above the figure.
    """
    plt.figure(figsize=(8, 6))
    plt.plot(history.history[metric], label='Training')
    # Training runs with validation data, so show the validation curve as well
    # whenever it was recorded under the 'val_' prefix.
    val_metric = 'val_' + metric
    if val_metric in history.history:
        plt.plot(history.history[val_metric], label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# Accuracy per epoch.
plot_graphs(history, 'accuracy', 'Accuracy')

In [None]:
# Loss per epoch.
plot_graphs(history, 'loss', 'Loss')

#### Load the best model and evaluate it on the test set

In [None]:
# Restore the weights saved by the ModelCheckpoint callback (best validation accuracy).
model.load_weights(ckpt_dir + '/weights_val_best.hdf5')

In [None]:
# Predict class probabilities for the test set and take the most probable class per sample.
pred_probs = model.predict(X_test)
y_pred = np.argmax(pred_probs, axis=1)

#### Visualizing Confusion Matrix using Heatmap

In [None]:
# Confusion matrix of true versus predicted test labels; displayed as the cell output.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
labels_names = ['Anti', 'Neutral', 'Pro', 'News']
class_names = [0, 1, 2, 3]

# Apply the seaborn theme before creating the figure so it affects this plot.
sns.set(font_scale=1.2, color_codes=True, palette='deep')
fig, ax = plt.subplots()

# Label rows and columns with the sentiment names and let seaborn place the
# tick labels at the cell centres. Setting ticks at 0..3 by hand would put
# them on the cell edges.
sns.heatmap(pd.DataFrame(cm, index=labels_names, columns=labels_names),
            annot=True, annot_kws={'size': 16}, cmap='YlGnBu', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.tick_params(axis='both', labelrotation=45)
ax.set_title('Confusion matrix', y=1.2)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

print('Test Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision, recall and F1 on the test set, labelled with the sentiment names.
print(classification_report(y_test, y_pred, target_names=labels_names))

In [None]:
# mf = f1_score(y_test, y_pred, average='macro')
# wf = f1_score(y_test, y_pred, average='weighted')
# mp = precision_score(y_test, y_pred, average='macro')
# mr = recall_score(y_test, y_pred, average='macro')

In [None]:
# print(
#     f'Macro F1: {mf: .3f} \
#     | Weighted F1: {wf: .3f} \
#     | Macro Precision: {mp: .3f} \
#     | Macro recall: {mr: .3f}'
# )

In [None]:
# Print the project's standard evaluation for the test predictions.
print_evaluation(y_test,y_pred)