RoBERTa model

**IMPORTS**

---



In [1]:
#scripts
!pip install emoji
!pip install transformers
!pip install gensim



In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import emoji as emo
import sys
import re
from transformers import BertTokenizer , BertConfig
from sklearn.preprocessing import MultiLabelBinarizer #for binary encoding of labels
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow_hub as hub
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


**DATASET AND LABEL ONE-HOT ENCODING**

---



In [3]:
path_train = '/kaggle/input/vaccine/val_train.csv'
path_test = '/kaggle/input/testdata/test.csv'
ds = pd.read_csv(path_train)
tds = pd.read_csv(path_test)

#converting label strings into set
lst = ds['labels'].to_list()
labels = [[label] for label in lst]
def sep(target):
    return target[0].split()

seplabels = [sep(label) for label in labels]
#one-hot encoding
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(seplabels)
label_array = np.array(labels)
tds.head()

Unnamed: 0,id,tweet
0,1070378532260470789t,Study Links HPV Vaccine to Historically High I...
1,973746711964372993t,Deaths from tainted measles vaccine affecting ...
2,1043031076787040257t,"Am apreciat un videoclip pe @YouTube, https://..."
3,1066338147527741440t,VIDEO --&gt;&gt; MMR #Vaccine increase risk of...
4,963522018544152576t,Oral Polio Vaccine: Infecting Unvaccinated Kid...


## **PRE PROCESSING**

---




In [4]:
def proc(tweet):
    tweet = tweet.lower()
    tweet = emo.demojize(tweet) #emoji to string
    tweet = re.sub(r"http[s]?://t.co/[a-zA-Z0-9]+" , "" , tweet)  #https twitter link removal
    tweet = re.sub(r"[!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~]" , "" , tweet) #unwanted expressions
    return(tweet)

tweets = ds['tweet'].apply(proc).tolist() #training data list
tests = tds['tweet'].apply(proc).tolist() #test data list






In [5]:
num_classes = len(mlb.classes_)
num_classes

12

In [6]:

tok = BertTokenizer.from_pretrained('bert-base-uncased')
tweet_encoded_training = tok(tweets , padding='max_length' , truncation=True , max_length = 150 , return_tensors='tf')
tweet_encoded_test = tok(tests , padding='max_length', truncation = True , max_length = 150 , return_tensors='tf')

x_train = {
    'input_word_ids':tweet_encoded_training['input_ids'],
    'input_mask': tweet_encoded_training['attention_mask'],
    'input_type_ids': tweet_encoded_training['token_type_ids']
}

x_test = {
    'input_word_ids': tweet_encoded_test['input_ids'],
    'input_mask': tweet_encoded_test['attention_mask'],
    'input_type_ids': tweet_encoded_test['token_type_ids']
}



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

**TRAIN_TEST_SPLIT AND CONVERTING TO DICTIONARY WITH INPUTS**

---



In [7]:
data_list = [({'input_word_ids': x_train['input_word_ids'][i],
               'input_mask': x_train['input_mask'][i],
               'segment_ids': x_train['input_type_ids'][i]}, label) for i, label in enumerate(label_array)]

train_list, val_list = train_test_split(data_list, test_size=0.2, random_state=42)

train_x = {'input_word_ids': np.array([elem[0]['input_word_ids'] for elem in train_list]),
           'input_mask': np.array([elem[0]['input_mask'] for elem in train_list]),
           'input_type_ids': np.array([elem[0]['segment_ids'] for elem in train_list])}

val_x = {'input_word_ids': np.array([elem[0]['input_word_ids'] for elem in val_list]),
         'input_mask': np.array([elem[0]['input_mask'] for elem in val_list]),
         'input_type_ids': np.array([elem[0]['segment_ids'] for elem in val_list])}

train_labels = np.array([elem[1] for elem in train_list])
val_labels = np.array([elem[1] for elem in val_list])
val_labels.shape

(1985, 12)

In [8]:
tf.keras.backend.clear_session()

**MODEL**

---



In [9]:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
max_len = 150

In [10]:
tf.keras.backend.clear_session()

In [11]:
def build_model(num_classes):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output , sequence_output = bert_layer([input_word_ids , input_mask , segment_ids])
    dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
    drop = tf.keras.layers.Dropout(0.1)(dense)
    dense2 = tf.keras.layers.Dense(64 , activation='relu')(drop)
    drop2 = tf.keras.layers.Dropout(0.1)(dense2)
    output = tf.keras.layers.Dense(num_classes , activation = 'sigmoid' , name = 'output')(drop2)

    model = tf.keras.Model(inputs = {
        'input_word_ids' : input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids
    }, outputs = output)

    return model

num_classes = 12
model = build_model(num_classes)
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-6),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = 'accuracy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 150)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 150)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_ids[0][0]',              
                                 (None, 150, 768)]                'attention_mask[0][0]',     

In [12]:
tf.keras.backend.clear_session()

**TRAINING**

---



In [13]:
model.fit(train_x , train_labels , validation_data=(val_x , val_labels) , epochs=3, batch_size=16)
#clear GPU memory
tf.keras.backend.clear_session()

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
output = model.predict(x_test)




output/metrics

In [15]:
pred = (output > 0.25).astype(int)

all_labels = mlb.classes_
pred_list = mlb.inverse_transform(pred)
ids = tds['id']
#for choosing most likely labels(countering empty tuples)
for i , tup in enumerate(pred_list):
    if not tup: #if the tuple is empty
      #analysing probs
      prob = output[i]
      top_index = prob.argsort()[-3:] #top 3 labels
      final_label = all_labels[top_index]
      pred_list[i] = tuple(final_label)



**MAKING CSV**

---



In [16]:
final_dict = {'id':[] , 'preds':[]}

for i in range(len(ids)):
    final_dict['id'].append(ids[i])
    tuples = pred_list[i]
    final_dict['preds'].append(' '.join(tuples))


print(final_dict)

final = pd.DataFrame(final_dict)
final.to_csv('output_11823_1e6_30.csv' , index=False)


{'id': ['1070378532260470789t', '973746711964372993t', '1043031076787040257t', '1066338147527741440t', '963522018544152576t', '1027180081997930501t', '1015587373537562625t', '1010148037828726784t', '1024938906750537728t', '1087794890942099457t', '1096050258277224449t', '1109166251765571589t', '1164355472469704704t', '1049860045423435776t', '973082796859232256t', '1125356654483009537t', '1026937551309942784t', '1123909055599214598t', '1071079276433289216t', '999794111996727297t', '948219542442008576t', '1057586435564417025t', '974170030106492928t', '951940073934770176t', '1109958380301500417t', '1137165623514075138t', '1148976649624535042t', '1030265559395573761t', '1164194599293710338t', '1125215271373942784t', '1054786806183485440t', '967234030281216000t', '1160964931128369153t', '1088393073023950848t', '1014717937825898496t', '1142464446809272320t', '1061268802820849667t', '1170970178298867713t', '1018015755751903232t', '1042020166157185024t', '1128845248921264128t', '991498571718459