#  Twitter Airline RoBERTa
This notebook is based on the following notebook:<br/>
https://www.kaggle.com/code/junjitakeshima/ell-simple-roberta-starter-eng

# 1. Read Data

In [None]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
from transformers import RobertaTokenizer, TFRobertaModel
transformers.logging.set_verbosity_error()
import re
pd.set_option("display.max_columns", None)

In [None]:
# Load the raw tweets and preview the first three rows, transposed so that
# every column is visible as a row.
data0 = pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv')
display(data0.head(3).T)

In [None]:
# Keep only the label and the tweet text, dropping rows where either is missing,
# then show how many tweets fall into each sentiment class.
data = data0.loc[:, ['airline_sentiment', 'text']].dropna()
data['airline_sentiment'].value_counts()

In [None]:
# Encode the sentiment labels as integers, numbered in alphabetical label order.
Name0 = data['airline_sentiment'].unique().tolist()
Name = sorted(Name0)
N = list(range(len(Name)))

# label -> integer code, and the inverse lookup.
normal_mapping = {label: code for code, label in enumerate(Name)}
reverse_mapping = dict(enumerate(Name))
print(normal_mapping)

# Replace the string labels in place with their integer codes.
data['airline_sentiment'] = data['airline_sentiment'].map(normal_mapping)

In [None]:
# Build a shuffled list of row positions used to draw the train/test subsets.
# The shuffle uses its own seeded generator so that the split is identical on
# every "Restart & Run All" and the global `random` state is left untouched.
SEED = 42

n = len(data)
# NOTE: `N` is reused here; from this point on it holds shuffled row positions
# rather than the class indices it held in the label-encoding cell.
N = list(range(n))
random.Random(SEED).shuffle(N)

In [None]:
# Use the first three tenths of the shuffled positions for training and the
# next tenth for testing; the remaining rows are not used.
tenth = n // 10
train_positions = N[:tenth * 3]
test_positions = N[tenth * 3:tenth * 4]

train_df = data.iloc[train_positions].reset_index(drop=True)
test_df = data.iloc[test_positions].reset_index(drop=True)

In [None]:
# Replace every newline, carriage return and tab in the tweet text with a space.
whitespace_pattern = re.compile(r'[\n\r\t]')
for frame in (train_df, test_df):
    frame["text"] = frame["text"].replace(whitespace_pattern, ' ', regex=True)

# 2. Tokenize and create data

In [None]:
# Fixed sequence length (in tokens) used for padding/truncation and for the
# model's input layers.
max_len = 128

# Tokenizer loaded from the local copy of the roberta-base files.
tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/")

In [None]:
def create_data(text):
    """Tokenize a batch of texts into fixed-length id and mask arrays.

    Uses the module-level ``tokenizer`` and ``max_len``: every text is padded
    or truncated to ``max_len`` tokens, with special tokens added.

    Args:
        text: Collection of strings, passed unchanged to
            ``tokenizer.batch_encode_plus``.

    Returns:
        dict with two int32 NumPy arrays built from the tokenizer output:
        ``"input_ids"`` and ``"attention_masks"``.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    batch = tokenizer.batch_encode_plus(text, **tokenizer_options)

    def as_int32(key):
        # Convert one field of the tokenizer output to an int32 array.
        return np.array(batch[key], dtype="int32")

    # Note the key rename: the tokenizer's "attention_mask" is exposed as
    # "attention_masks" to callers.
    return {"input_ids": as_int32("input_ids"),
            "attention_masks": as_int32("attention_mask")}

In [None]:
# Tokenize the training tweets into the id/mask arrays fed to the model.
train_data   = create_data(train_df['text'])

In [None]:
# One list of labels per target column (only the sentiment column here);
# indexed by position in the training loop.
train = [train_df["airline_sentiment"].to_list()]


# 3. Build Model

In [None]:
def build_model():
    """Build and compile the RoBERTa sentiment classifier.

    Architecture: two int32 inputs of length ``max_len`` (token ids and
    attention mask) -> pretrained RoBERTa -> average of the last hidden state
    over the sequence axis -> ``Dense(len(Name))`` with no activation, i.e.
    one raw logit per sentiment class.

    The model is trained as a classifier: the labels are integer class ids,
    so the loss is sparse categorical cross-entropy computed from logits.
    (Regressing every output unit onto the class id with MSE would make the
    ``argmax`` taken at prediction time meaningless.)

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        and producing logits of shape ``(batch, len(Name))``.
    """
    model_ids  = Input(shape=(max_len, ), dtype = tf.int32)
    model_mask = Input(shape=(max_len, ), dtype = tf.int32)

    roberta_model = TFRobertaModel.from_pretrained("../input/roberta-base/")

    x = roberta_model(input_ids = model_ids,
                      attention_mask = model_mask)
    # Collapse the per-token hidden states into one vector per tweet.
    x = tf.keras.layers.GlobalAveragePooling1D()(x.last_hidden_state)
    # One logit per class; softmax is folded into the loss (from_logits=True).
    outputs = Dense(len(Name))(x)

    model = tf.keras.Model(inputs = [model_ids, model_mask], outputs = outputs)

    model.compile(
        # The learning rate is expected to be overridden per epoch by a
        # LearningRateScheduler callback passed to fit().
        optimizer = tf.keras.optimizers.Adam(),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        # "mse" is kept only so that callbacks monitoring `val_mse` still find
        # the metric; "accuracy" is the meaningful figure for this task.
        metrics=["accuracy", "mse"])
    return model

In [None]:
def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    Epoch 0 is a warm-up at 5% of the base rate (2e-5); every later epoch
    uses the base rate decayed by a factor of 0.9 per epoch.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    base_rate = 2e-5
    is_warmup = epoch == 0
    return base_rate * 0.05 if is_warmup else base_rate * (0.9**epoch)
    
# Keras callback that sets the optimizer's learning rate from `scheduler`
# during training.
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
def get_model(train_col) :
    """Build a fresh model and fit it on the tokenized training data.

    Features come from the module-level ``train_data`` dict; the last 20% of
    the rows are held out by Keras as validation data.

    Early stopping and checkpointing follow ``val_loss``: it is always logged
    when validation data is present and tracks the quantity actually being
    optimized, so model selection does not depend on which extra metrics the
    model happened to be compiled with.

    Args:
        train_col: Label values, one per row of ``train_data``; flattened to
            a 1-D array before fitting.

    Returns:
        The fitted model, with the best validation-loss weights restored when
        early stopping triggers.
    """
    model = build_model()

    features = (np.array(train_data['input_ids']),
                np.array(train_data['attention_masks']))
    labels = np.array(train_col).ravel()

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        ModelCheckpoint('roberta_uspppm.h5', monitor='val_loss',
                        save_best_only=True, save_weights_only=True),
        callback_lr,
    ]

    model.fit(features,
              labels,
              epochs = 10,
              shuffle=True,
              callbacks = callbacks,
              batch_size = 16,
              validation_split=0.2 )

    return model

# 4. Get Model


In [None]:
%%time

# Train one model per target column; `models` is keyed by the column's
# position in `target_cols`, matching the layout of `train`.
target_cols = ["airline_sentiment"]
models = dict()

for i in range(len(target_cols)):
    col = target_cols[i]
    banner = f"-------------- Model for {col} ---------------"
    print(banner)
    model = get_model(train[i])
    models.update({i: model})

# 5. Prediction and Evaluation

In [None]:
# Tokenize the held-out tweets with the same settings as the training data.
test_data = create_data(test_df['text'])

In [None]:
# Predict a class index for every held-out tweet with each trained model.
# The model inputs do not change between iterations, so build them once.
test_inputs = (np.array(test_data['input_ids']),
               np.array(test_data['attention_masks']))

preds = []
# Iterate over however many models were trained instead of a hard-coded count,
# so this cell stays in sync with the `models` dict (keyed 0..len-1).
for i in range(len(models)) :
    pred = models[i].predict(test_inputs)
    # The predicted class is the output unit with the highest score.
    preds.append(np.argmax(pred,axis=1))

# Ground-truth integer labels for the same rows.
trues = test_df["airline_sentiment"]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the first model's predictions on the
# held-out split, labelled with the original sentiment names.
report = classification_report(trues, preds[0], target_names=Name, digits=4)
print(report)