In [1]:
import os 
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/prometeo23-kaggle/train_absa.csv
/kaggle/input/prometeo23-kaggle/sample.csv
/kaggle/input/prometeo23-kaggle/test_absa.csv


In [2]:
import sys
!{sys.executable} -m pip install tensorflow-addons
import tensorflow_addons as tfa

[0m

In [3]:
#general purpose packages
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras

#set seed for reproducibility
seed=42

In [4]:
df = pd.read_csv('/kaggle/input/prometeo23-kaggle/train_absa.csv')
df_test = pd.read_csv('/kaggle/input/prometeo23-kaggle/test_absa.csv')

In [5]:
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [6]:
X=df['text'].values
y=df['label'].values

In [7]:
x=df_test['text'].values

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

In [9]:
y_train_le = y_train.copy()
y_valid_le = y_valid.copy()

In [10]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()

In [11]:
token_lens = []
for txt in X_train:
    tokens = tokenizer_roberta.encode(txt, max_length=333, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)

In [12]:
MAX_LEN=333

In [14]:
MAX_LEN

333

In [15]:
def tokenize_roberta(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [16]:
train_input_ids, train_attention_masks = tokenize_roberta(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize_roberta(df_test['text'].values, MAX_LEN)

In [17]:
test_input_ids, test_attention_masks = tokenize_roberta(x, MAX_LEN)

In [18]:
metric1=tfa.metrics.F1Score(num_classes=3,threshold=0.5)
metric2=tfa.metrics.FBetaScore(num_classes=3,threshold=0.5,beta=2.0)

In [19]:
def create_model(bert_model, max_len=MAX_LEN):
    
    opt = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5, decay=0.07)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = bert_model([input_ids,attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics = [metric1,metric2])
    return model

In [20]:
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Downloading tf_model.h5:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [21]:
model = create_model(roberta_model, MAX_LEN)
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 333)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 333)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_1[0][0]',                
 el)                            thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 333,                                           

In [22]:
history_2 = model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=3, batch_size=4)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
model.save('roberta.h5')

In [24]:
results=list()
sentiment = model.predict([test_input_ids,test_attention_masks],batch_size=1,verbose = 1)



In [25]:
sentiment[0]

array([0.04398831, 0.90402454, 0.05198708], dtype=float32)

In [26]:
for i in range(0,len(sentiment)):
    a=sentiment[i]
    results.append(np.argmax(a))

In [27]:
results[0]

1

In [35]:
sub = pd.read_csv('/kaggle/input/prometeo23-kaggle/sample.csv')

In [36]:
sub_dict = {0:'Negative' , 1:'Neutral' , 2:'Positive'}
results = sub['Predicted'].map(sub_dict)

In [37]:
sub['Predicted'] = results

In [38]:
sub

Unnamed: 0,Id,Predicted
0,0,Negative
1,1,Neutral
2,2,Positive
3,3,Negative
4,4,Negative
...,...,...
495,495,Negative
496,496,Neutral
497,497,Negative
498,498,Positive


In [34]:
sub.to_csv('submission.csv',index=False)