In [3]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Input, Bidirectional, Concatenate

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Load dataset
file_path = '/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv'  # Replace with your dataset path
df = pd.read_csv(file_path)

# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.strip()

# Apply text preprocessing
df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode labels
sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["airline_sentiment"].map(sentiment_mapping)

# Tokenization using Roberta tokenizer
max_len = 100  # Maximum length for padding
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Encode the text using Roberta tokenizer
encoded_inputs = tokenizer(list(df["cleaned_text"]), truncation=True, padding=True, max_length=max_len, return_tensors="tf")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    encoded_inputs['input_ids'].numpy(), 
    df['label'], 
    test_size=0.2, 
    random_state=42
)

# Convert labels to categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Load Roberta Model for Embedding
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

# Define the Ensemble Model
input_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32)

# Ensure the input_ids are a TensorFlow tensor
input_ids = tf.convert_to_tensor(encoded_inputs['input_ids'])

# Get embeddings from Roberta
embedding_layer = roberta_model(input_ids)[0]  # Use hidden states from the Roberta model

# LSTM Branch
lstm_branch = LSTM(128, return_sequences=True)(embedding_layer)
lstm_branch = LSTM(64)(lstm_branch)

# BiLSTM Branch
bilstm_branch = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
bilstm_branch = LSTM(64)(bilstm_branch)

# GRU Branch
gru_branch = GRU(128, return_sequences=True)(embedding_layer)
gru_branch = GRU(64)(gru_branch)

# Concatenate all branches
concatenated = Concatenate()([lstm_branch, bilstm_branch, gru_branch])

# Dense Layers
dense1 = Dense(128, activation='relu')(concatenated)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(dense2)

# Output Layer
outputs = Dense(3, activation='softmax')(dropout2)  # 3 sentiment classes

# Build the Model
model = tf.keras.Model(inputs=input_ids, outputs=outputs)

# Compile the Model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# Model Summary
model.summary()

# Callbacks for training
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

print("Starting training...")

# Train the Model
history = model.fit(
    X_train, y_train,
    epochs=30,  # Increase if needed
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

print("Training completed.")

print("Starting evaluation...")

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Accuracy: {accuracy:.2f}')

print("Evaluation completed.")

# Save the Model
model.save('roberta_ensemble_model.h5')

# Visualize Training Performance
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Metrics')
plt.xlabel('Epochs')
plt.ylabel('Accuracy / Loss')
plt.legend()
plt.show()


Num GPUs Available:  1


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

ResourceExhaustedError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).

{{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Mul] name: 

Call arguments received by layer 'LayerNorm' (type LayerNormalization):
  • inputs=tf.Tensor(shape=(14640, 45, 768), dtype=float32)