In [9]:
# Import Libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (DebertaV2Tokenizer, TFDebertaV3ForSequenceClassification,
                          RobertaTokenizer, TFRobertaForSequenceClassification,
                          XLNetTokenizer, TFXLNetForSequenceClassification)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Load Dataset
file_path = "/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv"
df = pd.read_csv(file_path)

# Preprocess Text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()

df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode Labels
sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["airline_sentiment"].map(sentiment_mapping)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["label"], test_size=0.2, random_state=42)

# Hyperparameters
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 3e-5
NUM_CLASSES = 3  # Change this according to your task

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

# ------------------------- 1. TFDeBERTa Model -------------------------
deberta_name = "microsoft/deberta-v3-base"
deberta_tokenizer = DebertaV2Tokenizer.from_pretrained(deberta_name)
deberta_model = TFDebertaV3ForSequenceClassification.from_pretrained(deberta_name, num_labels=NUM_CLASSES)

# Tokenize Data for DeBERTa
deberta_train_encodings = deberta_tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
deberta_test_encodings = deberta_tokenizer(list(X_test), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# TensorFlow Dataset for DeBERTa
deberta_train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(deberta_train_encodings),
    y_train
)).shuffle(len(X_train)).batch(BATCH_SIZE)

deberta_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(deberta_test_encodings),
    y_test
)).batch(BATCH_SIZE)

# Compile and Train DeBERTa Model
deberta_model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss=deberta_model.compute_loss, metrics=['accuracy'])
deberta_model.fit(deberta_train_dataset, validation_data=deberta_test_dataset, epochs=EPOCHS, callbacks=[early_stopping, reduce_lr])

# ------------------------- 2. TFRoBERTa Model -------------------------
roberta_name = "roberta-base"
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_name)
roberta_model = TFRobertaForSequenceClassification.from_pretrained(roberta_name, num_labels=NUM_CLASSES)

# Tokenize Data for RoBERTa
roberta_train_encodings = roberta_tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
roberta_test_encodings = roberta_tokenizer(list(X_test), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# TensorFlow Dataset for RoBERTa
roberta_train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(roberta_train_encodings),
    y_train
)).shuffle(len(X_train)).batch(BATCH_SIZE)

roberta_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(roberta_test_encodings),
    y_test
)).batch(BATCH_SIZE)

# Compile and Train RoBERTa Model
roberta_model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss=roberta_model.compute_loss, metrics=['accuracy'])
roberta_model.fit(roberta_train_dataset, validation_data=roberta_test_dataset, epochs=EPOCHS, callbacks=[early_stopping, reduce_lr])

# ------------------------- 3. TFXLNet Model -------------------------
xlnet_name = "xlnet-base-cased"
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_name)
xlnet_model = TFXLNetForSequenceClassification.from_pretrained(xlnet_name, num_labels=NUM_CLASSES)

# Tokenize Data for XLNet
xlnet_train_encodings = xlnet_tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
xlnet_test_encodings = xlnet_tokenizer(list(X_test), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# TensorFlow Dataset for XLNet
xlnet_train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(xlnet_train_encodings),
    y_train
)).shuffle(len(X_train)).batch(BATCH_SIZE)

xlnet_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(xlnet_test_encodings),
    y_test
)).batch(BATCH_SIZE)

# Compile and Train XLNet Model
xlnet_model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss=xlnet_model.compute_loss, metrics=['accuracy'])
xlnet_model.fit(xlnet_train_dataset, validation_data=xlnet_test_dataset, epochs=EPOCHS, callbacks=[early_stopping, reduce_lr])

# ------------------------- 4. Ensemble Predictions -------------------------
def get_ensemble_predictions(models, test_dataset):
    predictions = []
    for model in models:
        preds = model.predict(test_dataset).logits
        probs = tf.nn.softmax(preds, axis=1).numpy()
        predictions.append(probs)
    
    # Average the probabilities
    avg_probs = np.mean(predictions, axis=0)
    ensemble_preds = np.argmax(avg_probs, axis=1)
    return ensemble_preds

# Get predictions from all models
models = [deberta_model, roberta_model, xlnet_model]
ensemble_preds = get_ensemble_predictions(models, deberta_test_dataset)

# Evaluate Accuracy
accuracy = accuracy_score(y_test, ensemble_preds)
print(f'Ensemble Test Accuracy: {accuracy:.4f}')


ImportError: cannot import name 'TFDebertaV3ForSequenceClassification' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)