In [8]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Load dataset
file_path = "/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv"
df = pd.read_csv(file_path)

# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()

df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode labels
sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["airline_sentiment"].map(sentiment_mapping)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["label"], test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenization
def encode_texts(texts, tokenizer, max_len=128):
    return tokenizer(list(texts), max_length=max_len, truncation=True, padding="max_length", return_tensors="tf")

X_train_enc = encode_texts(X_train, tokenizer)
X_test_enc = encode_texts(X_test, tokenizer)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train_enc), y_train)).shuffle(1000).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test_enc), y_test)).batch(32)

# Load RoBERTa model
roberta_model = TFAutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3, from_pt=True)

# Define optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define accuracy metric
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Custom Training Step
@tf.function
def train_step(batch_inputs, batch_labels):
    with tf.GradientTape() as tape:
        logits = roberta_model(batch_inputs, training=True).logits
        loss = loss_fn(batch_labels, logits)
    
    gradients = tape.gradient(loss, roberta_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, roberta_model.trainable_variables))

    # Update accuracy metric
    train_accuracy.update_state(batch_labels, logits)
    
    return loss

# Custom Testing Step
@tf.function
def test_step(batch_inputs, batch_labels):
    logits = roberta_model(batch_inputs, training=False).logits
    loss = loss_fn(batch_labels, logits)
    
    # Update accuracy metric
    test_accuracy.update_state(batch_labels, logits)
    
    return loss

# Training Loop
epochs = 3
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Reset accuracy at start of epoch
    train_accuracy.reset_state()
    test_accuracy.reset_state()

    # Training
    for batch_inputs, batch_labels in train_dataset:
        loss = train_step(batch_inputs, batch_labels)

    # Testing
    for batch_inputs, batch_labels in test_dataset:
        test_loss = test_step(batch_inputs, batch_labels)

    print(f"Train Loss: {loss.numpy():.4f}, Train Accuracy: {train_accuracy.result().numpy():.4f}")
    print(f"Test Loss: {test_loss.numpy():.4f}, Test Accuracy: {test_accuracy.result().numpy():.4f}")

# Save model
roberta_model.save_pretrained("/mnt/data/roberta_sentiment_model")
tokenizer.save_pretrained("/mnt/data/roberta_tokenizer")


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Train Loss: 0.7854, Train Accuracy: 0.8020
Test Loss: 0.1628, Test Accuracy: 0.8583

Epoch 2/3
Train Loss: 0.3418, Train Accuracy: 0.8676
Test Loss: 0.1544, Test Accuracy: 0.8596

Epoch 3/3
Train Loss: 0.3315, Train Accuracy: 0.9032
Test Loss: 0.1566, Test Accuracy: 0.8552


('/mnt/data/roberta_tokenizer/tokenizer_config.json',
 '/mnt/data/roberta_tokenizer/special_tokens_map.json',
 '/mnt/data/roberta_tokenizer/vocab.json',
 '/mnt/data/roberta_tokenizer/merges.txt',
 '/mnt/data/roberta_tokenizer/added_tokens.json',
 '/mnt/data/roberta_tokenizer/tokenizer.json')