In [1]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Load dataset
# The CSV is read from a path under /kaggle/input; the resulting DataFrame is
# expected to provide the "text" and "airline_sentiment" columns that the
# later steps of this script read.
file_path = "/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv"
df = pd.read_csv(file_path)

# Preprocess text
def preprocess_text(text):
    """Normalize a raw tweet into lowercase, punctuation-free text.

    Steps, in order: lowercase; drop URLs (anything starting with ``http`` or
    a standalone ``www.`` token); drop ``@mentions``; drop every character
    that is neither a word character nor whitespace; collapse the whitespace
    runs left behind by the removals and trim both ends.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the ``NaN``
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        # Missing cells reach ``Series.apply`` as float NaN, which has no
        # ``.lower()``; treat them as empty text instead of raising.
        return ""
    text = text.lower()
    # ``http\S+`` already covers ``https``; anchoring ``www.`` at a word
    # boundary keeps words such as "awwww" from being mistaken for a URL.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    # Removing tokens mid-sentence leaves doubled spaces; split/join collapses
    # every whitespace run to one space and strips both ends.
    return " ".join(text.split())

df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode labels
sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["airline_sentiment"].map(sentiment_mapping)

# Series.map yields NaN for any sentiment that is not a key of the mapping;
# stop here with a clear message rather than training on NaN labels.
if df["label"].isna().any():
    raise ValueError(
        "airline_sentiment contains values missing from sentiment_mapping"
    )

# Train-test split
# Stratify on the label so the train and test splits keep the same class
# proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenization
def encode_texts(texts, tokenizer, max_len=128):
    """Tokenize a collection of texts with fixed-length padding/truncation.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable that accepts the batch as its first positional
            argument plus the ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors`` keyword arguments.
        max_len: Value passed as ``max_length``; with
            ``padding="max_length"`` and ``truncation=True`` it is the length
            requested for every sequence.

    Returns:
        Whatever the tokenizer returns for the batch (requested with
        ``return_tensors="tf"``).
    """
    batch = list(texts)
    tokenizer_options = {
        "max_length": max_len,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "tf",
    }
    return tokenizer(batch, **tokenizer_options)

X_train_enc = encode_texts(X_train, tokenizer)
X_test_enc = encode_texts(X_test, tokenizer)

# Convert labels to NumPy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Create TensorFlow datasets
# The shuffle buffer spans the whole training set so each epoch draws from all
# examples; a smaller buffer only shuffles within a sliding window of the
# input order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(X_train_enc), y_train))
    .shuffle(len(y_train))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
# The evaluation data keeps its original order: no shuffle step.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(X_test_enc), y_test))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Load RoBERTa model
# Sequence-classification model built from the "roberta-base" checkpoint with
# three output labels, loaded from the PyTorch weights (from_pt=True).
roberta_model = TFAutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    from_pt=True,
    num_labels=3,
)

# Define metrics, loss, and optimizer
# The loss is configured with from_logits=True, i.e. it expects unnormalized
# scores rather than probabilities.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Compile the model
roberta_model.compile(
    loss=loss_fn,
    metrics=metrics,
    optimizer=optimizer,
)

# Fine-tune with Keras' fit loop; test_dataset is passed as the validation
# data for each epoch.
epochs = 10
roberta_model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=test_dataset,
)

# Persist the fine-tuned model and the tokenizer files under /mnt/data
roberta_model.save_pretrained("/mnt/data/roberta_sentiment_model")
tokenizer.save_pretrained("/mnt/data/roberta_tokenizer")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


('/mnt/data/roberta_tokenizer/tokenizer_config.json',
 '/mnt/data/roberta_tokenizer/special_tokens_map.json',
 '/mnt/data/roberta_tokenizer/vocab.json',
 '/mnt/data/roberta_tokenizer/merges.txt',
 '/mnt/data/roberta_tokenizer/added_tokens.json',
 '/mnt/data/roberta_tokenizer/tokenizer.json')