In [6]:
!ls /kaggle/input/

twitter-us-airline


In [8]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Step 1: Load and Preprocess Data
def clean_text(text):
    import re
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)    # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    return text.strip().lower()

data = pd.read_csv("/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv")
data["text"] = data["text"].apply(clean_text)

# Handle class imbalance
labels = data["airline_sentiment"].map({"positive": 2, "neutral": 1, "negative": 0})
class_weights = compute_class_weight(class_weight="balanced", classes=[0, 1, 2], y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to("cuda")

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["text"], labels, test_size=0.2, random_state=42
)

# Step 2: Tokenize Data
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(texts, labels):
    tokens = tokenizer(
        list(texts), padding="max_length", truncation=True, max_length=128, return_tensors="pt"
    )
    return tokens["input_ids"], tokens["attention_mask"], torch.tensor(labels.values, dtype=torch.long)

train_inputs, train_masks, train_labels = tokenize_function(train_texts, train_labels)
val_inputs, val_masks, val_labels = tokenize_function(val_texts, val_labels)

class SentimentDataset(Dataset):
    def __init__(self, inputs, masks, labels):
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.masks[idx], self.labels[idx]

train_dataset = SentimentDataset(train_inputs, train_masks, train_labels)
val_dataset = SentimentDataset(val_inputs, val_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Step 3: Model Setup
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
model.to("cuda")

# Step 4: Optimization and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 1
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Step 5: Training and Validation Loops
def train_epoch(model, loader):
    model.train()
    total_loss, total_correct = 0, 0
    for inputs, masks, labels in loader:
        inputs, masks, labels = inputs.to("cuda"), masks.to("cuda"), labels.to("cuda")

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = outputs.logits.argmax(dim=1)
        total_correct += (preds == labels).sum().item()

    return total_loss / len(loader), total_correct / len(loader.dataset)

def validate_epoch(model, loader):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for inputs, masks, labels in loader:
            inputs, masks, labels = inputs.to("cuda"), masks.to("cuda"), labels.to("cuda")
            outputs = model(inputs, attention_mask=masks)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()

    return total_loss / len(loader), total_correct / len(loader.dataset)

for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader)
    val_loss, val_acc = validate_epoch(model, val_loader)
    print(f"Epoch {epoch + 1}/{epochs}:")
    print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

# Step 6: Evaluation
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for inputs, masks, labels in val_loader:
        inputs, masks, labels = inputs.to("cuda"), masks.to("cuda"), labels.to("cuda")
        outputs = model(inputs, attention_mask=masks)
        preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print(classification_report(true_labels, preds, target_names=["Negative", "Neutral", "Positive"]))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1:
  Train Loss: 0.5773, Train Accuracy: 0.7774
  Val Loss: 0.4670, Val Accuracy: 0.8286
              precision    recall  f1-score   support

    Negative       0.95      0.85      0.89      1889
     Neutral       0.64      0.72      0.68       580
    Positive       0.70      0.88      0.78       459

    accuracy                           0.83      2928
   macro avg       0.76      0.82      0.78      2928
weighted avg       0.85      0.83      0.83      2928



In [None]:
#version3 code here

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics import classification_report
from sklearn.utils import resample
import numpy as np
import random

# Fix random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Step 1: Load and Preprocess Data
def clean_text(text):
    import re
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)    # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    return text.strip().lower()

# Load dataset
dataset_path = "/kaggle/input/twitter-us-airline/Twitter_US_Airline/Tweets.csv"
df = pd.read_csv(dataset_path)

# Apply text cleaning
df["clean_text"] = df["text"].apply(clean_text)

# Extract cleaned texts and labels
texts = df["clean_text"].tolist()
labels = df["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2}).tolist()

# Step 2: Oversample Data for Class Balancing
def oversample_data(texts, labels):
    data = list(zip(texts, labels))
    negative = [x for x in data if x[1] == 0]
    neutral = [x for x in data if x[1] == 1]
    positive = [x for x in data if x[1] == 2]

    neutral_upsampled = resample(neutral, replace=True, n_samples=len(negative), random_state=42)
    positive_upsampled = resample(positive, replace=True, n_samples=len(negative), random_state=42)

    balanced_data = negative + neutral_upsampled + positive_upsampled
    random.shuffle(balanced_data)
    return zip(*balanced_data)

# Step 3: Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Step 4: Attention Layer
class AttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.attention = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, rnn_output):
        weights = torch.softmax(self.attention(rnn_output), dim=1)
        weighted_output = torch.sum(weights * rnn_output, dim=1)
        return weighted_output, weights

# Step 5: Base Model with RoBERTa + RNN + Attention
class RoBERTaRNNWithAttention(nn.Module):
    def __init__(self, model_type="lstm", hidden_size=128, num_classes=3):
        super(RoBERTaRNNWithAttention, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.rnn_type = model_type.lower()
        self.hidden_size = hidden_size

        if self.rnn_type == "lstm":
            self.rnn = nn.LSTM(self.roberta.config.hidden_size, hidden_size, batch_first=True, bidirectional=True)
        elif self.rnn_type == "bilstm":
            self.rnn = nn.LSTM(self.roberta.config.hidden_size, hidden_size, batch_first=True, bidirectional=True)
        elif self.rnn_type == "gru":
            self.rnn = nn.GRU(self.roberta.config.hidden_size, hidden_size, batch_first=True, bidirectional=True)
        else:
            raise ValueError(f"Unsupported RNN type: {model_type}")

        self.attention = AttentionLayer(hidden_size * 2)  # Bidirectional doubles the size
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Freeze RoBERTa during training
            roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = roberta_output.last_hidden_state
        rnn_output, _ = self.rnn(last_hidden_state)
        attn_output, _ = self.attention(rnn_output)
        logits = self.fc(attn_output)
        return logits

# Step 6: Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, models):
        super(EnsembleModel, self).__init__()
        self.models = nn.ModuleList(models)
        self.fc = nn.Linear(len(models) * 3, 3)

    def forward(self, input_ids, attention_mask):
        logits_list = [model(input_ids, attention_mask) for model in self.models]
        logits = torch.cat(logits_list, dim=1)  # Concatenate logits
        return self.fc(logits)

# Step 7: Oversample and Create Dataset
balanced_texts, balanced_labels = oversample_data(texts, labels)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
dataset = SentimentDataset(balanced_texts, balanced_labels, tokenizer)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
lstm_model = RoBERTaRNNWithAttention(model_type="lstm").to(device)
bilstm_model = RoBERTaRNNWithAttention(model_type="bilstm").to(device)
gru_model = RoBERTaRNNWithAttention(model_type="gru").to(device)

ensemble_model = EnsembleModel([lstm_model, bilstm_model, gru_model]).to(device)

# Optimizer and loss
optimizer = torch.optim.AdamW(ensemble_model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    ensemble_model.train()
    train_loss, train_acc = 0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = ensemble_model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (logits.argmax(dim=1) == labels).sum().item()

    val_loss, val_acc, val_preds, val_labels = 0, 0, [], []
    ensemble_model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = ensemble_model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            val_loss += loss.item()
            val_acc += (logits.argmax(dim=1) == labels).sum().item()
            val_preds.extend(logits.argmax(dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss / len(train_loader):.4f}, Train Accuracy: {train_acc / len(train_dataset):.4f}")
    print(f"Val Loss: {val_loss / len(val_loader):.4f}, Val Accuracy: {val_acc / len(val_dataset):.4f}")

# Classification report
print(classification_report(val_labels, val_preds, target_names=["Negative", "Neutral", "Positive"]))

# Save model
torch.save(ensemble_model.state_dict(), "ensemble_model.pth")
tokenizer.save_pretrained("ensemble_model_tokenizer")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
