In [2]:
!pip install torch transformers pandas scikit-learn



In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import random
import time

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)


print("All dependencies have been imported, and the random seed has been set.")


All dependencies have been imported, and the random seed has been set.


In [5]:
# Load the CSV file
df = pd.read_csv("Tweets.csv")
# df = pd.read_csv("Tweets.csv", nrows=2000)


# Keep only rows where 'text' and 'airline_sentiment' are not null
df = df.dropna(subset=['text', 'airline_sentiment'])

# Encode sentiment labels into integers
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Split into train and test sets (80% train, 20% test), stratified by label
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

print("Data loading and preprocessing complete.")


Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Train size: 11712, Test size: 2928
Data loading and preprocessing complete.


In [8]:
class TweetDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer: BertTokenizer, max_len: int = 128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        text = str(self.data.iloc[idx]['text'])
        label = int(self.data.iloc[idx]['label'])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

print("The TweetDataset definition is complete.")


The TweetDataset definition is complete.


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TweetDataset(train_df, tokenizer, max_len=128)
test_dataset  = TweetDataset(test_df,  tokenizer, max_len=128)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size)



In [12]:
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
epochs = 3
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print("Model initialization, along with the optimizer and scheduler setup, is complete.")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Model initialization, along with the optimizer and scheduler setup, is complete.




In [14]:
def train_epoch(model, loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    correct = 0

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss, logits = outputs.loss, outputs.logits
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(loader)
    accuracy = correct / (len(loader.dataset))
    return accuracy, avg_loss

def eval_model(model, loader, device):
    model.eval()
    total_loss = 0
    correct = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss, logits = outputs.loss, outputs.logits
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader)
    accuracy = correct / (len(loader.dataset))
    return accuracy, avg_loss, all_preds, all_labels


print("The training and validation functions have been defined.")


The training and validation functions have been defined.


In [16]:
best_accuracy = 0

for epoch in range(epochs):
    start = time.time()
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_acc, val_loss, _, _ = eval_model(model, test_loader, device)
    end = time.time()

    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f} | "
          f"Val loss: {val_loss:.4f}, Val acc: {val_acc:.4f} | "
          f"Time: {end-start:.0f}s")

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_state.bin')


print("Model training complete.")


Epoch 1/3 | Train loss: 0.4718, Train acc: 0.8179 | Val loss: 0.4275, Val acc: 0.8364 | Time: 2596s
Epoch 2/3 | Train loss: 0.2589, Train acc: 0.9122 | Val loss: 0.4743, Val acc: 0.8371 | Time: 2490s
Epoch 3/3 | Train loss: 0.1528, Train acc: 0.9566 | Val loss: 0.6146, Val acc: 0.8439 | Time: 2484s
Model training complete.


In [17]:
_, _, y_pred, y_true = eval_model(model, test_loader, device)
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))



              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1835
     neutral       0.72      0.67      0.70       620
    positive       0.78      0.77      0.78       473

    accuracy                           0.84      2928
   macro avg       0.80      0.79      0.79      2928
weighted avg       0.84      0.84      0.84      2928



In [18]:
model.eval()
preds = []

for item in test_dataset:
    input_ids = item['input_ids'].unsqueeze(0).to(device)
    attention_mask = item['attention_mask'].unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    pred_id = torch.argmax(logits, dim=1).cpu().item()
    preds.append(label_encoder.inverse_transform([pred_id])[0])

output_df = pd.DataFrame({
    'tweet_id': test_df['tweet_id'].values,
    'bert_pred': preds,
    'airline_sentiment': test_df['airline_sentiment'].values
})

output_df.to_csv("bert_alldata.csv", index=False)
print("Saved comparison CSV to bert_alldata.csv")
print(output_df.head())




Saved comparison CSV to bert_alldata.csv
             tweet_id bert_pred airline_sentiment
0  568803260569690112   neutral           neutral
1  569310070825226241  negative          positive
2  568155066668060672  negative          negative
3  570247551368241153  negative          negative
4  567783713833234432  negative          negative
