In [2]:
!pip install torch transformers pandas scikit-learn



In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import random
import time

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

print("All dependencies have been imported, and the random seed has been set.")


All dependencies have been imported, and the random seed has been set.


In [5]:
csv_path = "Tweets.csv"  
# df = pd.read_csv(csv_path, nrows=5000)
df = pd.read_csv(csv_path)

print("Sample data：")
print(df.head())

df = df.dropna(subset=['text', 'airline_sentiment'])

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])
print("Label mapping：", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

print("Data loading and preprocessing complete.")


Sample data：
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                  

In [6]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        tweet = str(self.data.iloc[index]['text'])
        label = int(self.data.iloc[index]['label'])
        
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'tweet_text': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

print("The TweetDataset definition is complete.")


The TweetDataset definition is complete.


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

dataset = TweetDataset(df, tokenizer, max_len=128)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print(f"Dataset split complete: training set samples. {train_size}, Validation set samples {val_size}")


Dataset split complete: training set samples. 11712, Validation set samples 2928


In [12]:
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
epochs = 3  
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  
    num_training_steps=total_steps
)

print("Model initialization, along with the optimizer and scheduler setup, is complete.")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Model initialization, along with the optimizer and scheduler setup, is complete.




In [13]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    losses = []
    correct_predictions = 0
    
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses), all_preds, all_labels

print("The training and validation functions have been defined.")


The training and validation functions have been defined.


In [16]:
history = {'train_acc': [], 'train_loss': [], 'val_acc': [], 'val_loss': []}
best_accuracy = 0

print("Begin training model...")

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    start_time = time.time()
    
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_acc, val_loss, y_pred, y_true = eval_model(model, val_loader, device)
    
    end_time = time.time()
    
    history['train_acc'].append(train_acc.item())
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc.item())
    history['val_loss'].append(val_loss)
    
    print(f'Train loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')
    print(f'Val loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}')
    print(f'Epoch comsumed time: {end_time - start_time:.0f} seconds\n')
    
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_state.bin')
        print("Best model is saved")

print("Model training complete.")


Begin training model...
Epoch 1/3
Train loss: 0.4883, Accuracy: 0.8101
Val loss: 0.3907, Accuracy: 0.8470
Epoch comsumed time: 2716 seconds

Best model is saved
Epoch 2/3
Train loss: 0.2691, Accuracy: 0.9057
Val loss: 0.4443, Accuracy: 0.8542
Epoch comsumed time: 2671 seconds

Best model is saved
Epoch 3/3
Train loss: 0.1610, Accuracy: 0.9517
Val loss: 0.5363, Accuracy: 0.8535
Epoch comsumed time: 2834 seconds

Model training complete.


In [17]:
print("Classification report：")
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report)


Classification report：
              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1825
     neutral       0.75      0.67      0.71       638
    positive       0.79      0.83      0.81       465

    accuracy                           0.85      2928
   macro avg       0.81      0.81      0.81      2928
weighted avg       0.85      0.85      0.85      2928



In [18]:

model.eval()

preds = []

for text in df['text']:
    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    pred_label_id = torch.argmax(logits, dim=1).cpu().item()
    pred_label_str = label_encoder.inverse_transform([pred_label_id])[0]
    preds.append(pred_label_str)

output_df = pd.DataFrame({
    'tweet_id': df['tweet_id'],
    'bert_pred': preds,
    'airline_sentiment': df['airline_sentiment']
})

output_path = "bert_vs_original.csv"
output_df.to_csv(output_path, index=False)

print("save complete，first 5 row previewed：")
print(output_df.head())

print(f"\n comparation was saved to `{output_path}`。")


save complete，first 5 row previewed：
             tweet_id bert_pred airline_sentiment
0  570306133677760513   neutral           neutral
1  570301130888122368  positive          positive
2  570301083672813571  negative           neutral
3  570301031407624196  negative          negative
4  570300817074462722  negative          negative

 comparation was saved to `bert_vs_original.csv`。
