# Step 1: Load the dataset and Preprocess data

### Split the data into train and test DataFrames

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Sentiment140 is stored as a header-less CSV, so read it raw with the
# ISO-8859-1 codec and attach the six column names afterwards.
df = pd.read_csv('tweet_dataset.csv', header=None, encoding='ISO-8859-1')
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Remap the labels to binary classes: 0 stays negative, 4 becomes 1 (positive)
df['target'] = df['target'].replace({4: 1, 0: 0})

# Keep only the tweet text and its label, holding out 20% of the rows for
# testing; the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(
    df[['text', 'target']],
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 1280000, Test size: 320000


# Step 2: Tokenize Tweets Using Hugging Face's BERT Tokenizer

### We use the PyTorch build that is tailored to Apple Silicon (MPS backend)

In [4]:
import torch
from transformers import BertTokenizer

# Prefer the Apple Silicon GPU (MPS backend) when PyTorch reports it as
# available; otherwise run everything on the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Fetch the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize function using DataFrames
def tokenize_df(df):
    """Tokenize the ``text`` column of ``df`` with the notebook-level ``tokenizer``.

    All texts are encoded in a single call with padding and truncation
    enabled, so every row of the result has the same sequence length.

    Args:
        df: DataFrame with a ``text`` column holding the tweets to encode.

    Returns:
        A DataFrame with one column per tokenizer output (``input_ids``,
        ``token_type_ids`` and ``attention_mask`` for this tokenizer); each
        cell holds that tweet's values as a plain Python list of ints.
    """
    tokens = tokenizer(
        df['text'].to_list(), padding=True, truncation=True, return_tensors='pt'
    )
    # The tokenizer hands back CPU tensors, and the result is a DataFrame of
    # Python lists, so there is nothing to gain from copying the whole
    # tokenized dataset to the accelerator and straight back again; doing so
    # only risks exhausting device memory. Batches are moved to the device
    # later, one at a time, inside the training and evaluation loops.
    return pd.DataFrame({key: val.tolist() for key, val in tokens.items()})

# Run both splits through the tokenizer (train first, then test) and preview
# the first few encoded training rows.
train_tokens_df, test_tokens_df = (
    tokenize_df(split) for split in (train_df, test_df)
)

print(train_tokens_df.head())

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps
                                           input_ids  \
0  [101, 1030, 1046, 19279, 4710, 10626, 2007, 80...   
1  [101, 2371, 1996, 8372, 2023, 5027, 1010, 2009...   
2  [101, 21766, 18142, 2015, 2006, 11344, 2024, 2...   
3  [101, 3492, 2919, 2305, 2046, 1037, 10231, 768...   
4  [101, 1030, 5887, 23736, 9468, 18319, 3398, 10...   

                                      token_type_ids  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                      attention_mask  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# Step 3: Create Dataset Class to Work with DataFrames

In [6]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized tweets with their sentiment labels.

    Rows are matched by position: item ``i`` combines row ``i`` of
    ``tokens_df`` with entry ``i`` of ``labels_df['target']``, regardless of
    what index either DataFrame carries.

    Args:
        tokens_df: DataFrame with one column per model input; each cell holds
            that input's list of values for a single tweet.
        labels_df: DataFrame with a ``target`` column holding the labels.
    """

    def __init__(self, tokens_df, labels_df):
        self.tokens_df = tokens_df
        # .values drops the index, so labels are addressed purely by position
        self.labels = labels_df['target'].values

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        # Use positional .iloc: plain ``val[idx]`` is a label lookup on the
        # Series index, which raises KeyError (or silently pairs the wrong row
        # with the positional label) whenever tokens_df is not indexed 0..n-1.
        item = {key: torch.tensor(val.iloc[idx]) for key, val in self.tokens_df.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's tokens and labels in a Dataset that a DataLoader can consume
train_dataset = TweetDataset(tokens_df=train_tokens_df, labels_df=train_df)
test_dataset = TweetDataset(tokens_df=test_tokens_df, labels_df=test_df)

# Step 4: Fine-Tune BERT Model

In [7]:
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Load pre-trained BERT with a classification head (binary classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Set up the optimizer.
# transformers.AdamW is deprecated and no longer shipped in recent
# transformers releases, so the optimizer comes from torch.optim instead.
# eps and weight_decay are pinned to the values the transformers
# implementation used by default, keeping the optimizer settings unchanged.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Create DataLoader for batch processing (reshuffled every epoch)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

# Training loop
for epoch in range(3):  # Train for 3 epochs
    model.train()
    for batch in train_loader:
        # Move only the current batch to the selected device
        batch = {key: val.to(device) for key, val in batch.items()}
        # The batch includes 'labels', so the model output carries the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1} completed.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 5: Evaluate the Model

In [None]:
from sklearn.metrics import accuracy_score

# Batch the held-out split for inference
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

# Accumulators for the predicted and the true class of every test example
all_preds, all_labels = [], []

# Put the model in evaluation mode and skip gradient tracking while predicting
model.eval()
with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit
        preds = outputs.logits.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Share of test tweets whose predicted class matches the true label
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")