In [122]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from datasets import load_dataset

# Your imports
from text_processing.pre_processing import preprocessing_text
from glove.glove_controller import load_glove, tweet_to_glove_vector
from vit import VisionTransformerWithLearnableAux

# Fixed seed; passed as `random_state` when the combined training frame is shuffled.
RANDOM_STATE = 123

In [123]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# ====================================
# Load Data
# ====================================

print("Loading TweetEval...")
dataset = load_dataset('tweet_eval', 'sentiment')

# TweetEval train split: raw labels 0 and 2 are re-coded to 0 and 1; every
# other raw label is marked -1 and dropped below as neutral.
train_texts = dataset['train']['text']
train_labels = [{0: 0, 2: 1}.get(raw, -1) for raw in dataset['train']['label']]
df_tweet_eval = pd.DataFrame({'text': train_texts, 'label': train_labels})
df_tweet_eval = df_tweet_eval.loc[df_tweet_eval['label'].ne(-1)]  # Remove neutral

print(f"TweetEval train: {len(df_tweet_eval)} samples")

# Local training CSV: only the 'text' and 'label' columns are kept.
print("Loading your original training data...")
df_your_train = pd.read_csv('../../../data/twitter_sentiment_train.csv').loc[:, ['text', 'label']]

print(f"Your original train: {len(df_your_train)} samples")

# Stack both sources, then shuffle the rows with the fixed seed and renumber them.
df_train = (
    pd.concat([df_tweet_eval, df_your_train], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

print(
    f"Combined train: {len(df_train)} samples",
    f"  From TweetEval: {len(df_tweet_eval)}",
    f"  From your data: {len(df_your_train)}",
    sep="\n",
)

# The test set comes only from the local test CSV.
df_test = (
    pd.read_csv('../../../data/twitter_sentiment_test.csv')
    .loc[:, ['text', 'label']]
    .reset_index(drop=True)
)

print(f"\nTest: {len(df_test)} samples")

print(
    f"\n{'='*60}",
    "FINAL DATASET:",
    f"Train: {len(df_train)} (TweetEval + Your data)",
    f"Test:  {len(df_test)} (Your original test set)",
    f"{'='*60}",
    sep="\n",
)

Loading TweetEval...
TweetEval: 24942 samples
Balancing TweetEval...
  Before: Class 0=7093, Class 1=17849, Ratio=2.52
  After:  Class 0=17849, Class 1=17849, Ratio=1.00

Loading your original data...
Your data: 14186 samples
Balancing your data...
  Before: Class 0=7093, Class 1=7093, Ratio=1.00
  After:  Class 0=7093, Class 1=7093, Ratio=1.00

Loading SST-2...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 717870.85 examples/s]
Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 192203.12 examples/s]
Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 358718.18 examples/s]


SST-2: 67349 samples
Balancing SST-2...
  Before: Class 0=29780, Class 1=37569, Ratio=1.26
  After:  Class 0=37569, Class 1=37569, Ratio=1.00

COMBINING DATASETS

Combined train: 125022 samples
  From TweetEval: 35698
  From your data: 14186
  From SST-2: 75138

Final distribution:
label
0    62511
1    62511
Name: count, dtype: int64
Ratio: 1.00

Test: 6347 samples
Test distribution:
label
0    3972
1    2375
Name: count, dtype: int64

FINAL DATASET:
Train: 125022 samples (Balanced: TweetEval + Your data + SST-2)
Test:  6347 samples (Your original test set)


In [125]:
# ====================================
# Preprocessing and GloVe Embeddings
# ====================================

# Run the project's text preprocessing on every tweet. The result of each call
# is joined with spaces below, so it is presumably an iterable of string tokens
# (TODO confirm against `preprocessing_text`).
print("\nPreprocessing texts...")
clean_tokens_train = [preprocessing_text(t) for t in tqdm(df_train['text'], desc="Preprocessing train")]
clean_tokens_test = [preprocessing_text(t) for t in tqdm(df_test['text'], desc="Preprocessing test")]
clean_text_train = [' '.join(tokens) for tokens in clean_tokens_train]
clean_text_test = [' '.join(tokens) for tokens in clean_tokens_test]

# Load GloVe vectors
# The embedding file lives at a machine-specific location, so it can be
# overridden with the GLOVE_PATH environment variable; the original local path
# is kept as the fallback so existing setups keep working unchanged.
EMBED_DIM = 200
glove_path = os.environ.get(
    "GLOVE_PATH",
    r"C:\Users\eggle\Downloads\glove.twitter.27B\glove.twitter.27B.200d.txt",
)
glove_vectors = load_glove(glove_path, EMBED_DIM, use_cache=True)

# Create GloVe embeddings: one vector per tweet, stacked row-wise.
print("Creating GloVe embeddings...")
glove_train = np.vstack([tweet_to_glove_vector(t, glove_vectors, EMBED_DIM)
                         for t in tqdm(clean_text_train, desc="Embedding train")])
glove_test  = np.vstack([tweet_to_glove_vector(t, glove_vectors, EMBED_DIM)
                         for t in tqdm(clean_text_test, desc="Embedding test")])

# Extract labels
y_train = df_train['label'].values
y_test = df_test['label'].values

# Fail fast if features and labels are misaligned, instead of surfacing the
# problem later as a confusing error (or silent mislabelling) during training.
assert glove_train.shape == (len(y_train), EMBED_DIM), "train embeddings do not match train labels"
assert glove_test.shape == (len(y_test), EMBED_DIM), "test embeddings do not match test labels"

print(f"GloVe train shape: {glove_train.shape}")
print(f"GloVe test shape: {glove_test.shape}")



Preprocessing texts...


Preprocessing train: 100%|██████████| 125022/125022 [00:51<00:00, 2441.67it/s]
Preprocessing test: 100%|██████████| 6347/6347 [00:02<00:00, 2347.65it/s]


Loading GloVe from cache: glove\glove.pkl
Loaded 1193514 word vectors from cache
Creating GloVe embeddings...


Embedding train: 100%|██████████| 125022/125022 [00:02<00:00, 42555.62it/s]
Embedding test: 100%|██████████| 6347/6347 [00:00<00:00, 38491.43it/s]

GloVe train shape: (125022, 200)
GloVe test shape: (6347, 200)





In [126]:
# ====================================
# Simple Dataset (Back to Original)
# ====================================

class GloVeImageDataset(Dataset):
    """Dataset that serves GloVe vectors reshaped as single-channel images.

    Each sample is returned as a ``(1, height, width)`` float32 tensor together
    with its label as a 0-dim int64 tensor.
    """

    def __init__(self, glove_vectors, labels, height=20, width=10):
        """
        Parameters
        ----------
        glove_vectors : np.array
            Array of shape (num_samples, height * width), e.g. (num_samples, 200)
            for the default 20 x 10 layout.
        labels : np.array
            Array of shape (num_samples,).
        height : int
            Image height for reshaping.
        width : int
            Image width for reshaping.

        Raises
        ------
        ValueError
            If ``labels`` has no sample axis, or if ``glove_vectors`` does not
            hold exactly ``len(labels) * height * width`` values.
        """
        vectors = np.asarray(glove_vectors)
        labels = np.asarray(labels)

        if labels.ndim == 0:
            raise ValueError("labels must contain one entry per sample")

        # The labels define how many samples exist. Reshaping with -1 would
        # silently accept a wrong vector size or label count and pair samples
        # with the wrong labels, so the total size is checked explicitly.
        num_samples = labels.shape[0]
        expected_size = num_samples * height * width
        if vectors.size != expected_size:
            raise ValueError(
                f"glove_vectors with shape {vectors.shape} cannot be arranged as "
                f"{num_samples} samples of {height}x{width} "
                f"(expected {expected_size} values, got {vectors.size})"
            )

        # Reshape to (num_samples, 1, height, width) - add channel dimension
        self.glove_vectors = vectors.reshape(num_samples, 1, height, width)
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx`` as torch tensors."""
        # The stored array already has the (1, height, width) layout per sample.
        x = torch.tensor(self.glove_vectors[idx], dtype=torch.float32)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

# Wrap the train and test embeddings as 20 x 10 single-channel image datasets.
train_dataset = GloVeImageDataset(glove_train, y_train, height=20, width=10)
test_dataset = GloVeImageDataset(glove_test, y_test, height=20, width=10)

# Only the training batches are shuffled; both loaders load in the main process.
train_loader = DataLoader(train_dataset, batch_size=128, num_workers=0, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, num_workers=0, shuffle=False)

print("\nDatasets created")
print(f"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")


Datasets created
Train batches: 977, Test batches: 50


In [127]:
# ====================================
# Initialize Model (Original ViT)
# ====================================

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
num_epochs = 20

# Seed torch's global RNG before the model is built so that weight
# initialisation (and later draws from the same global generator) are
# repeatable across kernel restarts. Some GPU kernels may remain
# non-deterministic even with a fixed seed.
torch.manual_seed(RANDOM_STATE)

model = VisionTransformerWithLearnableAux(
    glove_dim=200,
    embed_dim=200,
    d_ff=1000,
    num_heads=5,
    layers=3,
    num_classes=2,
    dropout=0.1,
    num_auxiliary_patches=3,
    mode="fine-tuning"
).to(device)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Training setup: label-smoothed cross-entropy, AdamW, and a cosine schedule
# that anneals the learning rate from 1e-4 towards 1e-6 over `num_epochs` steps.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)


Using device: cuda
Model parameters: 1,727,202


In [128]:
# ====================================
# Training Loop (Original)
# ====================================
# Highest accuracy observed after any completed epoch. Accuracy is measured on
# `test_loader`, so this "best" figure is selected on the test set and is an
# optimistic estimate of generalisation.
best_acc = 0.0

print("\n" + "="*60)
print("Training Vision Transformer on GloVe Embeddings")
print("="*60 + "\n")

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0

    for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Forward pass
        logits = model(batch_x)
        loss = criterion(logits, batch_y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # The learning-rate schedule advances once per epoch, after the optimizer updates.
    scheduler.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_y.numpy())

    acc = accuracy_score(all_labels, all_preds)

    # Keep the running maximum; without this update the summary below would
    # always report the initial 0.0.
    best_acc = max(best_acc, acc)

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {train_loss/len(train_loader):.4f} | Test Acc: {acc:.4f}")

print(f"\n{'='*60}")
print(f"Best Test Accuracy: {best_acc:.4f}")
print(f"{'='*60}")


Training Vision Transformer on GloVe Embeddings



Epoch 1/20: 100%|██████████| 977/977 [00:14<00:00, 69.38it/s]


Epoch 1/20 | Loss: 0.5385 | Test Acc: 0.8528


Epoch 2/20: 100%|██████████| 977/977 [00:13<00:00, 72.40it/s]


Epoch 2/20 | Loss: 0.5028 | Test Acc: 0.8568


Epoch 3/20: 100%|██████████| 977/977 [00:14<00:00, 68.33it/s]


Epoch 3/20 | Loss: 0.4964 | Test Acc: 0.8499


Epoch 4/20: 100%|██████████| 977/977 [00:12<00:00, 75.49it/s]


Epoch 4/20 | Loss: 0.4921 | Test Acc: 0.8547


Epoch 5/20: 100%|██████████| 977/977 [00:12<00:00, 81.13it/s]


Epoch 5/20 | Loss: 0.4878 | Test Acc: 0.8549


Epoch 6/20: 100%|██████████| 977/977 [00:12<00:00, 80.59it/s]


Epoch 6/20 | Loss: 0.4850 | Test Acc: 0.8462


Epoch 7/20:   1%|          | 11/977 [00:00<00:11, 83.81it/s]


KeyboardInterrupt: 

In [None]:
# Per-class precision / recall / F1 for the predictions and labels collected
# by the most recent evaluation pass of the training loop.
print(
    "\nFinal Classification Report:",
    classification_report(all_labels, all_preds, target_names=['Negative', 'Positive']),
    sep="\n",
)


Final Classification Report:
              precision    recall  f1-score   support

    Negative       0.86      0.92      0.89      3972
    Positive       0.84      0.76      0.80      2375

    accuracy                           0.86      6347
   macro avg       0.85      0.84      0.84      6347
weighted avg       0.86      0.86      0.85      6347

