In [38]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from datasets import load_dataset

# Your imports
from text_processing.pre_processing import preprocessing_text
from glove.glove_controller import load_glove, tweet_to_glove_vector
from vit import VisionTransformerWithLearnableAux

RANDOM_STATE = 123

In [39]:
# Set up autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# # ====================================
# # Load Data
# # ====================================

# print("Loading TweetEval...")
# dataset = load_dataset('tweet_eval', 'sentiment')

# # Process TweetEval train data
# train_texts = dataset['train']['text']
# train_labels = [0 if l == 0 else 1 if l == 2 else -1 for l in dataset['train']['label']]
# df_tweet_eval = pd.DataFrame({'text': train_texts, 'label': train_labels})
# df_tweet_eval = df_tweet_eval[df_tweet_eval['label'] != -1]  # Remove neutral

# print(f"TweetEval train: {len(df_tweet_eval)} samples")

# # Load YOUR original training data
# print("Loading your original training data...")
# df_your_train = pd.read_csv('../../../data/twitter_sentiment_train.csv')[['text', 'label']]

# print(f"Your original train: {len(df_your_train)} samples")

# # Stack (Combine) Both Training Sets
# df_train = pd.concat([df_tweet_eval, df_your_train], ignore_index=True)

# # Shuffle the combined training data
# df_train = df_train.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# print(f"Combined train: {len(df_train)} samples")
# print(f"  From TweetEval: {len(df_tweet_eval)}")
# print(f"  From your data: {len(df_your_train)}")

# # Use YOUR original test set
# df_test = pd.read_csv('../../../data/twitter_sentiment_test.csv')[['text', 'label']]
# df_test = df_test.reset_index(drop=True)

# print(f"\nTest: {len(df_test)} samples")

# print(f"\n{'='*60}")
# print(f"FINAL DATASET:")
# print(f"Train: {len(df_train)} (TweetEval + Your data)")
# print(f"Test:  {len(df_test)} (Your original test set)")
# print(f"{'='*60}")
from sklearn.model_selection import train_test_split

# Load both datasets
df_train = pd.read_csv('../../data/twitter_sentiment_train.csv')
df_test  = pd.read_csv('../../data/twitter_sentiment_test.csv')

# Concatenate both datasets into one
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Split into 80% training and 20% testing
df_train, df_test = train_test_split(
    df_combined,
    test_size=0.2,           # 20% for testing
    random_state=RANDOM_STATE,
    stratify=df_combined['label']  # Ensures balanced class distribution in both splits
)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

Loading TweetEval...


KeyboardInterrupt: 

In [None]:
# ====================================
# Preprocessing and GloVe Embeddings
# ====================================

print("\nPreprocessing texts...")
clean_tokens_train = [preprocessing_text(t) for t in tqdm(df_train['text'], desc="Preprocessing train")]
clean_tokens_test = [preprocessing_text(t) for t in tqdm(df_test['text'], desc="Preprocessing test")]
clean_text_train = [' '.join(tokens) for tokens in clean_tokens_train]
clean_text_test = [' '.join(tokens) for tokens in clean_tokens_test]

# Load GloVe vectors
glove_path = r"C:\Users\eggle\Downloads\glove.twitter.27B\glove.twitter.27B.200d.txt"
EMBED_DIM = 200
glove_vectors = load_glove(glove_path, EMBED_DIM, use_cache=True)

# Create GloVe embeddings
print("Creating GloVe embeddings...")
glove_train = np.vstack([tweet_to_glove_vector(t, glove_vectors, EMBED_DIM) 
                         for t in tqdm(clean_text_train, desc="Embedding train")])
glove_test  = np.vstack([tweet_to_glove_vector(t, glove_vectors, EMBED_DIM) 
                         for t in tqdm(clean_text_test, desc="Embedding test")])

# Extract labels
y_train = df_train['label'].values
y_test = df_test['label'].values

print(f"GloVe train shape: {glove_train.shape}")
print(f"GloVe test shape: {glove_test.shape}")



Preprocessing texts...


Preprocessing train: 100%|██████████| 40927/40927 [00:10<00:00, 3794.76it/s]
Preprocessing test: 100%|██████████| 4548/4548 [00:01<00:00, 3582.94it/s]


Loading GloVe from cache: glove\glove.pkl
Loaded 1193514 word vectors from cache
Creating GloVe embeddings...


Embedding train: 100%|██████████| 40927/40927 [00:00<00:00, 72584.95it/s]
Embedding test: 100%|██████████| 4548/4548 [00:00<00:00, 76755.95it/s]

GloVe train shape: (40927, 200)
GloVe test shape: (4548, 200)





In [None]:
# ====================================
# Simple Dataset (Back to Original)
# ====================================

class GloVeImageDataset(Dataset):
    """Dataset for GloVe vectors reshaped as images."""
    
    def __init__(self, glove_vectors, labels, height=20, width=10):
        """
        Parameters
        ----------
        glove_vectors : np.array
            Array of shape (num_samples, 200).
        labels : np.array
            Array of shape (num_samples,).
        height : int
            Image height for reshaping.
        width : int
            Image width for reshaping.
        """
        # Reshape to (num_samples, 1, height, width) - add channel dimension
        self.glove_vectors = glove_vectors.reshape(-1, 1, height, width)
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # x is already (1, height, width) from reshape above
        x = torch.FloatTensor(self.glove_vectors[idx])
        y = torch.LongTensor([self.labels[idx]])[0]
        return x, y

# Prepare data
train_dataset = GloVeImageDataset(
    glove_train,
    y_train,
    height=20,
    width=10
)

test_dataset = GloVeImageDataset(
    glove_test,
    y_test,
    height=20,
    width=10
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=0)

print(f"\nDatasets created")
print(f"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")


Datasets created
Train batches: 320, Test batches: 36


In [None]:
# ====================================
# Initialize Model (Original ViT)
# ====================================

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
num_epochs = 20

model = VisionTransformerWithLearnableAux(
    glove_dim=200,
    embed_dim=200,
    d_ff=1000,
    num_heads=5,
    layers=12,
    num_classes=2,
    dropout=0.1,
    num_auxiliary_patches=3,
    mode="fine-tuning"
).to(device)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Training setup
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)


Using device: cuda
Model parameters: 6,785,202


In [None]:
# ====================================
# Training Loop (Original)
# ====================================
best_acc = 0.0

print("\n" + "="*60)
print("Training Vision Transformer on GloVe Embeddings")
print("="*60 + "\n")

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    
    for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        # Forward pass
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    scheduler.step()
    
    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(batch_y.numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {train_loss/len(train_loader):.4f} | Test Acc: {acc:.4f}")

print(f"\n{'='*60}")
print(f"Best Test Accuracy: {best_acc:.4f}")
print(f"{'='*60}")


Training Vision Transformer on GloVe Embeddings



Epoch 1/20: 100%|██████████| 320/320 [00:07<00:00, 40.73it/s]


Epoch 1/20 | Loss: 0.5850 | Test Acc: 0.8197


Epoch 2/20: 100%|██████████| 320/320 [00:06<00:00, 46.42it/s]


Epoch 2/20 | Loss: 0.4771 | Test Acc: 0.8157


Epoch 3/20: 100%|██████████| 320/320 [00:07<00:00, 45.64it/s]


Epoch 3/20 | Loss: 0.4616 | Test Acc: 0.8160


Epoch 4/20: 100%|██████████| 320/320 [00:07<00:00, 43.75it/s]


Epoch 4/20 | Loss: 0.4550 | Test Acc: 0.8314


Epoch 5/20: 100%|██████████| 320/320 [00:07<00:00, 45.53it/s]


Epoch 5/20 | Loss: 0.4526 | Test Acc: 0.8347


Epoch 6/20: 100%|██████████| 320/320 [00:07<00:00, 41.26it/s]


Epoch 6/20 | Loss: 0.4508 | Test Acc: 0.8320


Epoch 7/20: 100%|██████████| 320/320 [00:07<00:00, 43.55it/s]


Epoch 7/20 | Loss: 0.4501 | Test Acc: 0.8349


Epoch 8/20: 100%|██████████| 320/320 [00:06<00:00, 47.37it/s]


Epoch 8/20 | Loss: 0.4477 | Test Acc: 0.8353


Epoch 9/20: 100%|██████████| 320/320 [00:07<00:00, 42.88it/s]


Epoch 9/20 | Loss: 0.4484 | Test Acc: 0.8347


Epoch 10/20: 100%|██████████| 320/320 [00:07<00:00, 44.16it/s]


Epoch 10/20 | Loss: 0.4465 | Test Acc: 0.8366


Epoch 11/20: 100%|██████████| 320/320 [00:06<00:00, 49.46it/s]


Epoch 11/20 | Loss: 0.4450 | Test Acc: 0.8347


Epoch 12/20: 100%|██████████| 320/320 [00:07<00:00, 43.34it/s]


Epoch 12/20 | Loss: 0.4438 | Test Acc: 0.8380


Epoch 13/20: 100%|██████████| 320/320 [00:07<00:00, 42.55it/s]


Epoch 13/20 | Loss: 0.4429 | Test Acc: 0.8373


Epoch 14/20: 100%|██████████| 320/320 [00:07<00:00, 44.73it/s]


Epoch 14/20 | Loss: 0.4430 | Test Acc: 0.8364


Epoch 15/20: 100%|██████████| 320/320 [00:06<00:00, 45.93it/s]


Epoch 15/20 | Loss: 0.4420 | Test Acc: 0.8386


Epoch 16/20: 100%|██████████| 320/320 [00:06<00:00, 47.84it/s]


Epoch 16/20 | Loss: 0.4408 | Test Acc: 0.8391


Epoch 17/20: 100%|██████████| 320/320 [00:06<00:00, 48.33it/s]


Epoch 17/20 | Loss: 0.4401 | Test Acc: 0.8391


Epoch 18/20: 100%|██████████| 320/320 [00:06<00:00, 48.48it/s]


Epoch 18/20 | Loss: 0.4396 | Test Acc: 0.8384


Epoch 19/20: 100%|██████████| 320/320 [00:06<00:00, 49.80it/s]


Epoch 19/20 | Loss: 0.4400 | Test Acc: 0.8384


Epoch 20/20: 100%|██████████| 320/320 [00:06<00:00, 48.93it/s]


Epoch 20/20 | Loss: 0.4393 | Test Acc: 0.8384

Best Test Accuracy: 0.0000


In [None]:
print("\nFinal Classification Report:")
print(classification_report(all_labels, all_preds, target_names=['Negative', 'Positive']))


Final Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.79      0.80      1816
    Positive       0.86      0.87      0.87      2732

    accuracy                           0.84      4548
   macro avg       0.83      0.83      0.83      4548
weighted avg       0.84      0.84      0.84      4548

