In [149]:
# Set up autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Libraries

In [150]:
import pandas as pd
import os
import json
from word_normalization import *
from sklearn.feature_extraction.text import TfidfVectorizer
from process_input import *
from bert import BERT
import torch
import torch.nn as nn
from tqdm import tqdm
import gc

# Download TextBlob dependencies
import nltk
nltk.download('brown')
nltk.download('punkt')


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\eggle\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eggle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Define Globals

In [151]:
DATA_PATH = '../../data/'

# Tf-idf params
MAX_VOCAB = 10000
MIN_FREQ = 1

# Model params
MAX_LEN = 100
D_MODEL = 384     
D_FF = 1024         
HEADS = 4         
N = 4             
LR = 2e-4           
DROPOUT = 0.2     
LABEL_SMOOTH = 0 

# Training params
BATCH_SIZE = 128  
EPOCHS = 30       

### Load Datasets

In [152]:

df_train = pd.read_csv(os.path.join(DATA_PATH, 'twitter_sentiment_train.csv'))
df_test  = pd.read_csv(os.path.join(DATA_PATH, 'twitter_sentiment_test.csv'))

# Shuffle train set
RANDOM_STATE = 123
df_train = df_train.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

### Create Vocabulary
1. Word normalization
2. Tokenize sentence
3. Finally mark each token to an index

In [153]:
# # Initialize Vocabulary
# vocab_path = "./vocab.json"
# vocab = {}
# # Check if vocabulary already exists
# if os.path.exists(vocab_path):
#     # Read json and save
#     with open(vocab_path, "r") as f:
#         vocab = json.load(f)
# else:
#     # Put special tokens
#     vocab = {
#         '<pad>': 0,   # Padding token
#         '<cls>': 1,   # Classification token (start of sequence)
#         '<sep>': 2    # Separator token (end of sequence)
#     }

#     index = 3
#     for tweet in df_train['text']:
#         # Pre-process input
#         tokens = preprocessing_text(tweet)
#         # Update vocabulary
#         for token in tokens:
#             if token not in vocab.keys():
#                 vocab[token] = index
#                 index += 1
#     for tweet in df_test['text']:
#         # Pre-process input
#         tokens = preprocessing_text(tweet)
#         # Update vocabulary
#         for token in tokens:
#             if token not in vocab.keys():
#                 vocab[token] = index
#                 index += 1
                
#     with open(vocab_path, 'w') as f:
#         json.dump(vocab, f, indent=4, ensure_ascii=False)


### Create Vocabulary Using TF-IDF

In [154]:
# def build_tfidf_vocabulary(df_train, df_test, max_features=5000, min_df=2):
#     """
#     Build vocabulary using TF-IDF's most discriminative features.
#     Preserves emojis and adds sentiment words.
#     """
#     print("="*70)
#     print("BUILDING TF-IDF-BASED VOCABULARY")
#     print("="*70)
    
#     # Step 1: Preprocess all texts
#     print("\n1. Preprocessing texts...")
#     train_texts = []
#     test_texts = []
#     all_emojis = set()
    
#     for text in df_train['text']:
#         tokens = preprocessing_text(text)
#         # Separate emojis from words
#         emojis = [t for t in tokens if emoji.is_emoji(t)]
#         words = [t for t in tokens if not emoji.is_emoji(t)]
#         all_emojis.update(emojis)
#         train_texts.append(' '.join(words))  # TF-IDF on words only
    
#     for text in df_test['text']:
#         tokens = preprocessing_text(text)
#         emojis = [t for t in tokens if emoji.is_emoji(t)]
#         words = [t for t in tokens if not emoji.is_emoji(t)]
#         all_emojis.update(emojis)
#         test_texts.append(' '.join(words))
    
#     print(f"   Found {len(all_emojis)} unique emojis")
    
#     # Step 2: Fit TF-IDF on words (not emojis)
#     print(f"\n2. Fitting TF-IDF (max_features={max_features}, min_df={min_df})...")
    
#     tfidf = TfidfVectorizer(
#         max_features=max_features,
#         min_df=min_df,
#         max_df=0.90,
#         ngram_range=(1, 1),  # Unigrams only
#         token_pattern=r'\S+',
#         lowercase=False
#     )
    
#     tfidf.fit(train_texts + test_texts)
#     tfidf_words = set(tfidf.get_feature_names_out())
    
#     print(f"   TF-IDF selected {len(tfidf_words)} word features")
    
#     # Step 3: Add sentiment words (ensure important words are included)
#     print(f"\n3. Adding critical sentiment words...")
    
#     sentiment_words = {
#         # Positive
#         'good', 'great', 'love', 'best', 'amazing', 'awesome', 'excellent',
#         'perfect', 'wonderful', 'fantastic', 'happy', 'beautiful',
#         # Negative
#         'bad', 'hate', 'worst', 'terrible', 'awful', 'horrible', 'poor',
#         'disappointing', 'sad', 'angry', 'useless', 'waste',
#         # Negations (CRITICAL!)
#         'not', 'no', 'never', 'neither', 'nor', 'cannot', 'hardly',
#         # Intensifiers
#         'very', 'really', 'so', 'too', 'extremely'
#     }
    
#     added_sentiment = 0
#     for word in sentiment_words:
#         if word not in tfidf_words:
#             tfidf_words.add(word)
#             added_sentiment += 1
    
#     print(f"   Added {added_sentiment} critical sentiment words")
    
#     # Step 4: Build final vocabulary
#     print(f"\n4. Building final vocabulary...")
    
#     vocab = {
#         '<pad>': 0,
#         '<cls>': 1,
#         '<sep>': 2
#     }
    
#     index = 3
    
#     # Add TF-IDF words
#     for word in sorted(tfidf_words):
#         vocab[word] = index
#         index += 1
    
#     # Add ALL emojis from dataset
#     for emoji_char in sorted(all_emojis):
#         if emoji_char not in vocab:
#             vocab[emoji_char] = index
#             index += 1
    
#     # Add special tokens
#     special_tokens = ['url', 'email']
#     for token in special_tokens:
#         if token not in vocab:
#             vocab[token] = index
#             index += 1
    
#     # Calculate coverage
#     print(f"\n5. Calculating coverage...")
    
#     total_tokens = 0
#     covered_tokens = 0
    
#     for text in train_texts + test_texts:
#         tokens = text.split()
#         total_tokens += len(tokens)
#         covered_tokens += sum(1 for t in tokens if t in vocab)
    
#     return vocab, tfidf

# # Build vocabulary
# vocab_path = "./vocab_tfidf.json"

# if os.path.exists(vocab_path):
#     print("Loading existing TF-IDF vocabulary...")
#     with open(vocab_path, "r") as f:
#         vocab = json.load(f)
#     print(f"Loaded vocabulary: {len(vocab)} tokens")
# else:
#     vocab, tfidf_vectorizer = build_tfidf_vocabulary(
#         df_train,
#         df_test,
#         max_features=MAX_VOCAB,
#         min_df=MIN_FREQ
#     )
    
#     # Save vocabulary
#     with open(vocab_path, 'w') as f:
#         json.dump(vocab, f, indent=4, ensure_ascii=False)

### Create Vocabulary with Sentiment Scores

In [155]:
# Simply load the vocabulary
vocab_path = "./vocab_textblob.json"

if os.path.exists(vocab_path):
    with open(vocab_path, "r") as f:
        vocab = json.load(f)
    print(f"Loaded TextBlob vocabulary: {len(vocab)} tokens")
else:
    # Build it
    from textblob_vocabulary import build_textblob_vocabulary
    vocab, _ = build_textblob_vocabulary(df_train, df_test, max_vocab=MAX_VOCAB)
    
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f, indent=4, ensure_ascii=False)

Loaded TextBlob vocabulary: 6589 tokens


### Train Model
1. Pass each sentence to model:
    - Transform index to embeddings
    - Forward to layers 
    - Final MLP head for prediction

In [156]:
# Initialize Model

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

model = BERT(
        vocab_size=len(vocab.keys()),
        max_len=MAX_LEN,
        d_model=D_MODEL, 
        d_ff=D_FF, 
        num_heads=HEADS, 
        N=N, 
        lr=LR, 
        dropout=DROPOUT,
        num_classes=2
).to(device)

loss_fn = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTH)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999), 
    eps=1e-9
)

# Initialize scheduler
from torch.optim.lr_scheduler import CosineAnnealingLR

scheduler = CosineAnnealingLR(
    optimizer,
    T_max=EPOCHS,      
    eta_min=1e-9   
)

Device: cuda


In [None]:
for e in range(EPOCHS):
    # ==================== TRAINING ====================
    model.train()
    epoch_loss = 0
    num_batches = 0
    correct_train = 0
    total_train = 0
    
    num_train_batches = (len(df_train) + BATCH_SIZE - 1) // BATCH_SIZE
    pbar = tqdm(range(0, len(df_train), BATCH_SIZE), 
                desc=f"Epoch {e+1}/{EPOCHS} [Train]", 
                total=num_train_batches,
                ncols=100)
    
    for i in pbar:
        # Get batch
        batch_texts = df_train['text'].iloc[i:min(i+BATCH_SIZE, len(df_train))]
        batch_labels = df_train['label'].iloc[i:min(i+BATCH_SIZE, len(df_train))]
        
        # Process batch
        input_ids, attention_masks, labels = process_batch(
            batch_texts, batch_labels, vocab, MAX_LEN, device
        )
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids, attention_masks)
        loss = loss_fn(outputs, labels)
        
        # Calculate batch accuracy
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == labels).sum().item()
        total_train += labels.size(0)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Track metrics
        epoch_loss += loss.item()
        num_batches += 1
        
        # Free memory every batch
        del input_ids, attention_masks, labels, outputs, predicted
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        # Update progress bar
        current_lr = scheduler.get_last_lr()[0]
        pbar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{100.0 * correct_train / total_train:.2f}%',
            'lr': f'{current_lr:.2e}'
        })
    
    scheduler.step()
    # Calculate training metrics
    avg_train_loss = epoch_loss / num_batches
    train_accuracy = 100.0 * correct_train / total_train
    
    # ==================== TESTING ====================
    model.eval()
    test_loss = 0
    correct_test = 0
    total_test = 0
    num_test_batches = 0
    
    with torch.no_grad():
        num_test_batches_total = (len(df_test) + BATCH_SIZE - 1) // BATCH_SIZE
        test_pbar = tqdm(range(0, len(df_test), BATCH_SIZE),
                        desc=f"Epoch {e+1}/{EPOCHS} [Test]",
                        total=num_test_batches_total,
                        ncols=100)
        
        for i in test_pbar:
            batch_texts = df_test['text'].iloc[i:min(i+BATCH_SIZE, len(df_test))]
            batch_labels = df_test['label'].iloc[i:min(i+BATCH_SIZE, len(df_test))]
            
            input_ids, attention_masks, labels = process_batch(
                batch_texts, batch_labels, vocab, MAX_LEN, device
            )
            
            outputs = model(input_ids, attention_masks)
            loss = loss_fn(outputs, labels)
            
            _, predicted = torch.max(outputs, 1)
            correct_test += (predicted == labels).sum().item()
            total_test += labels.size(0)
            
            test_loss += loss.item()
            num_test_batches += 1
            
            # Free memory
            del input_ids, attention_masks, labels, outputs, predicted
            
            test_pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100.0 * correct_test / total_test:.2f}%'
            })
    
    # Clear cache after testing
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    avg_test_loss = test_loss / num_test_batches
    test_accuracy = 100.0 * correct_test / total_test
    
    # ==================== EPOCH SUMMARY ====================
    print(f"\n{'='*60}")
    print(f"Epoch {e+1}/{EPOCHS} Summary:")
    print(f"  Train Loss:     {avg_train_loss:.4f}")
    print(f"  Train Accuracy: {train_accuracy:.2f}% ({correct_train}/{total_train})")
    print(f"  Test Loss:      {avg_test_loss:.4f}")
    print(f"  Test Accuracy:  {test_accuracy:.2f}% ({correct_test}/{total_test})")
    
    # Print memory usage
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1e9
        reserved = torch.cuda.memory_reserved(0) / 1e9
        print(f"  GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")
    
    print(f"{'='*60}\n")

# ==================== CLEANUP ====================
print("Training complete! Cleaning up memory...")

# Clear GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Clear Python garbage
gc.collect()

print("✓ Memory cleaned!")

Epoch 1/30 [Train]: 100%|███| 111/111 [00:24<00:00,  4.56it/s, loss=0.7111, acc=52.11%, lr=2.00e-04]
Epoch 1/30 [Test]: 100%|███████████████████| 50/50 [00:12<00:00,  4.01it/s, loss=0.6354, acc=66.93%]



Epoch 1/30 Summary:
  Train Loss:     1.4376
  Train Accuracy: 52.11% (7392/14186)
  Test Loss:      0.6375
  Test Accuracy:  66.93% (4248/6347)
  GPU Memory: 0.34 GB allocated, 1.60 GB reserved



Epoch 2/30 [Train]: 100%|███| 111/111 [00:38<00:00,  2.87it/s, loss=0.5588, acc=63.41%, lr=1.99e-04]
Epoch 2/30 [Test]: 100%|███████████████████| 50/50 [00:14<00:00,  3.52it/s, loss=0.5523, acc=72.70%]



Epoch 2/30 Summary:
  Train Loss:     0.6442
  Train Accuracy: 63.41% (8995/14186)
  Test Loss:      0.5432
  Test Accuracy:  72.70% (4614/6347)
  GPU Memory: 0.34 GB allocated, 0.81 GB reserved



Epoch 3/30 [Train]: 100%|███| 111/111 [00:36<00:00,  3.05it/s, loss=0.4654, acc=70.72%, lr=1.98e-04]
Epoch 3/30 [Test]: 100%|███████████████████| 50/50 [00:05<00:00,  9.05it/s, loss=0.4996, acc=75.09%]



Epoch 3/30 Summary:
  Train Loss:     0.5643
  Train Accuracy: 70.72% (10033/14186)
  Test Loss:      0.5062
  Test Accuracy:  75.09% (4766/6347)
  GPU Memory: 0.34 GB allocated, 0.80 GB reserved



Epoch 4/30 [Train]: 100%|███| 111/111 [00:41<00:00,  2.70it/s, loss=0.4505, acc=74.47%, lr=1.95e-04]
Epoch 4/30 [Test]: 100%|███████████████████| 50/50 [00:11<00:00,  4.47it/s, loss=0.4886, acc=75.93%]



Epoch 4/30 Summary:
  Train Loss:     0.5164
  Train Accuracy: 74.47% (10564/14186)
  Test Loss:      0.4892
  Test Accuracy:  75.93% (4819/6347)
  GPU Memory: 0.34 GB allocated, 0.77 GB reserved



Epoch 5/30 [Train]:  78%|███▏| 87/111 [00:32<00:40,  1.68s/it, loss=0.4422, acc=76.50%, lr=1.91e-04]