In [None]:
# ============================================================
# Hybrid Sentiment Analysis - Google Colab Training
# ============================================================

In [None]:
# Environment setup: show the attached GPU, then install the Python dependencies.
!nvidia-smi
# NOTE(review): this requests CUDA 11.8 wheels, but the recorded run later reports
# torch 2.9.0+cu126, so this line appears to have been a no-op there — confirm it is still needed.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# NOTE(review): none of these packages are version-pinned, so a later session may resolve
# different versions than the ones that produced the recorded outputs.
%pip install transformers datasets gensim scikit-learn xgboost nltk pandas numpy matplotlib seaborn tqdm pyyaml

Tue Dec  2 12:27:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!git clone https://github.com/steepcloud/hybrid-sentiment.git
%cd hybrid-sentiment

# private repo option
'''
from google.colab import files
import zipfile

print("Upload hybrid-sentiment.zip")
uploaded = files.upload()

zip_name = list(uploaded.keys())[0]
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall('/content/')

%cd /content/hybrid-sentiment
'''

Cloning into 'hybrid-sentiment'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 161 (delta 88), reused 111 (delta 42), pack-reused 0 (from 0)[K
Receiving objects: 100% (161/161), 93.70 KiB | 4.68 MiB/s, done.
Resolving deltas: 100% (88/88), done.
/content/hybrid-sentiment


'\nfrom google.colab import files\nimport zipfile\n\nprint("Upload hybrid-sentiment.zip")\nuploaded = files.upload()\n\nzip_name = list(uploaded.keys())[0]\nwith zipfile.ZipFile(zip_name, \'r\') as zip_ref:\n    zip_ref.extractall(\'/content/\')\n\n%cd /content/hybrid-sentiment\n'

In [None]:
# verify
# Lists src/ relative to the current directory, so it only succeeds when the
# working directory is the repository root.
!ls -la src/

total 56
drwxr-xr-x 8 root root  4096 Dec  2 12:27 .
drwxr-xr-x 6 root root  4096 Dec  2 12:27 ..
drwxr-xr-x 2 root root  4096 Dec  2 12:27 data
drwxr-xr-x 2 root root  4096 Dec  2 12:27 evaluation
-rw-r--r-- 1 root root     0 Dec  2 12:27 __init__.py
-rw-r--r-- 1 root root 22185 Dec  2 12:27 main.py
drwxr-xr-x 4 root root  4096 Dec  2 12:27 models
drwxr-xr-x 2 root root  4096 Dec  2 12:27 training
drwxr-xr-x 2 root root  4096 Dec  2 12:27 utils
drwxr-xr-x 2 root root  4096 Dec  2 12:27 visualization


In [None]:
# Fetch the NLTK resources listed below.
import nltk

NLTK_PACKAGES = ['punkt', 'punkt_tab', 'stopwords']

# nltk.download() signals failure through its boolean return value instead of
# raising, so collect failures rather than announcing success unconditionally.
failed_packages = [pkg for pkg in NLTK_PACKAGES if not nltk.download(pkg, quiet=True)]

if failed_packages:
    print(f"WARNING: failed to download NLTK packages: {failed_packages}")
else:
    print("NLTK data downloaded!")

NLTK data downloaded!


In [None]:
# Report the PyTorch build and, when a CUDA device is visible, its name and total memory.
import torch

cuda_ok = torch.cuda.is_available()
# Device 0 is the only device queried; fall back to the literal 'CPU' label without CUDA.
device_label = torch.cuda.get_device_name(0) if cuda_ok else 'CPU'

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Device: {device_label}")
if cuda_ok:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {device_label}")
    print(f"Memory: {total_gb:.1f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA version: 12.6
Device: Tesla T4
GPU: Tesla T4
Memory: 15.8 GB


In [None]:
# Load Dataset (IMDB, Twitter, or Custom)
# ============================================================
# Produces train_df / val_df / test_df for the selected DATASET and prints
# statistics for each split.

import os
import shutil
import sys

# Make the `src` package importable from the kernel. The relative paths used in
# this cell (configs/config.yaml, data/raw) already assume the working directory
# is the repository root, so register that directory instead of a hard-coded
# Colab path, and skip the append when the cell is re-run.
REPO_ROOT = os.getcwd()
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.data.data_loader import DatasetLoader

# initialize
loader = DatasetLoader(config_path='configs/config.yaml')

# dataset type: 'imdb', 'twitter' or 'custom'
DATASET = 'imdb'

print(f"Loading {DATASET.upper()} dataset...")

if DATASET == 'imdb':
    # load IMDB movie reviews (50k reviews)
    train_df, val_df, test_df = loader.load_imdb(use_cache=False)

elif DATASET == 'twitter':
    # load twitter Sentiment140 (1.6M tweets)
    train_df, val_df, test_df = loader.load_twitter(use_cache=False)

elif DATASET == 'custom':
    # load custom dataset from CSV
    try:
        from google.colab import files
        IN_COLAB = True
    except ImportError:
        IN_COLAB = False

    print("\nUpload your CSV file")
    print("Required format:")
    print("  - Column 1: 'text' (review/tweet/comment)")
    print("  - Column 2: 'label' (0=negative, 1=positive)")
    print("\nExample CSV format:")
    print('  text,label')
    print('  "Great product!",1')
    print('  "Terrible service.",0')
    print()

    if IN_COLAB:
        uploaded = files.upload()
        # A cancelled upload yields an empty mapping; fail with a clear message
        # instead of an IndexError from indexing an empty list.
        if not uploaded:
            raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")
        csv_filename = next(iter(uploaded))
    else:
        # for local Jupyter, prompt for file path
        csv_filename = input("Enter the path to your CSV file: ").strip()

    if not os.path.isfile(csv_filename):
        raise FileNotFoundError(f"CSV file not found: {csv_filename}")

    os.makedirs('data/raw', exist_ok=True)
    # Use only the file name for the destination: a local path such as
    # /home/me/reviews.csv must not be spliced into data/raw/ verbatim.
    custom_path = f'data/raw/{os.path.basename(csv_filename)}'
    # Copy rather than rename: this leaves the user's original file in place,
    # works across filesystems, and lets the cell be re-run with the same input.
    # Skip the copy when the chosen file already is the destination.
    if os.path.abspath(csv_filename) != os.path.abspath(custom_path):
        shutil.copy2(csv_filename, custom_path)

    print(f"\nLoading custom dataset from {custom_path}...")
    # NOTE(review): unlike the imdb/twitter branches this passes use_cache=True;
    # if the loader's cache is not keyed on the file, a new CSV could yield
    # stale splits — confirm against DatasetLoader.load_custom.
    train_df, val_df, test_df = loader.load_custom(
        train_path=custom_path,
        use_cache=True
    )

    print("  Custom dataset loaded and split:")
    print(f"  Train: {len(train_df)} samples")
    print(f"  Val: {len(val_df)} samples")
    print(f"  Test: {len(test_df)} samples")

else:
    raise ValueError(f"Unknown dataset: {DATASET}. Choose 'imdb', 'twitter', or 'custom'")

print("\n" + "="*60)
loader.get_data_statistics(train_df, "Training Set")
loader.get_data_statistics(val_df, "Validation Set")
loader.get_data_statistics(test_df, "Test Set")
print("="*60)

print(f"\n{DATASET.upper()} dataset ready for training!")

Loading IMDB dataset...
Loading IMDb dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train size: 22500, Val size: 2500, Test size: 25000


Training Set Statistics
Total samples: 22500

Class Distribution:
  Negative (label=0): 11250 (50.00%)
  Positive (label=1): 11250 (50.00%)

Text Length Statistics (in words):
  Mean: 233.86
  Median: 174.00
  Std Dev: 173.82
  Min: 10
  Max: 2470
  25th percentile: 127.00
  75th percentile: 284.00
  95th percentile: 596.05
  99th percentile: 917.00

Validation Set Statistics
Total samples: 2500

Class Distribution:
  Negative (label=0): 1250 (50.00%)
  Positive (label=1): 1250 (50.00%)

Text Length Statistics (in words):
  Mean: 233.17
  Median: 172.00
  Std Dev: 173.01
  Min: 18
  Max: 1398
  25th percentile: 127.00
  75th percentile: 284.25
  95th percentile: 610.05
  99th percentile: 900.02

Test Set Statistics
Total samples: 25000

Class Distribution:
  Negative (label=0): 12500 (50.00%)
  Positive (label=1): 12500 (50.00%)

Text Length Statistics (in words):
  Mean: 228.53
  Median: 172.00
  Std Dev: 168.88
  Min: 4
  Max: 227

In [None]:
# Value passed to the training scripts' --dataset flag.
# Only 'imdb' and 'twitter' are forwarded unchanged; any other selection
# (e.g. 'custom') falls back to 'imdb'. Surface that fallback instead of
# applying it silently, since the scripts then run with a different dataset
# name than the one chosen in DATASET.
SCRIPT_DATASETS = ('imdb', 'twitter')
dataset_arg = DATASET if DATASET in SCRIPT_DATASETS else 'imdb'
if dataset_arg != DATASET:
    print(f"WARNING: DATASET='{DATASET}' is not passed to the training scripts; "
          f"they will be invoked with --dataset {dataset_arg}")

In [None]:
# train Word2Vec embeddings

!python src/training/train_embeddings.py \
    --dataset {dataset_arg} \
    --embedding word2vec

print(f"Word2Vec embeddings trained")
if DATASET == 'custom':
    print(f"  (using {dataset_arg.upper()} configuration)")
else:
    print(f"  Dataset: {DATASET.upper()}")

Configuration loaded from configs/config.yaml
Random seed set to 42
Initialized EmbeddingTrainer
  Embedding type: word2vec

Training Embeddings on IMDB

Loading dataset...
Loading IMDb dataset...
Saved dataset to cache: data/processed/imdb_cache.pkl
Train size: 22500, Val size: 2500, Test size: 25000
Total texts for training: 50000

Preparing corpus...
  Total texts: 50000
  Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
Vocabulary saved to results/embeddings/word2vec/vocab.pkl
  Tokenizing texts...
    Processed 5000/50000 texts
    Processed 10000/50000 texts
    Processed 15000/50000 texts
    Processed 20000/50000 texts
    Processed 25000/50000 texts
    Processed 30000/50000 texts
    Processed 35000/50000 texts
    Processed 40000/50000 texts
    Processed 45000/50000 texts
    Processed 50000/50000 texts
✓ Corpus prepared: 50000 documents
  Vocabulary size: 20000

Training W

In [None]:
# train LSTM End-to-End

!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model lstm \
    --epochs 10 \
    --batch_size 64 \
    --lr 0.001

print(f"LSTM model trained on {DATASET.upper()} dataset")

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: lstm
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating LSTM model...
  ✓ Applied Xavier initialization to classifier
  Total parameters: 6,868,738

Preparing data loaders...
  Train batches: 352
  Val batches: 40
  Test batches: 391

Training LSTM Model
  Epochs: 10
  Learning rate: 0.001
  Batch size: 64

Epoch 1/10
Epoch 1: 100% 352/352 [00:17<00:00, 19.74it/s, loss=0.5839, acc=64.77%]

Training - Loss: 0.6193, Accuracy: 0.6477

Validation Results:

Validation
Accuracy:  0.7476
Precision: 0.7599
Recall:    0.7240
F1 Score:  0.7415
ROC-AUC:   0.8228
Checkpoint saved to results/models/deep_learning/imdb/lstm/lstm_best.pt
✓ Best mo

In [None]:
# train GRU End-to-End
# Same trainer script as the LSTM cell, with --model gru and a learning rate of
# 0.0001 (the LSTM cell uses 0.001).
# NOTE(review): in the recorded output the first epoch sits at roughly 50%
# training and validation accuracy — confirm this learning rate is intended.

!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model gru \
    --epochs 10 \
    --batch_size 64 \
    --lr 0.0001

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: gru
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating GRU model...
  ✓ Applied Xavier initialization to classifier
  Total parameters: 6,659,842

Preparing data loaders...
  Train batches: 352
  Val batches: 40
  Test batches: 391

Training GRU Model
  Epochs: 10
  Learning rate: 0.0001
  Batch size: 64

Epoch 1/10
Epoch 1: 100% 352/352 [00:16<00:00, 21.45it/s, loss=0.9031, acc=50.36%]

Training - Loss: 0.9883, Accuracy: 0.5036

Validation Results:

Validation
Accuracy:  0.5020
Precision: 0.5010
Recall:    0.9552
F1 Score:  0.6573
ROC-AUC:   0.5378
Checkpoint saved to results/models/deep_learning/imdb/gru/gru_best.pt
✓ Best model 

In [None]:
# train Transformer
# Same trainer script as the LSTM/GRU cells, with --model transformer,
# 5 epochs, batch size 32 and learning rate 0.0001.

!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model transformer \
    --epochs 5 \
    --batch_size 32 \
    --lr 0.0001

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: transformer
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating TRANSFORMER model...
  ✓ Applied Xavier initialization to classifier
  Total parameters: 8,088,806

Preparing data loaders...
  Train batches: 704
  Val batches: 79
  Test batches: 782

Training TRANSFORMER Model
  Epochs: 5
  Learning rate: 0.0001
  Batch size: 32

Epoch 1/5
Epoch 1: 100% 704/704 [00:51<00:00, 13.73it/s, loss=0.7524, acc=70.78%]

Training - Loss: 0.5465, Accuracy: 0.7078
  output = torch._nested_tensor_from_mask(

Validation Results:

Validation
Accuracy:  0.7868
Precision: 0.7884
Recall:    0.7840
F1 Score:  0.7862
ROC-AUC:   0.8633
Checkpoint saved t

In [None]:
# train BERT End-to-End
# Runs the end-to-end trainer with --model bert for 3 epochs
# (batch size 16, learning rate 2e-5).
!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model bert \
    --epochs 3 \
    --batch_size 16 \
    --lr 2e-5

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"BERT model trained on {dataset_arg.upper()} dataset")

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: bert
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating BERT model...
config.json: 100% 570/570 [00:00<00:00, 3.38MB/s]
2025-12-01 14:17:52.415341: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764598672.437897    6792 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764598672.444652    6792 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin 

In [None]:
# train RoBERTa End-to-End
# Runs the end-to-end trainer with --model roberta for 3 epochs
# (batch size 16, learning rate 2e-5).
!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model roberta \
    --epochs 3 \
    --batch_size 16 \
    --lr 2e-5

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"RoBERTa model trained on {dataset_arg.upper()} dataset")

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: roberta
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loading IMDb dataset...
Saved dataset to cache: data/processed/imdb_cache.pkl
Train size: 22500, Val size: 2500, Test size: 25000

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating ROBERTA model...
config.json: 100% 481/481 [00:00<00:00, 3.44MB/s]
2025-12-02 12:29:10.095889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764678550.117952    1158 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764678550.124693    1158 cuda_blas.cc:1407

In [None]:
# train DistilBERT End-to-End
# Runs the end-to-end trainer with --model distilbert for 3 epochs
# (batch size 16, learning rate 2e-5).
!python src/training/train_end_to_end_dl.py \
    --dataset {dataset_arg} \
    --model distilbert \
    --epochs 3 \
    --batch_size 16 \
    --lr 2e-5

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"DistilBERT model trained on {dataset_arg.upper()} dataset")

Configuration loaded from configs/config.yaml
Random seed set to 42
Using GPU: Tesla T4
Initialized EndToEndDLTrainer
  Model type: distilbert
  Device: cuda

Training on IMDB Dataset

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Creating DISTILBERT model...
config.json: 100% 483/483 [00:00<00:00, 2.50MB/s]
2025-12-02 14:33:11.823080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764685991.846203   32020 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764685991.853232   32020 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory

In [None]:
# train hybrid models (LSTM + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization", although the LSTM training cell's output shows a
# checkpoint saved to results/models/deep_learning/imdb/lstm/lstm_best.pt —
# check how train_classical_ml.py locates the encoder checkpoint.

!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder lstm

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: lstm
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading lstm encoder...
  No pre-trained encoder found, using random initialization
✓ Random encoder initialized

Extracting embeddings for 22500 samples...
  Processed 320/22500 samples
  Processed 640/22500 samples
  Processed 960/22500 samples
  Processed 1280/22500 samples
  Processed 1600/22500 samples
  Processed 1920/22500 samples
  Processed 2240/22500 samples
  Processed 2560/22500 samples
  Processed 2880/22500 samples
  Processed 3200/22500 samples
  Processed 3520/22500 samples
  Processed 3840/22500 samples
  Processed 4160/22500 samples
  Processed 4480/22500 samples
  Processed 4800/22500 samples
  Processed 5120/22500 s

In [None]:
# train hybrid models (GRU + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization", although the GRU training cell's output shows a
# checkpoint saved to results/models/deep_learning/imdb/gru/gru_best.pt —
# check how train_classical_ml.py locates the encoder checkpoint.

!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder gru

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: gru
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading gru encoder...
  No pre-trained encoder found, using random initialization
✓ Random encoder initialized

Extracting embeddings for 22500 samples...
  Processed 320/22500 samples
  Processed 640/22500 samples
  Processed 960/22500 samples
  Processed 1280/22500 samples
  Processed 1600/22500 samples
  Processed 1920/22500 samples
  Processed 2240/22500 samples
  Processed 2560/22500 samples
  Processed 2880/22500 samples
  Processed 3200/22500 samples
  Processed 3520/22500 samples
  Processed 3840/22500 samples
  Processed 4160/22500 samples
  Processed 4480/22500 samples
  Processed 4800/22500 samples
  Processed 5120/22500 sam

In [None]:
# train hybrid models (Transformer + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization" — confirm whether the encoder trained by the
# Transformer cell was expected to be picked up here.

!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder transformer

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: transformer
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading transformer encoder...
  No pre-trained encoder found, using random initialization
✓ Random encoder initialized

Extracting embeddings for 22500 samples...
  output = torch._nested_tensor_from_mask(
  Processed 320/22500 samples
  Processed 640/22500 samples
  Processed 960/22500 samples
  Processed 1280/22500 samples
  Processed 1600/22500 samples
  Processed 1920/22500 samples
  Processed 2240/22500 samples
  Processed 2560/22500 samples
  Processed 2880/22500 samples
  Processed 3200/22500 samples
  Processed 3520/22500 samples
  Processed 3840/22500 samples
  Processed 4160/22500 samples
  Processed 4480/22500 sample

In [None]:
# train hybrid models (BERT + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization" — confirm whether the encoder trained by the BERT
# cell was expected to be picked up here.
!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder bert

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"Hybrid BERT models trained on {dataset_arg.upper()} dataset")

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: bert
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading bert encoder...
  No pre-trained encoder found, using random initialization
2025-12-01 16:44:17.251433: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764607457.273862   43496 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764607457.280892   43496 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registe

In [None]:
# train hybrid models (RoBERTa + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization", although the results archive listing includes
# results/models/deep_learning/imdb/roberta/roberta_best.pt — check how
# train_classical_ml.py locates the encoder checkpoint.
!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder roberta

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"Hybrid RoBERTa models trained on {dataset_arg.upper()} dataset")

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: roberta
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading roberta encoder...
  No pre-trained encoder found, using random initialization
2025-12-02 15:37:51.422653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764689871.446467   48169 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764689871.453424   48169 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r

In [None]:
# train hybrid models (DistilBERT + Classical ML)
# NOTE(review): the recorded output reports "No pre-trained encoder found, using
# random initialization", although the results archive listing includes
# results/models/deep_learning/imdb/distilbert/distilbert_best.pt — check how
# train_classical_ml.py locates the encoder checkpoint.
!python src/training/train_classical_ml.py \
    --dataset {dataset_arg} \
    --encoder distilbert

# `!` does not raise on a non-zero exit status, so this prints even if the script failed.
print(f"Hybrid DistilBERT models trained on {dataset_arg.upper()} dataset")

Random seed set to 42
Initialized ClassicalMLTrainer
  Encoder type: distilbert
  Device: cuda

Training Classical ML Models on IMDB

Loading dataset...
Loaded dataset from cache: data/processed/imdb_cache.pkl

Building vocabulary...
Building vocabulary...
Vocabulary size: 20000
Most common words: ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']

Loading distilbert encoder...
  No pre-trained encoder found, using random initialization
2025-12-02 16:03:52.229106: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764691432.267995   54665 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764691432.279540   54665 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already 

In [None]:
# compare ALL models
# Runs the comparison script with no arguments.
# NOTE(review): the recorded output shows this run ending in a KeyboardInterrupt,
# so no comparison results were captured — re-run to completion before sharing.

!python src/evaluation/compare_models.py

object address  : 0x7984059d7580
object refcount : 3
object type     : 0xa2a4e0
object type name: KeyboardInterrupt
object repr     : KeyboardInterrupt()
lost sys.stderr
^C


In [None]:
# download trained models

!zip -r results.zip results/
from google.colab import files
files.download('results.zip')

print("Training complete! Models downloaded.")

  adding: results/ (stored 0%)
  adding: results/models/ (stored 0%)
  adding: results/models/deep_learning/ (stored 0%)
  adding: results/models/deep_learning/imdb/ (stored 0%)
  adding: results/models/deep_learning/imdb/distilbert/ (stored 0%)
  adding: results/models/deep_learning/imdb/distilbert/distilbert_best.pt (deflated 12%)
  adding: results/models/deep_learning/imdb/roberta/ (stored 0%)
  adding: results/models/deep_learning/imdb/roberta/roberta_best.pt (deflated 12%)
  adding: results/models/classical_ml/ (stored 0%)
  adding: results/models/classical_ml/imdb/ (stored 0%)
  adding: results/models/classical_ml/imdb/distilbert/ (stored 0%)
  adding: results/models/classical_ml/imdb/distilbert/logistic_regression.pkl (deflated 23%)
  adding: results/models/classical_ml/imdb/distilbert/random_forest.pkl (deflated 72%)
  adding: results/models/classical_ml/imdb/distilbert/xgboost.pkl (deflated 81%)
  adding: results/models/classical_ml/imdb/roberta/ (stored 0%)
  adding: results/

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training complete! Models downloaded.
