In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load dataset (using HuggingFace 'emotion' as an example)
dataset = load_dataset('emotion')
train_texts = dataset['train']['text'][:1000]  # Take first 1000 samples
train_labels = dataset['train']['label'][:1000]  # Ensure labels match

# Initialize tokenizer and model (using DistilBERT for efficiency)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

# Generate embeddings (CLS token as sentence representation)
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Shape: (n_samples, 768)

train_embeddings = get_embeddings(train_texts)  # Now shape (1000, 768)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [7]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Split data (X = embeddings, y = labels)
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings,  # Shape (1000, 768)
    train_labels,      # Shape (1000,)
    test_size=0.2,
    random_state=42
)

# Initialize TPOT (optimizes pipelines using genetic algorithms)
tpot = TPOTClassifier(
    generations=5,       # Number of optimization iterations
    population_size=20,  # Models evaluated per generation
    verbosity=2,         # Shows progress
    random_state=42,
    n_jobs=-1            # Uses all CPU cores
)

# Train TPOT (finds best pipeline)
tpot.fit(X_train, y_train)

# Evaluate
print(f"Test Accuracy: {tpot.score(X_test, y_test):.2f}")

# Export the best pipeline to a Python file
tpot.export('best_pipeline.py')

Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.51375

Generation 2 - Current best internal CV score: 0.51375

Generation 3 - Current best internal CV score: 0.5225

Generation 4 - Current best internal CV score: 0.525

Generation 5 - Current best internal CV score: 0.525

Best pipeline: MLPClassifier(GradientBoostingClassifier(StandardScaler(input_matrix), learning_rate=0.01, max_depth=5, max_features=0.15000000000000002, min_samples_leaf=8, min_samples_split=5, n_estimators=100, subsample=0.9500000000000001), alpha=0.01, learning_rate_init=0.001)


AttributeError: 'list' object has no attribute 'astype'