In [1]:
import pickle
import numpy as np
from tqdm import tqdm 

In [2]:
# Load the data we saved (train/val/test timelines, labels and the label encoder).
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only open files that this project produced itself.
with open('../data/processed_data.pkl', 'rb') as f:
    data = pickle.load(f)

X_train = data['X_train']
X_val = data['X_val']
X_test = data['X_test']
y_train = data['y_train']
y_val = data['y_val']
y_test = data['y_test']
label_encoder = data['label_encoder']

# Fail fast if a split's timelines and labels are misaligned, instead of
# finding out after the (multi-minute) embedding pass further down.
assert len(X_train) == len(y_train), "train: timelines/labels length mismatch"
assert len(X_val) == len(y_val), "val: timelines/labels length mismatch"
assert len(X_test) == len(y_test), "test: timelines/labels length mismatch"

print("Loaded data:")
print(f"  Train: {len(X_train)} users")
print(f"  Val: {len(X_val)} users")
print(f"  Test: {len(X_test)} users")
print()
print(f"Labels: {label_encoder.classes_.tolist()}")

Loaded data:
  Train: 1790 users
  Val: 384 users
  Test: 384 users

Labels: ['ADHD', 'OCD', 'aspergers', 'depression', 'ptsd']


In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed every post; chosen as a lightweight but effective model.
model_name = 'all-MiniLM-L6-v2'
print(f"Loading SBERT model: {model_name}")
sbert_model = SentenceTransformer(model_name)
print("✓ Model loaded!")

# Encode one throwaway sentence and report the length of the returned vector.
test_embedding = sbert_model.encode("test sentence")
print(f"Embedding dimension: {len(test_embedding)}")



Loading SBERT model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Model loaded!
Embedding dimension: 384


In [4]:
def embed_timeline(timeline, model):
    """
    Convert a user's timeline (list of posts) into embeddings.

    Args:
        timeline: List of post texts. A single string is treated as a
            one-post timeline; an empty timeline is allowed.
        model: SBERT model exposing ``encode`` and
            ``get_sentence_embedding_dimension``.

    Returns:
        numpy array of shape (num_posts, embedding_dim). An empty timeline
        yields an array of shape (0, embedding_dim).
    """
    # A bare string would be encoded as ONE sentence and come back 1-D, so
    # callers reading .shape[0] would mistake embedding_dim for the post count.
    posts = [timeline] if isinstance(timeline, str) else list(timeline)

    if not posts:
        # Nothing to encode: return an empty 2-D array so callers can still
        # read .shape[1]. Fall back to width 0 if the model reports no dimension.
        dim = model.get_sentence_embedding_dimension() or 0
        # float32 is assumed to match what encode() returns — TODO confirm.
        return np.empty((0, dim), dtype=np.float32)

    embeddings = model.encode(posts, show_progress_bar=False)
    return np.asarray(embeddings)

# Smoke test: embed the first training user's timeline and report its shape.
sample_timeline = X_train[0]
sample_embedding = embed_timeline(sample_timeline, sbert_model)

print(
    f"Sample timeline: {len(sample_timeline)} posts",
    f"Embedding shape: {sample_embedding.shape}",
    f"  - {sample_embedding.shape[0]} posts",
    f"  - {sample_embedding.shape[1]} dimensions per post",
    sep="\n",
)

Sample timeline: 6 posts
Embedding shape: (6, 384)
  - 6 posts
  - 384 dimensions per post


In [5]:
def embed_all_timelines(timelines, model):
    """Run embed_timeline over every timeline, showing a tqdm progress bar.

    Returns a list with one embedding array per timeline, in input order.
    """
    progress = tqdm(timelines, desc="Embedding timelines")
    return [embed_timeline(posts, model) for posts in progress]

# Embed the three splits in turn; each banner is printed just before its split runs.
embedded_splits = []
for banner, split_timelines in (
    ("Embedding training timelines...", X_train),
    ("\nEmbedding validation timelines...", X_val),
    ("\nEmbedding test timelines...", X_test),
):
    print(banner)
    embedded_splits.append(embed_all_timelines(split_timelines, sbert_model))

X_train_emb, X_val_emb, X_test_emb = embedded_splits

print("\n✓ All embeddings created!")

Embedding training timelines...


Embedding timelines: 100%|██████████| 1790/1790 [02:24<00:00, 12.35it/s]



Embedding validation timelines...


Embedding timelines: 100%|██████████| 384/384 [00:34<00:00, 11.09it/s]



Embedding test timelines...


Embedding timelines: 100%|██████████| 384/384 [00:31<00:00, 12.15it/s]


✓ All embeddings created!





In [6]:
print("Embedding statistics:", end="\n\n")
print("Train set:")

# Number of posts (rows) in each training user's embedding array.
train_lengths = [len(emb) for emb in X_train_emb]
print(
    f"  Users: {len(X_train_emb)}",
    f"  Timeline lengths: min={min(train_lengths)}, max={max(train_lengths)}, mean={np.mean(train_lengths):.1f}",
    f"  Embedding dim: {X_train_emb[0].shape[1]}",
    sep="\n",
)

# Peek at one vector: first user, first post, first five components only.
print("", "Sample embedding (first user, first post):", sep="\n")
print(
    f"  Shape: {X_train_emb[0][0].shape}",
    f"  Values: {X_train_emb[0][0][:5]}...",
    sep="\n",
)

Embedding statistics:

Train set:
  Users: 1790
  Timeline lengths: min=5, max=50, mean=9.4
  Embedding dim: 384

Sample embedding (first user, first post):
  Shape: (384,)
  Values: [-0.00834035 -0.0725477   0.03805013  0.08869714  0.04437679]...


In [7]:
# Bundle the embeddings, labels and metadata, then pickle them to disk.
embeddings_data = {
    'X_train_emb': X_train_emb,
    'X_val_emb': X_val_emb,
    'X_test_emb': X_test_emb,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
    'label_encoder': label_encoder,
    'embedding_dim': X_train_emb[0].shape[1],
    'model_name': model_name
}

with open('../data/embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_data, f)


def _summary_line(key, value):
    """Format one line of the saved-file summary for a single entry."""
    # Lists are reported by length, arrays by shape, anything else verbatim.
    if isinstance(value, list):
        return f"  {key}: {len(value)} items"
    if isinstance(value, np.ndarray):
        return f"  {key}: array shape {value.shape}"
    return f"  {key}: {value}"


print("✓ Saved to data/embeddings.pkl", end="\n\n")
print("File contains:")
for key, value in embeddings_data.items():
    print(_summary_line(key, value))

✓ Saved to data/embeddings.pkl

File contains:
  X_train_emb: 1790 items
  X_val_emb: 384 items
  X_test_emb: 384 items
  y_train: array shape (1790,)
  y_val: array shape (384,)
  y_test: array shape (384,)
  label_encoder: LabelEncoder()
  embedding_dim: 384
  model_name: all-MiniLM-L6-v2
