In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

In [None]:
# Read the full post-level dataset from disk
df = pd.read_csv('../data/final_dataset.csv')

# Quick sanity summary: row count, distinct users, and the label set
n_posts, n_users = len(df), df['user_id'].nunique()
print(f"Loaded {n_posts} posts from {n_users} users")
print(f"Labels: {df['primary_label'].unique().tolist()}")

Loaded 78530 posts from 2558 users
Labels: ['OCD', 'ADHD', 'aspergers', 'depression', 'ptsd']


In [3]:
# Use one highly active user as an illustration
# (position 5 in the users ranked by post count, descending)
posts_per_user = df.groupby('user_id').size()
example_user = posts_per_user.sort_values(ascending=False).index[5]

# Keep only that user's posts, ordered by time
is_example_user = df['user_id'] == example_user
user_posts = df[is_example_user].sort_values('timestamp')

print(f"User: {example_user}")
print(f"Primary label: {user_posts['primary_label'].iloc[0]}")
print(f"Number of posts: {len(user_posts)}")
print()
print("Timeline (chronological order):")
print("-" * 50)

# Preview the five earliest posts: date prefix plus the first 80 characters
first_posts = user_posts.head(5)
for post_no, (stamp, body) in enumerate(
    zip(first_posts['timestamp'], first_posts['post_text']), start=1
):
    if pd.notna(body):
        text = body[:80]
    else:
        text = "[empty]"
    print(f"Post {post_no} [{stamp[:10]}]: {text}...")

User: turquoiseturtle01
Primary label: OCD
Number of posts: 81

Timeline (chronological order):
--------------------------------------------------
Post 1 [2021-07-07]: Through May and June, I started getting obsessive thoughts about things like VHS...
Post 2 [2021-07-09]: I have been obsessing over the condition of my DVD collection from May and June....
Post 3 [2021-07-10]: I’ve noticed when I get intrusive thoughts, I feel a cold pressure on the left s...
Post 4 [2021-07-12]: [removed]...
Post 5 [2021-07-15]: [removed]...


In [None]:
def build_user_timelines(df, min_posts=5, max_posts=50):
    """Build a chronological list of post texts for every user.

    Each post becomes ``"<title> <body>"`` (missing values are treated as
    empty strings) with surrounding whitespace stripped; posts that end up
    empty are dropped.

    Args:
        df: DataFrame with the columns 'user_id', 'timestamp', 'post_title',
            'post_text' and 'primary_label'.
        min_posts: Minimum number of posts a user must have. Checked on the
            raw row count and again on the number of non-empty posts.
        max_posts: Keep at most this many of the most recent posts per user.
            ``None`` disables the cap.

    Returns:
        A ``(timelines, labels)`` tuple of dicts keyed by user_id:
        ``timelines[user_id]`` is the list of post strings, oldest first, and
        ``labels[user_id]`` is the 'primary_label' of the user's earliest row.
    """
    timelines = {}
    labels = {}

    for user_id, user_df in df.groupby('user_id'):
        # Skip users with too few rows before doing any further work
        if len(user_df) < min_posts:
            continue

        # Stable sort: posts sharing a timestamp keep their original order
        user_df = user_df.sort_values('timestamp', kind='stable')

        # Combine title and body; iterate the columns directly (no iterrows)
        posts = []
        for title, body in zip(user_df['post_title'], user_df['post_text']):
            title = title if pd.notna(title) else ""
            body = body if pd.notna(body) else ""
            combined = f"{title} {body}".strip()
            if combined:  # Only add non-empty posts
                posts.append(combined)

        # Keep only the most recent history. An explicit start index is used
        # because posts[-0:] would return the whole list for max_posts == 0.
        if max_posts is not None and len(posts) > max_posts:
            posts = posts[len(posts) - max_posts:]

        # Only keep the user if enough non-empty posts remain
        if len(posts) >= min_posts:
            timelines[user_id] = posts
            labels[user_id] = user_df['primary_label'].iloc[0]

    return timelines, labels

print("Timeline builder function created!")

Timeline builder function created!


In [5]:
# Construct per-user timelines (at least 5 and at most 50 posts each)
timelines, labels = build_user_timelines(df, min_posts=5, max_posts=50)

print(f"Built timelines for {len(timelines)} users")
print()

# Summarise how long the resulting timelines are
timeline_lengths = list(map(len, timelines.values()))
print("Timeline length statistics:")
print(f"  Min: {min(timeline_lengths)}")
print(f"  Max: {max(timeline_lengths)}")
print(f"  Mean: {np.mean(timeline_lengths):.1f}")
print(f"  Median: {np.median(timeline_lengths):.1f}")
print()

# Users per label, most frequent label first
from collections import Counter
label_counts = Counter(labels.values())
print("Users per label:")
for label, count in label_counts.most_common():
    print(f"  {label}: {count}")

Built timelines for 2558 users

Timeline length statistics:
  Min: 5
  Max: 50
  Mean: 9.6
  Median: 7.0

Users per label:
  OCD: 960
  aspergers: 567
  ADHD: 456
  ptsd: 359
  depression: 216


In [6]:
# Inspect the first timeline in the dictionary
sample_user = list(timelines)[0]
sample_posts = timelines[sample_user]

print(f"Sample Timeline - User: {sample_user}")
print(f"Label: {labels[sample_user]}")
print(f"Number of posts: {len(sample_posts)}")
print("=" * 60)

# Show the first three posts, truncated to 300 characters
for post_no, post in enumerate(sample_posts[:3], start=1):
    print(f"\n[Post {post_no}]")
    if len(post) > 300:
        print(post[:300] + "...")
    else:
        print(post)
    

Sample Timeline - User: -PanFrog-
Label: OCD
Number of posts: 5

[Post 1]
Is this ZOCD?? Please someone answer quickly

Hello
I just had a thought pop up last night about what if I am attracted to animals?
It really made me disgusted and I have experienced OCD before, but I recently recovered. I dont feel much anxiety, but I feel uncomfortable and disgusted each time a im...

[Post 2]
Just a question I was wondering if OCD can make you think It's true and that I should just accept it? Even though I know I don't like those kinds of things, my mind makes it seem like I am the bad person that my mind is telling me. But I don't even agree with the thoughts and I know they're not true...

[Post 3]
It's me again: ZOCD I have already posted about this but I am unsure so..
I have had this theme only for two days now, but I do not feel much anxiety about the thoughts. I felt a lot just now, but i keep thinking It's real and that I'm just using OCD as an excuse. One of my themes from another tim

In [7]:
from sklearn.model_selection import train_test_split

# User IDs and their labels, aligned by position
user_ids = list(timelines)
user_labels = [labels[user_id] for user_id in user_ids]

print(f"Total users: {len(user_ids)}")

# Step 1: hold out 30% of users (later divided into val and test),
# stratified so each split keeps the label proportions
train_users, temp_users, train_labels, temp_labels = train_test_split(
    user_ids, user_labels, test_size=0.3, stratify=user_labels, random_state=42
)

# Step 2: halve the held-out users -> 15% validation, 15% test
val_users, test_users, val_labels, test_labels = train_test_split(
    temp_users, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

# Report the size and share of each split
for split_name, split_users in (
    ("Train", train_users),
    ("Val", val_users),
    ("Test", test_users),
):
    share = len(split_users) / len(user_ids) * 100
    print(f"{split_name} users: {len(split_users)} ({share:.1f}%)")

Total users: 2558
Train users: 1790 (70.0%)
Val users: 384 (15.0%)
Test users: 384 (15.0%)


In [8]:
from collections import Counter

print("Label distribution per split:")
print()

# Count users per label within each split, labels in alphabetical order
splits_by_name = {"Train": train_labels, "Val": val_labels, "Test": test_labels}
for split_name, split_labels in splits_by_name.items():
    counts = Counter(split_labels)
    print(f"{split_name}:")
    for label, n_users in sorted(counts.items()):
        print(f"  {label}: {n_users}")
    print()

Label distribution per split:

Train:
  ADHD: 319
  OCD: 672
  aspergers: 397
  depression: 151
  ptsd: 251

Val:
  ADHD: 68
  OCD: 144
  aspergers: 85
  depression: 33
  ptsd: 54

Test:
  ADHD: 69
  OCD: 144
  aspergers: 85
  depression: 32
  ptsd: 54



In [9]:
def create_dataset(user_list, timelines, labels):
    """Look up the timeline and label of every user in ``user_list``.

    Returns a pair ``(X, y)`` of lists in the same order as ``user_list``:
    ``X`` holds the timelines and ``y`` the labels.
    """
    X, y = [], []
    for uid in user_list:
        X.append(timelines[uid])
        y.append(labels[uid])
    return X, y

# Materialise timelines and labels for each split
X_train, y_train = create_dataset(train_users, timelines, labels)
X_val, y_val = create_dataset(val_users, timelines, labels)
X_test, y_test = create_dataset(test_users, timelines, labels)

print("Datasets created!")
print()
for split_name, split_X in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"{split_name}: {len(split_X)} timelines")
print()

# Peek at the first training example
print("Example - X_train[0]:")
first_timeline = X_train[0]
print(f"  Number of posts: {len(first_timeline)}")
print(f"  Label: {y_train[0]}")
print(f"  First post preview: {first_timeline[0][:100]}...")

Datasets created!

X_train: 1790 timelines
X_val: 384 timelines
X_test: 384 timelines

Example - X_train[0]:
  Number of posts: 6
  Label: ADHD
  First post preview: Can I talk about the unexpected and INSANELY WELCOME change Adderall has had on my sex drive?! I hop...


In [10]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only
label_encoder = LabelEncoder().fit(y_train)

# Convert each split's string labels to integer codes
y_train_encoded, y_val_encoded, y_test_encoded = (
    label_encoder.transform(split_y) for split_y in (y_train, y_val, y_test)
)

# Show which integer each label was assigned
print("Label encoding:")
for code, name in enumerate(label_encoder.classes_):
    print(f"  {name} -> {code}")

print()
print(f"y_train_encoded sample: {y_train_encoded[:10]}")

Label encoding:
  ADHD -> 0
  OCD -> 1
  aspergers -> 2
  depression -> 3
  ptsd -> 4

y_train_encoded sample: [0 4 0 3 4 1 1 0 2 1]


In [11]:
import pickle

# Bundle the splits, encoded labels, encoder and user lists into one dict
data_to_save = dict(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train_encoded,
    y_val=y_val_encoded,
    y_test=y_test_encoded,
    label_encoder=label_encoder,
    train_users=train_users,
    val_users=val_users,
    test_users=test_users,
)

# Write the bundle to disk as a single pickle file
with open('../data/processed_data.pkl', 'wb') as out_file:
    pickle.dump(data_to_save, out_file)

print("✓ Saved to data/processed_data.pkl")
print()
print("Contents saved:")
# One summary line per entry: length for lists, shape for arrays, type name otherwise
for key, value in data_to_save.items():
    if isinstance(value, list):
        summary = f"{len(value)} items"
    elif isinstance(value, np.ndarray):
        summary = f"array of shape {value.shape}"
    else:
        summary = type(value).__name__
    print(f"  {key}: {summary}")

✓ Saved to data/processed_data.pkl

Contents saved:
  X_train: 1790 items
  X_val: 384 items
  X_test: 384 items
  y_train: array of shape (1790,)
  y_val: array of shape (384,)
  y_test: array of shape (384,)
  label_encoder: LabelEncoder
  train_users: 1790 items
  val_users: 384 items
  test_users: 384 items
