In [4]:
# import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
import numpy as np

def _build_frame(rows, columns, key):
    """Return a DataFrame with a fixed column set and an int64 merge key.

    Passing ``columns`` explicitly keeps the schema stable even when ``rows``
    is empty, so later merges on ``key`` do not fail with ``KeyError``.
    """
    return pd.DataFrame(rows, columns=columns).astype({key: 'int64'})


def _generate_temporal_sessions(rng, days):
    """Simulate per-session start hour, duration and weekend flag."""
    columns = [
        'session_id', 'day', 'session_start_hour', 'session_duration_minutes',
        'time_since_last_session_hours', 'is_weekend',
    ]

    # User-specific patterns, drawn once per dataset. Gamma(2, 3) has mean 6
    # and no lower bound, so int() below may truncate a day to zero sessions.
    base_sessions_per_day = rng.gamma(2, 3)
    weekend_multiplier = rng.uniform(1.2, 2.0)  # More usage on weekends

    rows = []
    for day in range(days):
        # Days 5 and 6 of every 7-day cycle count as the weekend.
        is_weekend = day % 7 in (5, 6)
        day_factor = weekend_multiplier if is_weekend else 1
        sessions_today = int(base_sessions_per_day * day_factor)

        # Start hours are drawn uniformly over the day (not sorted).
        for session_time in rng.uniform(0, 24, sessions_today):
            # Raw session duration in minutes (log-normal distribution)
            duration = rng.lognormal(2.5, 1.2)

            # Sampled independently of the actual start hours
            time_since_last = rng.exponential(2)  # Hours

            # Peak usage hours (evening bias)
            if 18 <= session_time <= 23:
                duration *= rng.uniform(1.5, 2.5)

            # Clip last so the 1-120 minute bound also holds for evening
            # sessions (clipping before the multiplier allowed up to 300).
            duration = np.clip(duration, 1, 120)

            rows.append({
                'session_id': len(rows),
                'day': day,
                'session_start_hour': session_time,
                'session_duration_minutes': duration,
                'time_since_last_session_hours': time_since_last,
                'is_weekend': is_weekend,
            })

    return _build_frame(rows, columns, 'session_id')


def _generate_engagement(rng, total_sessions):
    """Simulate scroll/interaction depth indicators, one row per session."""
    columns = [
        'session_id', 'user_type', 'scroll_velocity_posts_per_min',
        'scroll_depth_percentage', 'interaction_rate_percentage',
        'time_per_post_seconds', 'return_to_feed_frequency',
    ]
    # Gamma (shape, scale) for scroll velocity; heavier users scroll faster.
    velocity_params = {'casual': (2, 2), 'moderate': (3, 3), 'heavy': (5, 4)}

    rows = []
    for session_id in range(total_sessions):
        # User type is drawn per session and influences scroll velocity
        user_type = rng.choice(['casual', 'moderate', 'heavy'], p=[0.4, 0.4, 0.2])

        # Scroll velocity (posts per minute)
        shape, scale = velocity_params[str(user_type)]
        scroll_velocity = rng.gamma(shape, scale)

        # Scroll depth (relative to feed length); most don't reach the bottom
        scroll_depth = rng.beta(2, 3)

        # Content interaction rate, scaled to 0-20%
        base_interaction_rate = rng.beta(2, 8) * 20

        # Time spent per post (seconds), bounded to 1-60
        time_per_post = np.clip(rng.lognormal(1.5, 0.8), 1, 60)

        # Return-to-feed frequency (times per session)
        return_frequency = rng.poisson(3)

        rows.append({
            'session_id': session_id,
            'user_type': user_type,
            'scroll_velocity_posts_per_min': scroll_velocity,
            'scroll_depth_percentage': scroll_depth * 100,
            'interaction_rate_percentage': base_interaction_rate,
            'time_per_post_seconds': time_per_post,
            'return_to_feed_frequency': return_frequency,
        })

    return _build_frame(rows, columns, 'session_id')


def _generate_content(rng, total_sessions):
    """Simulate content consumption patterns, one row per session."""
    columns = [
        'session_id', 'primary_content_type', 'preferred_categories',
        'negative_content_minutes', 'algorithm_interactions',
        'search_vs_feed_ratio',
    ]
    content_types = ['photo', 'video', 'story', 'reel', 'text']
    content_categories = ['lifestyle', 'news', 'entertainment', 'sports', 'politics', 'wellness']

    rows = []
    for session_id in range(total_sessions):
        # Preferences are drawn per session
        preferred_type = rng.choice(content_types)
        preferred_categories = rng.choice(content_categories, size=2, replace=False)

        # The preferred type gets double weight; weights are then normalised
        type_probs = [0.2] * len(content_types)
        type_probs[content_types.index(preferred_type)] = 0.4
        type_consumed = rng.choice(content_types, p=np.array(type_probs) / sum(type_probs))

        # Time on negative/controversial content
        negative_content_time = rng.exponential(5)  # Minutes

        # Recommendation algorithm engagement
        algo_clicks = rng.poisson(8)

        # Search vs feed browsing ratio (skewed towards feed browsing)
        search_ratio = rng.beta(1, 4)

        rows.append({
            'session_id': session_id,
            'primary_content_type': type_consumed,
            'preferred_categories': ','.join(preferred_categories),
            'negative_content_minutes': negative_content_time,
            'algorithm_interactions': algo_clicks,
            'search_vs_feed_ratio': search_ratio,
        })

    return _build_frame(rows, columns, 'session_id')


def _generate_triggers(rng, total_sessions):
    """Simulate what opened and what ended each session, one row per session."""
    columns = [
        'session_id', 'notification_response_minutes', 'opening_trigger',
        'background_usage_score', 'exit_behavior',
    ]
    trigger_types = ['notification', 'boredom', 'habit', 'social', 'news_check']

    rows = []
    for session_id in range(total_sessions):
        # Notification response delay in minutes, bounded to 0.1-120
        notification_delay = np.clip(rng.lognormal(1, 1.5), 0.1, 120)

        # Trigger for opening app
        opening_trigger = rng.choice(trigger_types, p=[0.3, 0.25, 0.2, 0.15, 0.1])

        # Background app usage (0-1 scale)
        background_usage = rng.beta(2, 5)

        # How session ended
        exit_type = rng.choice(['intentional', 'distracted', 'forced'], p=[0.4, 0.4, 0.2])

        rows.append({
            'session_id': session_id,
            'notification_response_minutes': notification_delay,
            'opening_trigger': opening_trigger,
            'background_usage_score': background_usage,
            'exit_behavior': exit_type,
        })

    return _build_frame(rows, columns, 'session_id')


def _generate_wellbeing(rng, days):
    """Simulate daily wellbeing indicators, one row per day."""
    columns = [
        'day', 'planned_usage_minutes', 'actual_usage_minutes',
        'usage_overage_minutes', 'late_night_usage_minutes',
        'compulsive_checks_count', 'sleep_hours', 'baseline_compulsiveness',
    ]

    # User baseline characteristics, drawn once per dataset
    baseline_compulsiveness = rng.beta(2, 5)  # Skewed towards low values
    baseline_sleep_quality = rng.normal(7, 1.5)  # Hours of sleep

    rows = []
    for day in range(days):
        # Usage exceeding planned time; overage is floored at zero
        planned_time = rng.uniform(30, 120)  # Minutes planned
        actual_time = planned_time * rng.lognormal(0, 0.5)
        time_overage = max(0, actual_time - planned_time)

        # Late night usage happens on roughly 30% of days
        late_night_minutes = rng.exponential(15) if rng.random_sample() < 0.3 else 0

        # Compulsive checking
        daily_compulsive_checks = rng.poisson(baseline_compulsiveness * 20)

        # Each late-night hour costs half an hour of sleep; bounded to 4-10 h
        sleep_hours = baseline_sleep_quality - (late_night_minutes / 60) * 0.5
        sleep_hours = np.clip(sleep_hours, 4, 10)

        rows.append({
            'day': day,
            'planned_usage_minutes': planned_time,
            'actual_usage_minutes': actual_time,
            'usage_overage_minutes': time_overage,
            'late_night_usage_minutes': late_night_minutes,
            'compulsive_checks_count': daily_compulsive_checks,
            'sleep_hours': sleep_hours,
            'baseline_compulsiveness': baseline_compulsiveness,
        })

    return _build_frame(rows, columns, 'day')


def generate_consolidated_social_media_dataset(days=30, seed=None):
    """Generate a consolidated synthetic social media usage dataset.

    Session-level tables (temporal, engagement, content, triggers) are joined
    on ``session_id``; the daily wellbeing table is joined on ``day``, so its
    values repeat on every session of that day.

    Args:
        days: Number of consecutive days to simulate. ``0`` yields an empty
            DataFrame that still carries every column.
        seed: Optional seed for reproducible output. When ``None`` the global
            ``np.random`` state is used, so callers that seed it themselves
            keep their reproducibility.

    Returns:
        pandas.DataFrame with one row per generated session.
    """
    # A private RandomState avoids touching global state when a seed is given;
    # the np.random module exposes the same sampling functions.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Step 1: Generate temporal usage patterns first to establish sessions
    print("Generating temporal usage patterns...")
    temporal_df = _generate_temporal_sessions(rng, days)
    total_sessions = len(temporal_df)
    print(f"Generated {total_sessions} sessions over {days} days")

    # Step 2: Generate engagement depth indicators for each session
    print("Generating engagement depth indicators...")
    engagement_df = _generate_engagement(rng, total_sessions)

    # Step 3: Generate content consumption patterns for each session
    print("Generating content consumption patterns...")
    content_df = _generate_content(rng, total_sessions)

    # Step 4: Generate behavioral triggers for each session
    print("Generating behavioral triggers...")
    trigger_df = _generate_triggers(rng, total_sessions)

    # Step 5: Generate wellbeing indicators (daily level)
    print("Generating wellbeing indicators...")
    wellbeing_df = _generate_wellbeing(rng, days)

    # Step 6: Merge everything onto the temporal table, which is the base
    print("Consolidating data...")
    consolidated_df = temporal_df
    for session_level_df in (engagement_df, content_df, trigger_df):
        consolidated_df = consolidated_df.merge(session_level_df, on='session_id', how='left')

    # Wellbeing data is daily level, so merge on 'day'
    consolidated_df = consolidated_df.merge(wellbeing_df, on='day', how='left')

    print(f"Consolidated dataset created with {len(consolidated_df)} sessions and {len(consolidated_df.columns)} columns")

    return consolidated_df

# Build the 30-day synthetic dataset in memory
consolidated_data = generate_consolidated_social_media_dataset(days=30)

# Persist it as CSV, leaving out the DataFrame index
consolidated_data.to_csv('synthetic_social_media_data_consolidated.csv', index=False)
print(f"Saved consolidated data with {len(consolidated_data)} sessions")

# Summarise the result; per-day session counts are computed once and reused
sessions_per_day = consolidated_data.groupby('day').size()
print(f"\nDataset shape: {consolidated_data.shape}")
print(f"Sessions per day range: {sessions_per_day.min()} - {sessions_per_day.max()}")
print(f"Average sessions per day: {sessions_per_day.mean():.2f}")
print(f"\nColumn names:")
print(consolidated_data.columns.tolist())

Generating temporal usage patterns...
Generated 98 sessions over 30 days
Generating engagement depth indicators...
Generating content consumption patterns...
Generating behavioral triggers...
Generating wellbeing indicators...
Consolidating data...
Consolidated dataset created with 98 sessions and 28 columns
Saved consolidated data with 98 sessions

Dataset shape: (98, 28)
Sessions per day range: 3 - 4
Average sessions per day: 3.27

Column names:
['session_id', 'day', 'session_start_hour', 'session_duration_minutes', 'time_since_last_session_hours', 'is_weekend', 'user_type', 'scroll_velocity_posts_per_min', 'scroll_depth_percentage', 'interaction_rate_percentage', 'time_per_post_seconds', 'return_to_feed_frequency', 'primary_content_type', 'preferred_categories', 'negative_content_minutes', 'algorithm_interactions', 'search_vs_feed_ratio', 'notification_response_minutes', 'opening_trigger', 'background_usage_score', 'exit_behavior', 'planned_usage_minutes', 'actual_usage_minutes', 'u