# Behavioral Data Analysis with AWS

This notebook demonstrates cloud-based behavioral data analysis using AWS services:
- S3 for data storage
- Lambda for serverless processing
- DynamoDB for results storage

**Duration:** 30-45 minutes  
**Prerequisites:** AWS setup complete (see setup_guide.md)

## Setup

In [None]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import json
from datetime import datetime

# Set style
# Use seaborn's 'whitegrid' theme and make 12x6 inches the default size for
# any figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Printed last, so seeing it means every import above succeeded.
print("Libraries loaded successfully!")

In [None]:
# Configuration
# Names of the AWS resources used by the rest of this notebook.
# NOTE(review): these presumably have to match what was created while
# following setup_guide.md -- confirm against that guide.
S3_BUCKET = 'behavioral-data-XXXXX'  # Replace with your bucket name
LAMBDA_FUNCTION = 'analyze-behavioral-data'  # Lambda invoked once per uploaded file
DYNAMODB_TABLE = 'BehavioralAnalysis'  # Table scanned for the analysis results
AWS_REGION = 'us-east-1'  # Region shared by all three clients below

# Initialize AWS clients
# No credentials are passed explicitly here; boto3 is left to resolve them.
s3_client = boto3.client('s3', region_name=AWS_REGION)
lambda_client = boto3.client('lambda', region_name=AWS_REGION)
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)

print(f"AWS clients initialized for region: {AWS_REGION}")

## 1. Generate Sample Data

Generate sample behavioral data for different experimental paradigms.

In [None]:
def generate_stroop_data(participant_id, n_trials=100):
    """Generate sample Stroop task data.

    Each trial is congruent or incongruent with equal probability.
    Congruent trials draw RT from Normal(500, 80) and are correct with
    probability 0.95; incongruent trials draw RT from Normal(650, 100) and
    are correct with probability 0.85. RTs are floored at 200 ms and rounded
    to two decimals. The recorded response ('left'/'right') is drawn at
    random and is independent of accuracy.

    Args:
        participant_id: Identifier stored in every row; also determines the
            random seed, so the same id always yields the same data.
        n_trials: Number of trials (rows) to generate.

    Returns:
        A DataFrame with columns participant_id, trial, task_type, stimulus,
        response, rt and accuracy. With ``n_trials=0`` the frame is empty
        but still has these columns.
    """
    import zlib

    # hash() of a str is salted per interpreter process, so it cannot give a
    # reproducible seed; CRC32 is stable and already in the valid seed range.
    seed = zlib.crc32(str(participant_id).encode('utf-8'))
    # A private RandomState keeps the global np.random stream untouched.
    rng = np.random.RandomState(seed)

    columns = ['participant_id', 'trial', 'task_type', 'stimulus',
               'response', 'rt', 'accuracy']

    data = []
    for trial in range(1, n_trials + 1):
        condition = 'congruent' if rng.random_sample() > 0.5 else 'incongruent'

        if condition == 'congruent':
            rt = rng.normal(500, 80)
            accuracy = 1 if rng.random_sample() > 0.05 else 0
        else:
            rt = rng.normal(650, 100)
            accuracy = 1 if rng.random_sample() > 0.15 else 0

        # Floor implausibly fast (or negative) draws at 200 ms.
        rt = max(200, rt)

        data.append({
            'participant_id': participant_id,
            'trial': trial,
            'task_type': 'stroop',
            'stimulus': condition,
            'response': rng.choice(['left', 'right']),
            'rt': round(rt, 2),
            'accuracy': accuracy
        })

    return pd.DataFrame(data, columns=columns)


def generate_decision_data(participant_id, n_trials=100):
    """Generate sample decision making data.

    Each trial gets a difficulty drawn uniformly from easy/medium/hard.
    RT is drawn from Normal(400, 60), Normal(550, 80) or Normal(700, 120)
    respectively, and the trial is correct with probability 0.95, 0.80 or
    0.65. RTs are floored at 200 ms and rounded to two decimals. The
    recorded response ('option_a'/'option_b') is drawn at random and is
    independent of accuracy.

    Args:
        participant_id: Identifier stored in every row; also determines the
            random seed, so the same id always yields the same data.
        n_trials: Number of trials (rows) to generate.

    Returns:
        A DataFrame with columns participant_id, trial, task_type, stimulus,
        response, rt and accuracy. With ``n_trials=0`` the frame is empty
        but still has these columns.
    """
    import zlib

    # hash() of a str is salted per interpreter process, so it cannot give a
    # reproducible seed. The +1 offset is kept but wrapped so the seed can
    # never reach 2**32, which NumPy rejects.
    seed = (zlib.crc32(str(participant_id).encode('utf-8')) + 1) % 2**32
    # A private RandomState keeps the global np.random stream untouched.
    rng = np.random.RandomState(seed)

    columns = ['participant_id', 'trial', 'task_type', 'stimulus',
               'response', 'rt', 'accuracy']
    difficulties = ['easy', 'medium', 'hard']
    # difficulty -> (RT mean, RT SD, error probability)
    params = {
        'easy': (400, 60, 0.05),
        'medium': (550, 80, 0.20),
        'hard': (700, 120, 0.35),
    }

    data = []
    for trial in range(1, n_trials + 1):
        difficulty = rng.choice(difficulties)
        rt_mean, rt_sd, error_rate = params[difficulty]

        rt = rng.normal(rt_mean, rt_sd)
        accuracy = 1 if rng.random_sample() > error_rate else 0

        # Floor implausibly fast (or negative) draws at 200 ms.
        rt = max(200, rt)

        data.append({
            'participant_id': participant_id,
            'trial': trial,
            'task_type': 'decision',
            'stimulus': difficulty,
            'response': rng.choice(['option_a', 'option_b']),
            'rt': round(rt, 2),
            'accuracy': accuracy
        })

    return pd.DataFrame(data, columns=columns)


# Generate data for 10 participants
n_participants = 10
participant_data = {}  # maps "<participant_id>_<task>" -> trial-level DataFrame

# Task assignment uses its own fixed-seed RNG so that which tasks each
# participant performs is reproducible and does not depend on whatever state
# the global NumPy RNG happens to be in.
task_rng = np.random.RandomState(0)

for i in range(1, n_participants + 1):
    participant_id = f'sub{i:03d}'

    # Each participant does 1-2 tasks (randint's upper bound is exclusive)
    n_tasks = task_rng.randint(1, 3)
    tasks = task_rng.choice(['stroop', 'decision'], size=n_tasks, replace=False)

    for task in tasks:
        if task == 'stroop':
            df = generate_stroop_data(participant_id)
        else:
            df = generate_decision_data(participant_id)

        key = f"{participant_id}_{task}"
        participant_data[key] = df

print(f"Generated data for {len(participant_data)} participant-task combinations")
print(f"\nSample data (first 5 rows):")
# First generated DataFrame, without materialising the full key list.
print(next(iter(participant_data.values())).head())

## 2. Upload Data to S3

Upload trial-level data to S3 for Lambda processing.

In [None]:
def upload_dataframe_to_s3(df, bucket, key):
    """Upload DataFrame as CSV to S3.

    The frame is serialised without its index, encoded as UTF-8 and stored
    with a ``text/csv`` content type using the module-level ``s3_client``.

    Args:
        df: DataFrame to serialise.
        bucket: Name of the destination bucket.
        key: Object key inside the bucket.

    Returns:
        The ``s3://bucket/key`` URI of the uploaded object.
    """
    # Encode explicitly so the bytes sent do not depend on the client's
    # implicit handling of str bodies.
    csv_bytes = df.to_csv(index=False).encode('utf-8')
    s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=csv_bytes,
        ContentType='text/csv',
    )
    return f"s3://{bucket}/{key}"


# Push every generated DataFrame to S3 under the raw/ prefix and remember
# the resulting URIs.
uploaded_files = []

for name, frame in participant_data.items():
    uri = upload_dataframe_to_s3(frame, S3_BUCKET, "raw/" + name + ".csv")
    uploaded_files.append(uri)
    print(f"Uploaded: {uri}")

n_uploaded = len(uploaded_files)
print(f"\nTotal files uploaded: {n_uploaded}")

## 3. Process Data with Lambda

Invoke Lambda function to analyze each participant's data.

In [None]:
def invoke_lambda_for_file(bucket, key):
    """Invoke Lambda function for a specific S3 file.

    Calls ``LAMBDA_FUNCTION`` synchronously through the module-level
    ``lambda_client`` with a JSON payload of ``{'bucket', 'key'}``.

    Args:
        bucket: Name of the bucket holding the file.
        key: Object key of the file to analyse.

    Returns:
        The decoded JSON payload returned by the function. If the invocation
        response carries a ``FunctionError`` (the function itself failed),
        a dict ``{'statusCode': 500, 'body': <error message>,
        'functionError': <error kind>}`` is returned instead, so callers can
        always branch on ``statusCode``.
    """
    payload = {
        'bucket': bucket,
        'key': key
    }

    response = lambda_client.invoke(
        FunctionName=LAMBDA_FUNCTION,
        InvocationType='RequestResponse',
        Payload=json.dumps(payload)
    )

    result = json.loads(response['Payload'].read())

    # A failed function still yields a payload, but it describes the error
    # and has no statusCode; normalise it to the shape callers expect.
    if 'FunctionError' in response:
        if isinstance(result, dict):
            message = result.get('errorMessage', 'Unknown error')
        else:
            message = str(result)
        return {
            'statusCode': 500,
            'body': message,
            'functionError': response['FunctionError'],
        }

    return result


# Process all files
# Every uploaded file is analysed in turn; one failed invocation is reported
# and does not stop the remaining files from being processed.
lambda_results = []

for key in participant_data:
    s3_key = f"raw/{key}.csv"
    print(f"Processing: {s3_key}...")

    result = invoke_lambda_for_file(S3_BUCKET, s3_key)
    lambda_results.append(result)

    # The payload may lack 'statusCode' (or not be a dict at all) when the
    # function failed, so read it defensively instead of indexing.
    is_mapping = isinstance(result, dict)
    status = result.get('statusCode') if is_mapping else None

    if status == 200:
        print(f"  ✓ Success")
    else:
        detail = result.get('body', 'Unknown error') if is_mapping else result
        print(f"  ✗ Error: {detail}")

print(f"\nProcessed {len(lambda_results)} files")

## 4. Query Results from DynamoDB

Retrieve analysis results from DynamoDB.

In [None]:
def query_all_results(table_name):
    """Return every item of a DynamoDB table, following scan pagination.

    Args:
        table_name: Name of the table, looked up on the module-level
            ``dynamodb`` resource.

    Returns:
        A list with the items of all scan pages, in the order received.
    """
    table = dynamodb.Table(table_name)

    collected = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        collected.extend(page['Items'])

        # A page without LastEvaluatedKey is the final one.
        if 'LastEvaluatedKey' not in page:
            break
        scan_kwargs = {'ExclusiveStartKey': page['LastEvaluatedKey']}

    return collected


def decimal_to_float(obj):
    """Convert DynamoDB Decimal to float.

    Walks lists, dicts and sets recursively and replaces every ``Decimal``
    with a ``float``. Dict keys and all other values are returned unchanged.

    Args:
        obj: A value, or a nested structure of lists/dicts/sets of values.

    Returns:
        A new structure of the same shape with Decimals converted; scalars
        that are not Decimal are returned as-is.
    """
    from decimal import Decimal

    if isinstance(obj, Decimal):
        return float(obj)
    if isinstance(obj, dict):
        return {k: decimal_to_float(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [decimal_to_float(item) for item in obj]
    if isinstance(obj, set):
        # Set-typed attributes would otherwise keep their Decimal members.
        return {decimal_to_float(item) for item in obj}
    return obj


# Scan the results table and convert Decimal values to float
print("Querying DynamoDB...")
items = list(map(decimal_to_float, query_all_results(DYNAMODB_TABLE)))

# One DataFrame row per retrieved item
results_df = pd.DataFrame(items)

n_results = len(results_df)
print(f"Retrieved {n_results} participant results")

# Only the first ten column names are listed
column_preview = ', '.join(results_df.columns[:10])
print(f"\nColumns: {column_preview}...")

print("\nFirst few rows:")
results_df.head()

## 5. Descriptive Statistics

Calculate group-level descriptive statistics.

In [None]:
# Group-level summary of the per-result mean RT and accuracy, first over all
# rows of results_df and then separately for each task type.
heavy_rule = "=" * 60
light_rule = "-" * 60

print(heavy_rule)
print("DESCRIPTIVE STATISTICS")
print(heavy_rule)

n_rows = len(results_df)
print(f"\nTotal participants: {n_rows}")
print("\nTasks:")
print(results_df['task_type'].value_counts())

print("\n" + light_rule)
print("REACTION TIME")
print(light_rule)
print("Mean RT (ms):")
rt = results_df['mean_rt']
print(f"  Mean:   {rt.mean():.2f}")
print(f"  Median: {rt.median():.2f}")
print(f"  SD:     {rt.std():.2f}")
print(f"  Range:  [{rt.min():.2f}, {rt.max():.2f}]")

print("\n" + light_rule)
print("ACCURACY")
print(light_rule)
print("Accuracy:")
acc = results_df['accuracy']
print(f"  Mean:   {acc.mean():.3f}")
print(f"  Median: {acc.median():.3f}")
print(f"  SD:     {acc.std():.3f}")
print(f"  Range:  [{acc.min():.3f}, {acc.max():.3f}]")

# Same summary, one task type at a time
print("\n" + light_rule)
print("BY TASK TYPE")
print(light_rule)

for task in results_df['task_type'].unique():
    task_df = results_df[results_df['task_type'] == task]
    task_rt = task_df['mean_rt']
    task_acc = task_df['accuracy']
    print(f"\n{task.upper()}:")
    print(f"  N = {len(task_df)}")
    print(f"  Mean RT: {task_rt.mean():.2f} ms (SD = {task_rt.std():.2f})")
    print(f"  Accuracy: {task_acc.mean():.3f} (SD = {task_acc.std():.3f})")

## 6. Visualization

Create visualizations of behavioral data.

In [None]:
# Mean-RT overview: histogram over all results (left) and one box per task
# type (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram with the grand mean marked
rt_values = results_df['mean_rt']
rt_grand_mean = rt_values.mean()
ax_hist.hist(rt_values, bins=20, edgecolor='black', alpha=0.7)
ax_hist.axvline(rt_grand_mean, color='red', linestyle='--',
                label=f"Mean: {rt_grand_mean:.2f} ms")
ax_hist.set_xlabel('Mean RT (ms)', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.set_title('Distribution of Mean Reaction Times', fontsize=14, fontweight='bold')
ax_hist.legend()
ax_hist.grid(axis='y', alpha=0.3)

# Right panel: box plot per task, in order of first appearance
task_labels = results_df['task_type'].unique()
task_data = []
for task in task_labels:
    in_task = results_df['task_type'] == task
    task_data.append(results_df[in_task]['mean_rt'].values)

ax_box.boxplot(task_data, labels=task_labels)
ax_box.set_ylabel('Mean RT (ms)', fontsize=12)
ax_box.set_xlabel('Task Type', fontsize=12)
ax_box.set_title('Reaction Time by Task Type', fontsize=14, fontweight='bold')
ax_box.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Accuracy overview: histogram over all results (left) and mean +/- SD bar
# per task type (right).
fig, (ax_hist, ax_bar) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram with the grand mean marked
acc_values = results_df['accuracy']
acc_grand_mean = acc_values.mean()
ax_hist.hist(acc_values, bins=20, edgecolor='black', alpha=0.7)
ax_hist.axvline(acc_grand_mean, color='red', linestyle='--',
                label=f"Mean: {acc_grand_mean:.3f}")
ax_hist.set_xlabel('Accuracy', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.set_title('Distribution of Accuracy', fontsize=14, fontweight='bold')
ax_hist.legend()
ax_hist.grid(axis='y', alpha=0.3)

# Right panel: per-task mean with the SD as error bar
accuracy_by_task = results_df.groupby('task_type')['accuracy']
task_accuracy = accuracy_by_task.agg(['mean', 'std'])
task_accuracy.plot(kind='bar', y='mean', yerr='std', ax=ax_bar,
                   capsize=5, legend=False, color='steelblue')
ax_bar.set_ylabel('Accuracy', fontsize=12)
ax_bar.set_xlabel('Task Type', fontsize=12)
ax_bar.set_title('Accuracy by Task Type', fontsize=14, fontweight='bold')
# Keep the task names horizontal instead of pandas' default rotation
ax_bar.set_xticklabels(ax_bar.get_xticklabels(), rotation=0)
ax_bar.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of accuracy against mean RT, one colour series per task type
plt.figure(figsize=(10, 6))

task_column = results_df['task_type']
for task in task_column.unique():
    rows = results_df[task_column == task]
    plt.scatter(
        rows['mean_rt'],
        rows['accuracy'],
        label=task.capitalize(),
        alpha=0.6,
        s=100,
    )

plt.xlabel('Mean RT (ms)', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Speed-Accuracy Tradeoff', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Task-Specific Analysis

Analyze task-specific effects.

In [None]:
# Stroop interference: summary, t-test against zero and plots for the rows
# of results_df whose task_type is 'stroop'.
stroop_df = results_df[results_df['task_type'] == 'stroop'].copy()

if len(stroop_df) == 0 or 'stroop_effect_rt' not in stroop_df.columns:
    print("No Stroop data available for analysis")
else:
    banner = "=" * 60
    print(banner)
    print("STROOP EFFECT ANALYSIS")
    print(banner)

    stroop_effect = stroop_df['stroop_effect_rt'].dropna()
    effect_mean = stroop_effect.mean()

    print("\nStroop Effect (Incongruent - Congruent RT):")
    print(f"  Mean:   {effect_mean:.2f} ms")
    print(f"  Median: {stroop_effect.median():.2f} ms")
    print(f"  SD:     {stroop_effect.std():.2f} ms")
    print(f"  Range:  [{stroop_effect.min():.2f}, {stroop_effect.max():.2f}]")

    # One-sample t-test of the effect against a population mean of 0
    t_stat, p_value = stats.ttest_1samp(stroop_effect, 0)
    print("\nOne-sample t-test (H0: effect = 0):")
    print(f"  t = {t_stat:.3f}, p = {p_value:.4f}")

    fig, (ax_effect, ax_cond) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: distribution of the effect, with its mean and the zero line
    ax_effect.hist(stroop_effect, bins=15, edgecolor='black', alpha=0.7)
    ax_effect.axvline(effect_mean, color='red', linestyle='--',
                      label=f"Mean: {effect_mean:.2f} ms")
    ax_effect.axvline(0, color='black', linestyle='-', linewidth=2, label='No effect')
    ax_effect.set_xlabel('Stroop Effect (ms)', fontsize=12)
    ax_effect.set_ylabel('Frequency', fontsize=12)
    ax_effect.set_title('Stroop Effect Distribution', fontsize=14, fontweight='bold')
    ax_effect.legend()
    ax_effect.grid(axis='y', alpha=0.3)

    # Right panel: per-condition mean RTs, drawn only if both columns exist
    condition_cols = ('mean_rt_congruent', 'mean_rt_incongruent')
    if all(col in stroop_df.columns for col in condition_cols):
        congruent_rts = stroop_df['mean_rt_congruent'].dropna()
        incongruent_rts = stroop_df['mean_rt_incongruent'].dropna()

        ax_cond.boxplot([congruent_rts, incongruent_rts],
                        labels=['Congruent', 'Incongruent'])
        ax_cond.set_ylabel('Mean RT (ms)', fontsize=12)
        ax_cond.set_xlabel('Condition', fontsize=12)
        ax_cond.set_title('RT by Stroop Condition', fontsize=14, fontweight='bold')
        ax_cond.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Decision task difficulty analysis
# Summarises the hard-minus-easy RT effect and plots RT and accuracy per
# difficulty level for the rows of results_df whose task_type is 'decision'.
decision_df = results_df[results_df['task_type'] == 'decision'].copy()

if len(decision_df) > 0 and 'difficulty_effect_rt' in decision_df.columns:
    print("="*60)
    print("DECISION DIFFICULTY ANALYSIS")
    print("="*60)

    difficulty_effect = decision_df['difficulty_effect_rt'].dropna()

    print(f"\nDifficulty Effect (Hard - Easy RT):")
    print(f"  Mean:   {difficulty_effect.mean():.2f} ms")
    print(f"  Median: {difficulty_effect.median():.2f} ms")
    print(f"  SD:     {difficulty_effect.std():.2f} ms")

    # Visualization
    rt_columns = ['mean_rt_easy', 'mean_rt_medium', 'mean_rt_hard']
    accuracy_columns = ['accuracy_easy', 'accuracy_medium', 'accuracy_hard']
    level_labels = ['Easy', 'Medium', 'Hard']

    if all(col in decision_df.columns for col in rt_columns):
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # RT by difficulty
        difficulty_data = [decision_df[col].dropna() for col in rt_columns]

        axes[0].boxplot(difficulty_data, labels=level_labels)
        axes[0].set_ylabel('Mean RT (ms)', fontsize=12)
        axes[0].set_xlabel('Difficulty', fontsize=12)
        axes[0].set_title('RT by Difficulty Level', fontsize=14, fontweight='bold')
        axes[0].grid(axis='y', alpha=0.3)

        # Accuracy by difficulty. The accuracy columns are checked on their
        # own: the presence of the RT columns says nothing about them, and
        # indexing a missing column would raise KeyError.
        if all(col in decision_df.columns for col in accuracy_columns):
            accuracy_data = [decision_df[col].dropna() for col in accuracy_columns]

            axes[1].boxplot(accuracy_data, labels=level_labels)
            axes[1].set_ylabel('Accuracy', fontsize=12)
            axes[1].set_xlabel('Difficulty', fontsize=12)
            axes[1].set_title('Accuracy by Difficulty Level', fontsize=14, fontweight='bold')
            axes[1].grid(axis='y', alpha=0.3)
        else:
            # Nothing to draw: hide the empty panel instead of failing.
            axes[1].axis('off')
            print("\nPer-difficulty accuracy columns not found; accuracy plot skipped")

        plt.tight_layout()
        plt.show()
else:
    print("No decision making data available for analysis")

## 8. Signal Detection Theory Analysis

Analyze d-prime and response criterion.

In [None]:
# Signal detection summary: d-prime (sensitivity) and criterion (bias).
# Either column may be missing from results_df, so each is looked up on its
# own and replaced by an empty Series when absent; indexing a missing
# 'criterion' column would otherwise raise KeyError.
if 'dprime' in results_df.columns:
    dprime_data = results_df['dprime'].dropna()
else:
    dprime_data = pd.Series(dtype=float)

if 'criterion' in results_df.columns:
    criterion_data = results_df['criterion'].dropna()
else:
    criterion_data = pd.Series(dtype=float)

if len(dprime_data) > 0:
    print("="*60)
    print("SIGNAL DETECTION THEORY")
    print("="*60)

    print(f"\nd-prime (sensitivity):")
    print(f"  Mean:   {dprime_data.mean():.3f}")
    print(f"  Median: {dprime_data.median():.3f}")
    print(f"  SD:     {dprime_data.std():.3f}")
    print(f"  Range:  [{dprime_data.min():.3f}, {dprime_data.max():.3f}]")

    if len(criterion_data) > 0:
        print(f"\nCriterion (response bias):")
        print(f"  Mean:   {criterion_data.mean():.3f}")
        print(f"  Median: {criterion_data.median():.3f}")
        print(f"  SD:     {criterion_data.std():.3f}")

        # One-sample t-test (is criterion different from 0?)
        t_stat, p_value = stats.ttest_1samp(criterion_data, 0)
        print(f"\nOne-sample t-test (H0: criterion = 0):")
        print(f"  t = {t_stat:.3f}, p = {p_value:.4f}")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].hist(dprime_data, bins=15, edgecolor='black', alpha=0.7)
    axes[0].axvline(dprime_data.mean(), color='red', linestyle='--',
                    label=f"Mean: {dprime_data.mean():.3f}")
    axes[0].set_xlabel("d-prime", fontsize=12)
    axes[0].set_ylabel('Frequency', fontsize=12)
    axes[0].set_title('Sensitivity Distribution', fontsize=14, fontweight='bold')
    axes[0].legend()
    axes[0].grid(axis='y', alpha=0.3)

    if len(criterion_data) > 0:
        axes[1].hist(criterion_data, bins=15, edgecolor='black', alpha=0.7)
        axes[1].axvline(criterion_data.mean(), color='red', linestyle='--',
                        label=f"Mean: {criterion_data.mean():.3f}")
        axes[1].axvline(0, color='black', linestyle='-', linewidth=2,
                        label='No bias')
        axes[1].set_xlabel('Criterion', fontsize=12)
        axes[1].set_ylabel('Frequency', fontsize=12)
        axes[1].set_title('Response Bias Distribution', fontsize=14, fontweight='bold')
        axes[1].legend()
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    # Reached both when the column is absent and when it holds only NaN.
    print("No signal detection theory data available")

## 9. Statistical Tests

Perform group-level statistical tests.

In [None]:
def _cohens_d(sample_a, sample_b):
    """Return Cohen's d, standardising by the root mean of the two variances."""
    pooled_sd = np.sqrt((sample_a.std()**2 + sample_b.std()**2) / 2)
    return (sample_a.mean() - sample_b.mean()) / pooled_sd


def _report_ttest(measure_label, column, task_a, task_b):
    """Print an independent t-test and Cohen's d for one column across two tasks."""
    # Missing values are dropped per group; a single NaN would otherwise
    # turn the whole test statistic into NaN.
    sample_a = results_df[results_df['task_type'] == task_a][column].dropna()
    sample_b = results_df[results_df['task_type'] == task_b][column].dropna()

    print(f"\nIndependent t-test: {task_a} vs {task_b} ({measure_label})")
    t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
    print(f"  t = {t_stat:.3f}, p = {p_value:.4f}")
    print(f"  Effect size (Cohen's d): {_cohens_d(sample_a, sample_b):.3f}")


print("="*60)
print("STATISTICAL TESTS")
print("="*60)

# Compare RT and accuracy between the first two task types encountered
task_types = results_df['task_type'].unique()
if len(task_types) >= 2:
    tasks = task_types[:2]
    _report_ttest('RT', 'mean_rt', tasks[0], tasks[1])
    _report_ttest('Accuracy', 'accuracy', tasks[0], tasks[1])

# RT-Accuracy correlation
print(f"\nCorrelation: RT vs Accuracy")
# Only rows where both values are present can be correlated, and at least
# two such rows are needed; pearsonr raises on NaN or shorter input.
paired = results_df[['mean_rt', 'accuracy']].dropna()
if len(paired) >= 2:
    r, p = stats.pearsonr(paired['mean_rt'], paired['accuracy'])
    print(f"  r = {r:.3f}, p = {p:.4f}")
else:
    print("  Not enough complete observations to compute a correlation")

print("\n" + "="*60)

## 10. Export Results

Export results for further analysis or publication.

In [None]:
# Export to CSV
# One timestamp is taken up front and shared by both files, so the results
# file and its summary always carry the same suffix even if the clock ticks
# over between the two writes.
export_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

output_filename = f'behavioral_results_{export_timestamp}.csv'
results_df.to_csv(output_filename, index=False)
print(f"Results exported to: {output_filename}")

# Summary statistics: per-task mean/SD/count of mean RT and mean/SD of accuracy
summary = results_df.groupby('task_type').agg({
    'mean_rt': ['mean', 'std', 'count'],
    'accuracy': ['mean', 'std']
})

summary_filename = f'summary_stats_{export_timestamp}.csv'
# The index is kept here because it holds the task_type labels.
summary.to_csv(summary_filename)
print(f"Summary statistics exported to: {summary_filename}")

## Summary

This notebook demonstrated:
1. ✅ Generating sample behavioral data
2. ✅ Uploading data to S3
3. ✅ Processing with Lambda (serverless)
4. ✅ Storing results in DynamoDB
5. ✅ Querying and aggregating results
6. ✅ Statistical analysis and visualization
7. ✅ Task-specific analyses (Stroop, decision making)
8. ✅ Signal detection theory
9. ✅ Exporting results

### Next Steps
- Analyze your own behavioral data
- Implement additional computational models
- Scale to hundreds or thousands of participants
- Move to Tier 3 for production CloudFormation deployment

### Cost Tracking
Remember to check AWS Cost Explorer to monitor your spending!

### Cleanup
When finished, follow `cleanup_guide.md` to delete all AWS resources.