# SoulSense Exploratory Data Analysis (EDA)

This notebook analyzes the SoulSense dataset to understand feature distributions, identify outliers, and check for correlations. This analysis informs the data cleaning pipeline (Issue #92) and model improvements.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot appearance: white-grid seaborn theme and a wide default canvas for every figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Relative path of the SQLite database file read by load_data().
DB_PATH = "../soulsense_db"

## 1. Load Data
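Load data from the `scores` table of the SQLite database. If no database file is found at `DB_PATH`, reproducible mock data is generated instead so the rest of the notebook can still run.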

In [None]:
def load_data(db_path=None):
    """Load the ``scores`` table into a DataFrame, with a mock-data fallback.

    Args:
        db_path: Path to the SQLite database file. When omitted, the
            notebook-level ``DB_PATH`` constant is used.

    Returns:
        pandas.DataFrame: one of

        * every row of ``scores`` when the database can be read;
        * 200 rows of reproducible mock data (``age``, ``total_score``,
          ``num_questions``, ``timestamp``) when nothing exists at the path;
        * an empty DataFrame when the path exists but cannot be opened or
          queried (the error is printed instead of raised).
    """
    path = DB_PATH if db_path is None else db_path

    if not os.path.exists(path):
        print(f"Database not found at {path}. Using mock data for demonstration.")
        # The database file is missing: generate mock data instead.
        # A private, seeded generator yields the same sequence as
        # np.random.seed(42) without resetting the global NumPy RNG state.
        rng = np.random.RandomState(42)
        n = 200
        data = {
            'age': rng.normal(30, 10, n).astype(int),
            'total_score': rng.normal(60, 15, n).astype(int),
            'num_questions': [15] * n,
            'timestamp': pd.date_range(start='2024-01-01', periods=n)
        }
        return pd.DataFrame(data)

    # Connect inside the try block so a path that exists but cannot be opened
    # (for example a directory) is reported like any other read error.
    conn = None
    query = "SELECT * FROM scores"
    try:
        conn = sqlite3.connect(path)
        return pd.read_sql_query(query, conn)
    except Exception as e:
        print(f"Error reading DB: {e}")
        return pd.DataFrame()
    finally:
        # conn stays None when connect() itself failed.
        if conn is not None:
            conn.close()

# Load the dataset once; the cells below all read this `df`.
df = load_data()
print(f"Loaded {len(df)} records.")
# Preview the first rows as the cell's displayed output.
df.head()

## 2. Basic Statistics & Structure

In [None]:
# Print the column names, dtypes, and non-null counts to spot missing values.
df.info()

In [None]:
# Summary statistics for the loaded data. load_data() returns a column-less
# DataFrame when the database cannot be read, and describe() raises ValueError
# on a frame without columns, so show the (empty) frame itself in that case.
df.describe() if len(df.columns) else df

## 3. Data Visualization

In [None]:
# One histogram panel per column: (column, title, x-axis label, colour).
hist_specs = [
    ('age', 'Age Distribution', 'Age', 'skyblue'),
    ('total_score', 'Score Distribution', 'Total Score', 'salmon'),
]
# Keep only the columns that are actually present, so a missing column is
# skipped instead of raising KeyError or leaving a blank panel.
hist_specs = [spec for spec in hist_specs if spec[0] in df.columns]

if not df.empty and hist_specs:
    # squeeze=False keeps `axes` two-dimensional even with a single panel.
    fig, axes = plt.subplots(1, len(hist_specs), figsize=(15, 6), squeeze=False)

    for ax, (column, title, xlabel, color) in zip(axes[0], hist_specs):
        sns.histplot(df[column], kde=True, ax=ax, color=color)
        ax.set_title(title)
        ax.set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()

## 4. Correlation Analysis
Checking relationships between Age and Score.

In [None]:
# The scatterplot needs both columns, so check for both before plotting.
if not df.empty and {'age', 'total_score'}.issubset(df.columns):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x='age', y='total_score', alpha=0.6)
    plt.title('Age vs Total Score')
    plt.show()

    # Correlation Matrix
    try:
        numeric = df.select_dtypes(include=[np.number])
        # A column with a single distinct value (such as the constant
        # num_questions in the mock data) has undefined correlation and would
        # show up as an all-NaN row and column, so leave it out.
        numeric = numeric.loc[:, numeric.nunique() > 1]
        corr = numeric.corr()
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Correlation Matrix')
        plt.show()
    except Exception as e:
        print("Could not calc correlation:", e)

## 5. Outlier Detection
Using Boxplots to identify anomalous scores or ages.

In [None]:
# One boxplot panel per column: (column, title, colour).
box_specs = [
    ('age', 'Age Boxplot', 'lightblue'),
    ('total_score', 'Score Boxplot', 'lightcoral'),
]
# Keep only the columns that are actually present, so a missing column is
# skipped instead of raising KeyError or leaving a blank panel.
box_specs = [spec for spec in box_specs if spec[0] in df.columns]

if not df.empty and box_specs:
    # squeeze=False keeps `axes` two-dimensional even with a single panel.
    fig, axes = plt.subplots(1, len(box_specs), figsize=(15, 6), squeeze=False)

    for ax, (column, title, color) in zip(axes[0], box_specs):
        sns.boxplot(y=df[column], ax=ax, color=color)
        ax.set_title(title)

    # Match the spacing used by the histogram figure.
    plt.tight_layout()
    plt.show()