# Data Exploration - Stress Level Prediction

This notebook performs comprehensive data exploration for the stress level prediction project.

## Objectives:
1. Load and examine the dataset structure
2. Perform basic statistical analysis
3. Check data quality and missing values
4. Analyze correlations between features
5. Visualize data distributions and patterns
6. Optionally generate an automated profiling report

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# NOTE(review): this filter hides every warning raised for the rest of the
# session (including deprecation notices); consider narrowing it by category.
warnings.filterwarnings('ignore')

# Set up paths
import sys
# Makes the project's src/ directory importable. The path is relative to the
# kernel's working directory - presumably the notebook's own folder; confirm
# before running the notebook from elsewhere.
sys.path.append('../src')
# NOTE(review): the star import pulls every public name from utils.config into
# the notebook namespace, so the origin of those names is not visible here.
from utils.config import *
from data.data_loader import DataLoader

print("Libraries imported successfully!")

## 1. Load Dataset

In [None]:
# Project loader object; also used by the next section to summarise the frame
data_loader = DataLoader()

# To analyse real data, replace the synthetic block below with a call such as:
# df = data_loader.load_csv('your_dataset.csv')

# ---------------------------------------------------------------------------
# Synthetic demonstration data - remove once an actual dataset is available.
# The draws consume NumPy's global random stream in the order written, so the
# column order below must not change if the same values are to be reproduced.
# ---------------------------------------------------------------------------
np.random.seed(42)
n_samples = 1000

sample_data = dict(
    heart_rate=np.random.normal(75, 15, size=n_samples),
    sleep_hours=np.random.normal(7, 1.5, size=n_samples),
    exercise_minutes=np.random.exponential(30, size=n_samples),
    caffeine_intake=np.random.poisson(2, size=n_samples),
    work_hours=np.random.normal(8, 2, size=n_samples),
    age=np.random.randint(18, 65, size=n_samples),
    bmi=np.random.normal(25, 4, size=n_samples),
    blood_pressure_sys=np.random.normal(120, 20, size=n_samples),
    blood_pressure_dia=np.random.normal(80, 10, size=n_samples),
    stress_level=np.random.choice(
        ['Low', 'Medium', 'High'],
        size=n_samples,
        p=[0.3, 0.5, 0.2],
    ),
)

df = pd.DataFrame(sample_data)

# Blank out 5% of the sleep_hours entries (rows picked without replacement)
# so the missing-value analysis further down has something to report.
missing_indices = np.random.choice(
    df.index,
    size=int(0.05 * len(df)),
    replace=False,
)
df.loc[missing_indices, 'sleep_hours'] = np.nan

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Final expression: rendered by the notebook as a preview of the first rows
df.head()

## 2. Basic Dataset Information

In [None]:
# Ask the project loader for a summary of the frame. The result is read below
# as a mapping with 'shape', 'columns', 'memory_usage' and 'dtypes' entries.
data_info = data_loader.get_data_info(df)

# 'memory_usage' is presumably in bytes (it is divided by 1024 and labelled KB)
memory_kb = data_info['memory_usage'] / 1024

print("Dataset Information:")
print(f"Shape: {data_info['shape']}")
print(f"Columns: {data_info['columns']}")
print(f"Memory usage: {memory_kb:.2f} KB")

print("\nData types:")
for column_name, column_dtype in data_info['dtypes'].items():
    print(f"  {column_name}: {column_dtype}")

In [None]:
# Display basic statistics
# With default arguments describe() summarises only the numeric columns of a
# mixed-dtype frame, so non-numeric columns (such as the sample data's
# 'stress_level') do not appear in this table.
print("Descriptive Statistics:")
# Kept as the final expression so the notebook renders the table as cell output
df.describe()

## 3. Missing Values Analysis

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_data = df.isnull().sum()
missing_percentage = missing_data.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percentage
})

# Report only the columns that actually contain gaps
print("Missing Values Analysis:")
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

if missing_data.sum() == 0:
    print("No missing values found!")
else:
    # Bar chart restricted to the columns with at least one missing entry
    plt.figure(figsize=(10, 6))
    missing_data.loc[missing_data.gt(0)].plot.bar()
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Count of Missing Values')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 4. Target Variable Analysis

In [None]:
# Name of the label column - update with your actual target column
target_col = 'stress_level'

if target_col not in df.columns:
    # Nothing to analyse: tell the reader which columns are available instead
    print(f"Target column '{target_col}' not found in dataset")
    print(f"Available columns: {list(df.columns)}")
else:
    print(f"Target Variable: {target_col}")
    print("\nValue Counts:")
    print(df[target_col].value_counts())

    print("\nProportions:")
    print(df[target_col].value_counts(normalize=True))

    # Two panels side by side: absolute counts (left), class shares (right)
    fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.countplot(data=df, x=target_col, ax=count_ax)
    count_ax.set_title('Distribution of Stress Levels')
    plt.setp(count_ax.get_xticklabels(), rotation=45)

    df[target_col].value_counts().plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
    pie_ax.set_title('Stress Level Proportions')
    # Drop the default y-label that pandas attaches to pie charts
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## 5. Feature Distributions

In [None]:
# Numeric columns of the frame; later cells reuse this selection
numerical_cols = df.select_dtypes(include=[np.number]).columns

if len(numerical_cols) == 0:
    print("No numerical columns found")
else:
    # Grid with three panels per row; ceiling division gives the row count
    n_cols = 3
    n_rows = -(-len(numerical_cols) // n_cols)

    fig = plt.figure(figsize=(15, 4 * n_rows))

    # One histogram with a KDE overlay per numeric feature
    for i, col in enumerate(numerical_cols, 1):
        ax = fig.add_subplot(n_rows, n_cols, i)
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)

    plt.tight_layout()
    plt.show()

## 6. Correlation Analysis

In [None]:
# Correlation analysis for the numerical features.
#
# `high_corr_pairs` and `threshold` are initialised before the guard so that
# later cells (the data quality summary reads `high_corr_pairs`) still work
# when there are too few numeric columns and the analysis is skipped.
high_corr_pairs = []
threshold = 0.7  # report pairs whose absolute correlation exceeds this value

if len(numerical_cols) > 1:
    correlation_matrix = df[numerical_cols].corr()

    # Plot correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Walk the strict upper triangle (k=1) so every unordered pair is checked
    # exactly once and the diagonal (self-correlation of 1.0) is skipped.
    corr_columns = correlation_matrix.columns
    for i, j in zip(*np.triu_indices(len(corr_columns), k=1)):
        corr_value = correlation_matrix.iloc[i, j]
        # A NaN correlation compares False here and is therefore ignored
        if abs(corr_value) > threshold:
            high_corr_pairs.append((corr_columns[i], corr_columns[j], corr_value))

    if high_corr_pairs:
        print(f"\nHighly correlated feature pairs (|correlation| > {threshold}):")
        for first_col, second_col, corr_value in high_corr_pairs:
            print(f"  {first_col} - {second_col}: {corr_value:.3f}")
    else:
        print(f"\nNo highly correlated feature pairs found (threshold: {threshold})")
else:
    print("Not enough numerical features for correlation analysis")

## 7. Feature vs Target Analysis

In [None]:
# Compare each numeric feature across the target classes with box plots.
# Skipped entirely when the target column or the numeric features are absent.
has_target = target_col in df.columns
if has_target and len(numerical_cols) > 0:
    # Three panels per row; ceiling division gives the number of rows
    n_cols = 3
    n_rows = -(-len(numerical_cols) // n_cols)

    fig = plt.figure(figsize=(15, 4 * n_rows))

    for i, col in enumerate(numerical_cols, 1):
        ax = fig.add_subplot(n_rows, n_cols, i)
        sns.boxplot(data=df, x=target_col, y=col, ax=ax)
        ax.set_title(f'{col} by {target_col}')
        plt.setp(ax.get_xticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()

## 8. Outlier Detection

In [None]:
# Detect outliers using IQR method
def detect_outliers(df, column, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The flagged rows, with all columns and their original index labels.
        Rows where ``column`` is missing are never flagged, because both
        comparisons evaluate to False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

# Per-feature outlier counts, expressed also as a share of all rows
print("Outlier Analysis:")
total_rows = len(df)
for col in numerical_cols:
    outliers = detect_outliers(df, col)
    outlier_count = len(outliers)
    outlier_percentage = (outlier_count / total_rows) * 100
    print(f"{col}: {outlier_count} outliers ({outlier_percentage:.2f}%)")

# One combined box plot of all numeric features on a shared axis
if len(numerical_cols) > 0:
    fig, ax = plt.subplots(figsize=(15, 6))
    df[numerical_cols].boxplot(ax=ax)
    ax.set_title('Box Plots for Outlier Detection')
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.tight_layout()
    plt.show()

## 9. Data Quality Summary

In [None]:
# Headline data-quality metrics for the frame.
# Columns not selected as numeric are counted under 'Categorical Columns'.
quality_summary = {
    'Total Rows': len(df),
    'Total Columns': len(df.columns),
    'Numerical Columns': len(numerical_cols),
    'Categorical Columns': len(df.select_dtypes(exclude=[np.number]).columns),
    'Missing Values': df.isnull().sum().sum(),
    'Duplicate Rows': df.duplicated().sum(),
    'Memory Usage (KB)': df.memory_usage(deep=True).sum() / 1024
}

print("Data Quality Summary:")
print("=" * 30)
for key, value in quality_summary.items():
    # Only the memory figure is fractional; the other entries are counts
    if key == 'Memory Usage (KB)':
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")

# Data quality recommendations
print("\nRecommendations:")
print("=" * 20)

if quality_summary['Missing Values'] > 0:
    # Share of all cells (rows x columns) that are missing. Stored under its
    # own name so the per-column `missing_percentage` Series built in the
    # missing-values section is not overwritten by a scalar.
    overall_missing_pct = (quality_summary['Missing Values'] / df.size) * 100
    print(f"- Handle {quality_summary['Missing Values']} missing values ({overall_missing_pct:.2f}% of total data)")

if quality_summary['Duplicate Rows'] > 0:
    print(f"- Remove {quality_summary['Duplicate Rows']} duplicate rows")

# `high_corr_pairs` comes from the correlation analysis section
if len(high_corr_pairs) > 0:
    print(f"- Consider removing highly correlated features ({len(high_corr_pairs)} pairs found)")

print("- Consider feature scaling for numerical variables")
print("- Encode categorical variables for machine learning")

## 10. Automated Data Profiling (Optional)

In [None]:
# Optional: build a detailed HTML profile of the dataset with ydata-profiling.
# Uncomment the block below to generate and save the report.

# try:
#     from ydata_profiling import ProfileReport
#     
#     # Generate profile report
#     profile = ProfileReport(df, title="Stress Level Prediction - Data Profile", explorative=True)
#     
#     # Save report
#     profile.to_file("../reports/data_profile_report.html")
#     print("Data profiling report saved to ../reports/data_profile_report.html")
#     
# except ImportError:
#     print("ydata-profiling not installed. Install with: pip install ydata-profiling")

# Closing message and pointers to the follow-up work, one entry per output line
closing_lines = (
    "Data exploration completed!",
    "\nNext steps:",
    "1. Clean the data based on findings",
    "2. Handle missing values and outliers",
    "3. Proceed to feature selection and engineering",
)
print(*closing_lines, sep="\n")