# Data Cleaning - Stress Level Prediction

This notebook performs data cleaning and preprocessing for the stress level prediction project.

## Objectives:
1. Handle missing values
2. Remove duplicates
3. Detect and visualize outliers
4. Encode categorical variables
5. Feature scaling and normalization
6. Data validation and quality checks

In [None]:
# Import required libraries
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings raised by pandas, matplotlib, or seaborn.
warnings.filterwarnings('ignore')

# Import custom modules
import sys
# '../src' is a relative path, so it is resolved against the kernel's current
# working directory; it must be on sys.path before the project imports below.
sys.path.append('../src')
# Wildcard import: presumably the source of PROCESSED_DATA_DIR used in the
# save step -- TODO confirm against utils/config.py and import it explicitly.
from utils.config import *
from data.data_loader import DataLoader
from data.data_preprocessor import DataPreprocessor

print("Libraries imported successfully!")

## 1. Load Dataset

In [None]:
# Initialize data loader (used by the real-data path shown in the comment below)
data_loader = DataLoader()

# Load your dataset - Replace with your actual data loading
# df = data_loader.load_csv('your_dataset.csv')

# For demonstration, we'll recreate the sample dataset.
# Seeding the global NumPy RNG makes every draw below reproducible on a
# fresh kernel (Restart & Run All).
np.random.seed(42)
n_samples = 1000

sample_data = {
    'heart_rate': np.random.normal(75, 15, n_samples),
    'sleep_hours': np.random.normal(7, 1.5, n_samples),
    'exercise_minutes': np.random.exponential(30, n_samples),
    'caffeine_intake': np.random.poisson(2, n_samples),
    'work_hours': np.random.normal(8, 2, n_samples),
    'age': np.random.randint(18, 65, n_samples),
    'bmi': np.random.normal(25, 4, n_samples),
    'blood_pressure_sys': np.random.normal(120, 20, n_samples),
    'blood_pressure_dia': np.random.normal(80, 10, n_samples),
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'stress_level': np.random.choice(['Low', 'Medium', 'High'], n_samples, p=[0.3, 0.5, 0.2])
}

df_original = pd.DataFrame(sample_data)

# Introduce some data quality issues for demonstration.

# Missing values: blank out 5% of the sleep_hours entries.
missing_indices = np.random.choice(df_original.index, size=int(0.05 * len(df_original)), replace=False)
df_original.loc[missing_indices, 'sleep_hours'] = np.nan

# Outliers: overwrite some heart rates with extreme values.
# This must happen BEFORE the duplicates are appended. Injecting outliers
# afterwards could overwrite heart_rate on one member of a duplicate pair,
# leaving fewer exact duplicate rows than the demo intends to create.
n_outliers = 20
outlier_indices = np.random.choice(df_original.index, size=n_outliers, replace=False)
df_original.loc[outlier_indices, 'heart_rate'] = np.random.normal(150, 10, n_outliers)  # Extreme heart rates

# Duplicates: append exact copies of 10 existing rows.
duplicates = df_original.sample(n=10, random_state=42)
df_original = pd.concat([df_original, duplicates], ignore_index=True)

# Keep df_original untouched so the final summary can compare against it.
df = df_original.copy()

print("Original dataset loaded!")
print(f"Shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")
df.head()

## 2. Initial Data Assessment

In [None]:
# Snapshot of the frame before any cleaning step runs:
# shape, per-column dtypes, per-column null counts, and exact-duplicate count.
print(
    "Initial Data Assessment:",
    "=" * 30,
    f"Shape: {df.shape}",
    "Data types:",
    df.dtypes,
    sep="\n",
)
print("\nMissing values per column:", df.isnull().sum(), sep="\n")
print(f"\nDuplicate rows: {df.duplicated().sum()}")

## 3. Handle Missing Values

In [None]:
# Create the preprocessor that the remaining cleaning steps share
preprocessor = DataPreprocessor()

# Null counts per column going in
print("Missing values before handling:")
print(df.isnull().sum())

# Fill the gaps using the preprocessor's 'mean' strategy
df_cleaned = preprocessor.handle_missing_values(df, strategy='mean')

# Null counts per column coming out
print("\nMissing values after handling:")
print(df_cleaned.isnull().sum())

# Side-by-side bar charts, drawn only when the input still has missing cells
if df.isnull().sum().sum() > 0:
    panels = [
        (df, 'Missing Values Before'),
        (df_cleaned, 'Missing Values After'),
    ]

    plt.figure(figsize=(12, 4))
    for position, (frame, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        frame.isnull().sum().plot(kind='bar')
        plt.title(panel_title)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Continue the pipeline with an independent copy of the cleaned frame
df = df_cleaned.copy()

## 4. Remove Duplicates

In [None]:
# Report the row count and the number of exact duplicate rows going in
print(f"Rows before removing duplicates: {len(df)}")
print(f"Duplicate rows: {df.duplicated().sum()}")

# Delegate the actual removal to the shared preprocessor
df_no_duplicates = preprocessor.remove_duplicates(df)

# Report how many rows survived and how many were dropped
rows_after = len(df_no_duplicates)
print(f"Rows after removing duplicates: {rows_after}")
print(f"Rows removed: {len(df) - rows_after}")

# Continue the pipeline with an independent copy of the deduplicated frame
df = df_no_duplicates.copy()

## 5. Detect and Visualize Outliers

In [None]:
# Detect and visualize outliers
# Only numeric columns can be scanned with the IQR rule, so select them first.
numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns

def detect_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the numeric column to inspect.
        multiplier: Fence width in units of the IQR. The default of 1.5
            reproduces the previously hard-coded behavior.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``. ``outliers`` is the
        subset of ``df`` whose value is strictly below ``lower_bound`` or
        strictly above ``upper_bound``.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Explicit comparisons (rather than a negated `between`) keep NaN entries
    # out of the result: NaN compares False on both sides.
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Summarize IQR outliers for every numeric column
print("Outlier Analysis:")
outlier_summary = {}
for col in numerical_cols:
    outliers, lower, upper = detect_outliers_iqr(df, col)
    outlier_count = len(outliers)
    outlier_percentage = (outlier_count / len(df)) * 100
    outlier_summary[col] = {
        'count': outlier_count,
        'percentage': outlier_percentage,
        'lower_bound': lower,
        'upper_bound': upper
    }
    print(f"{col}: {outlier_count} outliers ({outlier_percentage:.2f}%)")

# Visualize outliers
# Size the subplot grid from the number of numeric columns. A fixed 3x3 grid
# raises ValueError as soon as the dataset has more than nine numeric columns.
n_grid_cols = 3
n_grid_rows = -(-len(numerical_cols) // n_grid_cols)  # ceiling division

if n_grid_rows > 0:
    # Keep the original 15x10 canvas for three rows and scale height with rows.
    plt.figure(figsize=(15, 10 * n_grid_rows / 3))
    for i, col in enumerate(numerical_cols, 1):
        plt.subplot(n_grid_rows, n_grid_cols, i)
        # Drop NaN first: missing values would otherwise corrupt the box statistics.
        plt.boxplot(df[col].dropna())
        plt.title(f'{col} - Outliers')
        plt.ylabel(col)

    plt.tight_layout()
    plt.show()

## 6. Encode Categorical Variables

In [None]:
# Every non-numeric column is treated as categorical; the prediction target
# is named explicitly because later cells need it.
non_numeric_frame = df.select_dtypes(exclude=[np.number])
categorical_cols = non_numeric_frame.columns
target_col = 'stress_level'

print(f"Categorical columns: {list(categorical_cols)}")
print(f"Target column: {target_col}")

# For each categorical column, list its distinct values and their frequencies
for col in categorical_cols:
    distinct_values = df[col].unique()
    print(f"\n{col} unique values: {distinct_values}")
    print("Value counts:")
    print(df[col].value_counts())

In [None]:
# Turn the categorical columns (including the target) into numeric codes
print("Encoding categorical variables...")
print(f"Shape before encoding: {df.shape}")

df_encoded = preprocessor.encode_categorical_variables(df, target_col=target_col)

print(f"Shape after encoding: {df_encoded.shape}")
print(f"New columns: {set(df_encoded.columns) - set(df.columns)}")

# If the preprocessor stored an encoder for the target, print the
# label -> integer mapping given by the order of its classes_ attribute.
if target_col in preprocessor.encoders:
    le = preprocessor.encoders[target_col]
    print("\nTarget encoding:")
    for code, label in enumerate(le.classes_):
        print(f"  {label} -> {code}")

# Continue the pipeline with an independent copy of the encoded frame
df = df_encoded.copy()

## 7. Feature Scaling

In [None]:
# Split the encoded frame: y is the target column, X is everything else
y = df[target_col]
X = df.drop(target_col, axis=1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("Target distribution:")
print(y.value_counts().sort_index())

In [None]:
# Scale numerical features
# NOTE(review): scaling happens here on the full dataset, before the
# train-test split listed under "Next steps". If the scaler is fitted on X,
# its statistics include rows that later become test data -- confirm this is
# intended.
print("Before scaling - Feature statistics:")
print(X.describe())

# The second return value is discarded (presumably the fitted scaler -- TODO confirm).
X_scaled, _ = preprocessor.scale_features(X)

print("\nAfter scaling - Feature statistics:")
print(X_scaled.describe())

# Overlay original vs. scaled histograms; only the first six numeric features
# are drawn so they fit the fixed 2x3 grid.
numerical_features = X.select_dtypes(include=[np.number]).columns[:6]  # Show first 6 for space

if len(numerical_features) > 0:
    plt.figure(figsize=(15, 8))

    for panel, feature in enumerate(numerical_features, start=1):
        plt.subplot(2, 3, panel)
        for series, series_label in ((X[feature], 'Original'), (X_scaled[feature], 'Scaled')):
            plt.hist(series, alpha=0.7, label=series_label, bins=30)
        plt.title(f'{feature}')
        plt.legend()

    plt.tight_layout()
    plt.show()

# Continue the pipeline with an independent copy of the scaled features
X = X_scaled.copy()

## 8. Remove Low Variance Features

In [None]:
# Remove low variance features
# NOTE(review): this step runs after feature scaling. If the scaler
# standardizes columns to unit variance, a 0.01 threshold can only remove
# constant columns -- confirm the intended order of these two steps.
print(f"Features before variance filtering: {X.shape[1]}")

# Combine features and target for variance analysis
df_for_variance = pd.concat([X, y], axis=1)

# pd.concat(axis=1) aligns rows on index labels with an outer join. If X and y
# carry different labels (for example, one of them had its index reset by an
# earlier step), the result silently gains NaN-padded rows instead of failing.
# Stop here rather than filter and save a corrupted frame.
assert len(df_for_variance) == len(X) == len(y), (
    "X and y index labels do not match: concatenation produced "
    f"{len(df_for_variance)} rows from {len(X)} feature rows and {len(y)} target rows"
)

df_variance_filtered = preprocessor.remove_low_variance_features(df_for_variance, threshold=0.01)

# Separate again
X_variance_filtered = df_variance_filtered.drop(columns=[target_col])
y_variance_filtered = df_variance_filtered[target_col]

print(f"Features after variance filtering: {X_variance_filtered.shape[1]}")
removed_features = set(X.columns) - set(X_variance_filtered.columns)
if removed_features:
    print(f"Removed low variance features: {removed_features}")
else:
    print("No low variance features found.")

# Continue the pipeline with independent copies of the filtered data
X = X_variance_filtered.copy()
y = y_variance_filtered.copy()

## 9. Final Data Quality Check

In [None]:
# Reassemble features and target into the frame that will be saved
final_df = pd.concat([X, y], axis=1)

# Headline numbers: size, null cells, exact duplicate rows
print("Final Data Quality Report:")
print("=" * 40)
print(f"Final dataset shape: {final_df.shape}")
print(f"Features: {X.shape[1]}")
print(f"Samples: {len(final_df)}")
print(f"Missing values: {final_df.isnull().sum().sum()}")
print(f"Duplicate rows: {final_df.duplicated().sum()}")

# Number of columns per dtype
print("Data types:")
for dtype_name, n_columns in final_df.dtypes.value_counts().items():
    print(f"  {dtype_name}: {n_columns} columns")

# Class balance of the target, as absolute counts and as proportions
print("\nTarget variable distribution:")
print(y.value_counts().sort_index())
print("\nTarget variable proportions:")
print(y.value_counts(normalize=True).sort_index())

# Memory usage
total_bytes = final_df.memory_usage(deep=True).sum()
memory_usage = total_bytes / 1024 / 1024
print(f"\nMemory usage: {memory_usage:.2f} MB")

## 10. Save Processed Data

In [None]:
# Make sure the output directory exists before writing; to_csv and open()
# do not create missing parent directories.
Path(PROCESSED_DATA_DIR).mkdir(parents=True, exist_ok=True)

# Save the cleaned dataset
processed_data_path = PROCESSED_DATA_DIR / "cleaned_stress_data.csv"
final_df.to_csv(processed_data_path, index=False)

print(f"Cleaned dataset saved to: {processed_data_path}")

# Save feature and target separately for convenience
features_path = PROCESSED_DATA_DIR / "features.csv"
target_path = PROCESSED_DATA_DIR / "target.csv"

X.to_csv(features_path, index=False)
y.to_csv(target_path, index=False)

print(f"Features saved to: {features_path}")
print(f"Target saved to: {target_path}")

# Save preprocessing information
# pandas reductions return NumPy integers, which json cannot serialize. They
# would fall through to `default=str` and be written as strings ("50"), so
# cast them to plain int to keep them numeric in the JSON file.
preprocessing_info = {
    'original_shape': df_original.shape,
    'final_shape': final_df.shape,
    'missing_values_handled': int(df_original.isnull().sum().sum()),
    'duplicates_removed': int(df_original.duplicated().sum()),
    # Net change in column count (original minus final).
    'features_removed': len(df_original.columns) - len(final_df.columns),
    'target_column': target_col,
    'feature_columns': list(X.columns),
    # .tolist() converts NumPy scalars in classes_ to native Python values.
    'categorical_encodings': {
        k: np.asarray(v.classes_).tolist() for k, v in preprocessor.encoders.items()
    }
}

info_path = PROCESSED_DATA_DIR / "preprocessing_info.json"
# Explicit encoding so the file does not depend on the platform default.
with open(info_path, 'w', encoding='utf-8') as f:
    # default=str stays as a fallback for any remaining non-JSON value.
    json.dump(preprocessing_info, f, indent=2, default=str)

print(f"Preprocessing information saved to: {info_path}")

## 11. Data Cleaning Summary

In [None]:
# Generate comprehensive cleaning summary: compare the untouched df_original
# against the final frame produced by the cleaning steps.
print("DATA CLEANING SUMMARY")
print("=" * 50)
print(f"Original dataset shape: {df_original.shape}")
print(f"Final dataset shape: {final_df.shape}")
print(f"Rows removed: {df_original.shape[0] - final_df.shape[0]}")
print(f"Columns changed: {df_original.shape[1]} -> {final_df.shape[1]}")

print("\nData Quality Improvements:")
print(f"✓ Missing values: {df_original.isnull().sum().sum()} -> {final_df.isnull().sum().sum()}")
print(f"✓ Duplicate rows: {df_original.duplicated().sum()} -> {final_df.duplicated().sum()}")
print("✓ Categorical variables encoded")
print("✓ Numerical features scaled")
print("✓ Low variance features removed")
# The outlier section only detects and plots outliers; no rows are removed or
# capped, so the summary must not claim they were handled.
print("✓ Outliers detected and reported (not removed)")

print("\nDataset is now ready for feature selection and model training!")
print("\nNext steps:")
print("1. Feature selection and engineering")
print("2. Train-test split")
print("3. Model training and evaluation")