In [None]:
# Smoke test: importing each package fails fast if the environment is incomplete.
import pandas
import numpy
import matplotlib
import seaborn
import scipy

print("✓ All packages available!")

# Datathon EDA Template

**Team Name:** ACM
**Date:** 11/06/2024 
**Competition:** Alteryx Datathon 
**Dataset:** Meridian City Hospital ER 

---

## Team Workflow Strategy

**Phase 1 (0-15 min): Together**
- Run Sections 0 & 1 as a team
- Discuss problem context and target variable
- Align on objectives

**Phase 2 (15-45 min): Parallel Work**
- **Member 1:** Sections 2 & 3 (Data quality and univariate analysis)
- **Member 2:** Section 4 (Bivariate relationships)
- **Member 3:** Section 5 (Multivariate patterns and modeling prep)

**Phase 3 (45-60 min): Together**
- Share findings (5 min each)
- Section 6: Brainstorm features together
- Section 7: Plan modeling strategy
- Assign next tasks


## Section 0: Setup & Imports


In [None]:
# Core imports: third-party analysis/plotting libraries plus standard-library helpers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from datetime import datetime
import os

# Display settings: show every column, cap printed rows and cell width so
# wide frames stay readable in notebook output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Visualization settings
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Older releases only know the un-prefixed name, so fall back instead of
# aborting the whole setup cell with an OSError.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Suppress warnings to keep notebook output compact.
# NOTE: this also hides deprecation/runtime warnings from pandas and seaborn;
# comment it out when debugging unexpected results.
warnings.filterwarnings('ignore')

# Initialize findings dictionary for systematic documentation.
# Every later section appends human-readable strings to one of these lists.
findings = {
    'data_quality_issues': [],
    'key_insights': [],
    'feature_ideas': [],
    'questions_for_team': [],
    'next_steps': []
}

print("✓ Section 0: Setup & Imports completed")


## Section 1: Load Multiple Datasets & Initial Inspection

**Team Activity:** Run together and discuss problem context

**Dataset:** Meridian City Hospital ER Data (5 related CSV files)

**Approach for Multiple Datasets:**
1. Load all datasets first
2. Understand relationships between datasets
3. Perform EDA on each dataset individually
4. Analyze relationships across datasets
5. Merge/join datasets if needed for analysis


In [None]:
# ============================================================================
# 1.1 LOAD ALL DATASETS
# ============================================================================

data_path = 'Meridian_City_Hospital_Data/'

# Short dataset key -> CSV file name inside data_path.
# Add or remove entries here; the loader below needs no other change.
dataset_files = {
    'facility': 'Hospital_Facility.csv',
    'outcomes': 'Hospital_Outcomes.csv',
    'patients': 'Hospital_Patients.csv',
    'staffing': 'Hospital_Staffing_EAST_LOCATION.csv',
    'visits': 'Hospital_Visits.csv',
}

# Load all 5 datasets
print("Loading all datasets...")
print("=" * 80)

datasets = {}

# Best-effort load: a file that fails is reported and skipped so the remaining
# datasets are still available for analysis.
for dataset_key, file_name in dataset_files.items():
    try:
        datasets[dataset_key] = pd.read_csv(os.path.join(data_path, file_name))
    except Exception as e:
        print(f"✗ Error loading {file_name}: {e}")
    else:
        print(f"✓ Loaded: {file_name}")

print(f"\n✓ Successfully loaded {len(datasets)} datasets")

# Make partial loads obvious: later cells index `datasets` by key and would
# otherwise fail far away from the real cause.
missing_dataset_keys = [key for key in dataset_files if key not in datasets]
if missing_dataset_keys:
    print(f"⚠ Datasets not loaded: {missing_dataset_keys}")

# ============================================================================
# 1.2 DATASET OVERVIEW - ALL DATASETS
# ============================================================================

print("\n" + "=" * 80)
print("DATASET OVERVIEW - ALL FILES")
print("=" * 80)

# NOTE: every loop in this cell binds the current DataFrame to `frame`, not
# `df`. `df` is the notebook-wide alias for the dataset selected in
# Section 1.5; reusing it as a loop variable here would silently replace the
# selected dataset whenever this cell is re-run.
dataset_summary = []
for name, frame in datasets.items():
    n_rows, n_columns = frame.shape
    # Deep memory usage is comparatively expensive; compute it once per dataset.
    memory_mb = frame.memory_usage(deep=True).sum() / 1024**2

    dataset_summary.append({
        'Dataset': name,
        'Rows': n_rows,
        'Columns': n_columns,
        'Memory (MB)': memory_mb,
        'Numerical Cols': len(frame.select_dtypes(include=[np.number]).columns),
        'Categorical Cols': len(frame.select_dtypes(include=['object', 'category']).columns),
        'Datetime Cols': len(frame.select_dtypes(include=['datetime64']).columns)
    })

    # Document initial observations
    findings['key_insights'].append(
        f"{name}: {n_rows:,} rows × {n_columns} columns "
        f"({memory_mb:.2f} MB)"
    )

summary_df = pd.DataFrame(dataset_summary)
display(summary_df)

# ============================================================================
# 1.3 COLUMN INFORMATION - ALL DATASETS
# ============================================================================

print("\n" + "=" * 80)
print("COLUMN INFORMATION - ALL DATASETS")
print("=" * 80)

for name, frame in datasets.items():
    print(f"\n{name.upper()} ({len(frame.columns)} columns):")
    # dtypes.items() yields (column name, dtype) pairs in column order
    for i, (col, col_dtype) in enumerate(frame.dtypes.items(), 1):
        print(f"  {i:2d}. {col:<30} ({str(col_dtype)})")

# ============================================================================
# 1.4 FIRST FEW ROWS OF EACH DATASET
# ============================================================================

print("\n" + "=" * 80)
print("FIRST 3 ROWS OF EACH DATASET")
print("=" * 80)

for name, frame in datasets.items():
    print(f"\n{'='*80}")
    print(f"{name.upper()}")
    print("="*80)
    display(frame.head(3))
    print(f"\nShape: {frame.shape[0]:,} rows × {frame.shape[1]} columns")

# ============================================================================
# 1.5 IDENTIFY COMMON COLUMNS (POTENTIAL JOIN KEYS)
# ============================================================================

print("\n" + "=" * 80)
print("IDENTIFYING COMMON COLUMNS (POTENTIAL JOIN KEYS)")
print("=" * 80)

# Column-name set per dataset, used for pairwise intersections below
all_columns = {name: set(frame.columns) for name, frame in datasets.items()}

# Find common columns for every unordered pair of datasets
print("\nCommon columns across datasets:")
common_columns = {}
dataset_names = list(datasets.keys())

for i, name1 in enumerate(dataset_names):
    for name2 in dataset_names[i+1:]:
        common = all_columns[name1] & all_columns[name2]
        if not common:
            continue
        key = f"{name1} <-> {name2}"
        common_columns[key] = common
        print(f"\n{key}:")
        for col in sorted(common):
            print(f"  • {col}")
            findings['key_insights'].append(f"Common column '{col}' found in {name1} and {name2} (potential join key)")

if not common_columns:
    print("  No common columns found. Datasets may need to be joined differently.")
    findings['questions_for_team'].append("How are these datasets related? What are the join keys?")

print("\n✓ Section 1: Load Multiple Datasets & Initial Inspection completed")


## Section 1.5: Dataset Relationships & Selection Helper

**Team Activity:** Understand how datasets relate before deep analysis

**Strategy:**
- Analyze relationships between datasets
- Identify join keys
- Select which dataset(s) to analyze in detail
- Decide if you need merged datasets for analysis


In [None]:
# ============================================================================
# 1.5.1 DATASET RELATIONSHIP ANALYSIS
# ============================================================================

print("=" * 80)
print("DATASET RELATIONSHIP ANALYSIS")
print("=" * 80)

# Look for columns whose names suggest they link one dataset to another
print("\nAnalyzing potential relationships...")

# Substrings (case-sensitive) that commonly appear in ID/key column names
id_patterns = ['id', 'ID', 'Id', '_id', 'key', 'Key', 'code', 'Code', 
               'patient', 'Patient', 'visit', 'Visit', 'facility', 'Facility',
               'hospital', 'Hospital', 'staff', 'Staff']

print("\nPotential ID/Key columns in each dataset:")
for name, df in datasets.items():
    potential_ids = [
        column_name
        for column_name in df.columns
        if any(fragment in column_name for fragment in id_patterns)
    ]
    if not potential_ids:
        continue

    print(f"\n{name.upper()}:")
    total_count = len(df)
    for col in potential_ids:
        unique_count = df[col].nunique()
        print(f"  • {col}: {unique_count:,} unique values / {total_count:,} total")
        # As many distinct values as rows: the column can identify a row
        if unique_count == total_count:
            print("    ✓ Primary key candidate (all unique)")

# ============================================================================
# 1.5.2 CREATE MERGED DATASET (OPTIONAL)
# ============================================================================

print("\n" + "=" * 80)
print("DATASET MERGING GUIDANCE")
print("=" * 80)

# Static guidance text, printed one line at a time
for guidance_line in (
    "\n💡 TIP: Before merging, identify the correct join keys.",
    "   Common relationships in hospital data:",
    "   - Patients <-> Visits (Patient ID)",
    "   - Visits <-> Outcomes (Visit ID)",
    "   - Visits <-> Facility (Facility ID)",
    "   - Staffing <-> Facility (Location/Facility ID)",
):
    print(guidance_line)

# Example merge (uncomment and modify based on your actual join keys):
# merged_df = datasets['patients'].merge(datasets['visits'], on='PatientID', how='inner')
# print(f"\nMerged patients + visits: {merged_df.shape[0]:,} rows × {merged_df.shape[1]} columns")

# ============================================================================
# 1.5.3 DATASET SELECTION HELPER
# ============================================================================

print("\n" + "=" * 80)
print("SELECT DATASET FOR DETAILED ANALYSIS")
print("=" * 80)

print("\nAvailable datasets:")
for i, (name, frame) in enumerate(datasets.items(), 1):
    print(f"  {i}. {name}: {frame.shape[0]:,} rows × {frame.shape[1]} columns")

print("\n💡 INSTRUCTIONS:")
print("   1. Choose which dataset(s) to analyze in detail")
print("   2. Set 'current_dataset_name' to the dataset name below")
print("   3. Sections 2-5 will analyze the selected dataset")
print("   4. Repeat for other datasets as needed")

# Set the dataset you want to analyze in detail
# Options: 'facility', 'outcomes', 'patients', 'staffing', 'visits'
# Or create a merged dataset above, store it in `datasets` and use its key

current_dataset_name = 'visits'  # CHANGE THIS to analyze a different dataset

# Fail with an actionable message when the chosen dataset is missing (for
# example because its CSV failed to load) instead of a bare KeyError.
if current_dataset_name not in datasets:
    raise KeyError(
        f"Dataset '{current_dataset_name}' is not loaded. "
        f"Available datasets: {list(datasets)}"
    )

# Work on a copy so the analysis never mutates the originally loaded frame
current_dataset = datasets[current_dataset_name].copy()

print(f"\n✓ Selected dataset for detailed analysis: {current_dataset_name}")
print(f"  Shape: {current_dataset.shape[0]:,} rows × {current_dataset.shape[1]} columns")

# Create alias 'df' for compatibility with rest of the template.
# It is a separate copy: later sections add columns to `df`, and
# `current_dataset` stays untouched.
df = current_dataset.copy()

print("\n✓ Section 1.5: Dataset Relationships & Selection completed")


### Quick Comparison: Data Quality Across All Datasets (Optional)

Run this cell to quickly compare data quality metrics across all datasets.


In [None]:
# ============================================================================
# QUICK DATA QUALITY COMPARISON - ALL DATASETS
# ============================================================================

print("=" * 80)
print("QUICK DATA QUALITY COMPARISON - ALL DATASETS")
print("=" * 80)

# One summary row per dataset: size, share of missing cells, duplicate rows
quality_comparison = []

for name, df_temp in datasets.items():
    n_rows, n_columns = df_temp.shape
    total_cells = df_temp.size  # rows * columns

    # Guard both ratios against empty frames so the table shows 0.00% rather
    # than nan% (the duplicate ratio was already guarded; missing was not).
    missing_pct = (df_temp.isnull().sum().sum() / total_cells) * 100 if total_cells > 0 else 0
    duplicate_count = df_temp.duplicated().sum()
    duplicate_pct = (duplicate_count / n_rows) * 100 if n_rows > 0 else 0

    quality_comparison.append({
        'Dataset': name,
        'Rows': n_rows,
        'Columns': n_columns,
        'Missing %': f"{missing_pct:.2f}%",
        'Duplicates': duplicate_count,
        'Duplicate %': f"{duplicate_pct:.2f}%",
        'Memory (MB)': f"{df_temp.memory_usage(deep=True).sum() / 1024**2:.2f}"
    })

quality_df = pd.DataFrame(quality_comparison)
display(quality_df)

print("\n💡 Use this to prioritize which datasets need more attention during detailed analysis.")


## Section 2: Data Quality Assessment

**Assigned to: Member 1**  
**Time: 15-30 minutes**

**Note:** This section analyzes the currently selected dataset (`df`). 
To analyze a different dataset, go back to Section 1.5 and change `current_dataset_name`.

**For Multiple Datasets:** You can modify this section to loop through all datasets if needed.


In [None]:
# Section banner: name the dataset under review and report its dimensions
print("=" * 80)
print(f"DATA QUALITY ASSESSMENT - {current_dataset_name.upper()}")
print("=" * 80)
print(f"Analyzing dataset: {current_dataset_name}")
row_total, column_total = df.shape
print(f"Shape: {row_total:,} rows × {column_total} columns")

# 2.1 Missing Values Analysis
print("\n" + "-" * 80)
print("2.1 MISSING VALUES ANALYSIS")
print("-" * 80)

# Null count per column and its share of all rows
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing Count': missing_data.values,
    'Missing Percentage': missing_percent.values
})
# Keep only the affected columns, worst offenders first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df[has_missing].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("✓ No missing values found in the dataset")
    findings['key_insights'].append("No missing values detected in the dataset")
else:
    display(missing_df)

    # Horizontal bar chart; the figure grows with the number of affected columns
    chart_height = max(6, len(missing_df) * 0.5)
    plt.figure(figsize=(12, chart_height))
    sns.barplot(data=missing_df, y='Column', x='Missing Percentage', palette='Reds_r')
    plt.title('Missing Values by Column', fontsize=14, fontweight='bold')
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Column', fontsize=12)
    plt.tight_layout()
    plt.show()

    # Record one data-quality note per affected column
    affected_rows = zip(
        missing_df['Column'],
        missing_df['Missing Count'],
        missing_df['Missing Percentage'],
    )
    for column_label, null_count, null_pct in affected_rows:
        findings['data_quality_issues'].append(
            f"{column_label}: {null_count:,} missing values ({null_pct:.2f}%)"
        )

# 2.2 Duplicate Rows
print("\n" + "-" * 80)
print("2.2 DUPLICATE ROWS")
print("-" * 80)

# Rows that exactly repeat an earlier row (the first occurrence is not counted)
duplicate_count = df.duplicated().sum()
duplicate_share = duplicate_count/len(df)*100
print(f"Total duplicate rows: {duplicate_count:,} ({duplicate_share:.2f}%)")

if duplicate_count > 0:
    print("\nSample duplicate rows:")
    # keep=False marks every member of a duplicate group, including the first
    all_duplicate_members = df[df.duplicated(keep=False)]
    display(all_duplicate_members.head(10))
    findings['data_quality_issues'].append(f"{duplicate_count:,} duplicate rows found ({duplicate_share:.2f}%)")
else:
    print("✓ No duplicate rows found")
    findings['key_insights'].append("No duplicate rows detected")

# 2.3 Data Type Verification
print("\n" + "-" * 80)
print("2.3 DATA TYPE VERIFICATION")
print("-" * 80)

# Partition columns by dtype family; later sections reuse these lists
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

print(f"Numerical columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")
print(f"Datetime columns: {len(datetime_cols)}")

# Check for mixed types in object columns
print("\nChecking for mixed types in object columns...")
mixed_type_cols = []
for col in categorical_cols:
    # Count how many of the *present* values parse as numbers. Missing values
    # are excluded from the comparison: a fully numeric column that merely
    # contains NaNs is a missing-data issue (reported in 2.1), not mixed typing.
    non_null_count = df[col].notna().sum()
    numeric_like_count = pd.to_numeric(df[col], errors='coerce').notna().sum()
    if 0 < numeric_like_count < non_null_count:
        mixed_type_cols.append(col)
        findings['data_quality_issues'].append(f"{col}: Mixed data types detected (numeric and non-numeric)")

if mixed_type_cols:
    print(f"⚠ Found {len(mixed_type_cols)} columns with mixed types: {mixed_type_cols}")
else:
    print("✓ No mixed type columns detected")

# 2.4 Constant Columns
print("\n" + "-" * 80)
print("2.4 CONSTANT COLUMNS")
print("-" * 80)

# A column with at most one distinct non-null value carries no information
constant_cols = []
for col in df.columns:
    distinct_values = df[col].nunique()
    if distinct_values > 1:
        continue
    constant_cols.append(col)
    findings['data_quality_issues'].append(f"{col}: Constant column (only {distinct_values} unique value)")

if not constant_cols:
    print("✓ No constant columns found")
else:
    print(f"⚠ Found {len(constant_cols)} constant columns: {constant_cols}")

# 2.5 Data Quality Summary
print("\n" + "=" * 80)
print("DATA QUALITY SUMMARY")
print("=" * 80)

# Recap of the counts gathered in 2.1-2.4, printed in a fixed order
summary_lines = [
    f"\nTotal columns: {df.shape[1]}",
    f"Columns with missing values: {len(missing_df)}",
    f"Duplicate rows: {duplicate_count:,}",
    f"Mixed type columns: {len(mixed_type_cols)}",
    f"Constant columns: {len(constant_cols)}",
    f"Total data quality issues: {len(findings['data_quality_issues'])}",
]
for summary_line in summary_lines:
    print(summary_line)

print("\n✓ Section 2: Data Quality Assessment completed")


## Section 3: Univariate Analysis

**Assigned to: Member 1**  
**Time: 15-30 minutes**


In [None]:
print("=" * 80)
print("UNIVARIATE ANALYSIS")
print("=" * 80)

# 3.1 Separate Numerical and Categorical Columns
# Recomputed here so this section also works when Section 2 was skipped
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\nNumerical columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# 3.2 Descriptive Statistics for Numerical Variables
if numeric_cols:
    print("\n" + "-" * 80)
    print("3.2 DESCRIPTIVE STATISTICS (NUMERICAL)")
    print("-" * 80)

    numeric_frame = df[numeric_cols]

    # describe() output, one row per column, extended with shape and
    # missingness measures
    desc_stats = numeric_frame.describe().T
    desc_stats['skewness'] = numeric_frame.skew()
    desc_stats['kurtosis'] = numeric_frame.kurtosis()
    desc_stats['missing_count'] = numeric_frame.isnull().sum()
    desc_stats['missing_pct'] = (desc_stats['missing_count'] / len(df)) * 100

    display(desc_stats)

    # Note every column whose |skewness| exceeds 1
    for col in numeric_cols:
        skew_val = desc_stats.loc[col, 'skewness']
        if abs(skew_val) > 1:
            tail_side = 'right' if skew_val > 0 else 'left'
            findings['key_insights'].append(
                f"{col}: Highly {tail_side}-skewed distribution (skewness={skew_val:.2f})"
            )

# 3.3 Outlier Detection (IQR Method)
if numeric_cols:
    print("\n" + "-" * 80)
    print("3.3 OUTLIER DETECTION (IQR METHOD)")
    print("-" * 80)

    outlier_summary = []

    for col in numeric_cols:
        column_values = df[col]

        # Tukey fences: 1.5 * IQR beyond the first and third quartiles
        Q1 = column_values.quantile(0.25)
        Q3 = column_values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outside_fences = (column_values < lower_bound) | (column_values > upper_bound)
        outliers = column_values[outside_fences]
        outlier_count = len(outliers)
        outlier_pct = (outlier_count / len(df)) * 100

        if outlier_count == 0:
            continue

        outlier_summary.append({
            'Column': col,
            'Outlier Count': outlier_count,
            'Outlier Percentage': outlier_pct,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound
        })

        # Only a share above 5% is recorded as a data-quality issue
        if outlier_pct > 5:
            findings['data_quality_issues'].append(
                f"{col}: {outlier_count:,} outliers ({outlier_pct:.2f}%) detected using IQR method"
            )

    if not outlier_summary:
        print("✓ No significant outliers detected using IQR method")
    else:
        outlier_df = pd.DataFrame(outlier_summary)
        display(outlier_df)

# 3.4 Categorical Variable Analysis
if categorical_cols:
    print("\n" + "-" * 80)
    print("3.4 CATEGORICAL VARIABLE CARDINALITY")
    print("-" * 80)

    cat_summary = []
    for col in categorical_cols:
        unique_count = df[col].nunique()
        modal_values = df[col].mode()
        level_counts = df[col].value_counts()

        # A column with no non-null values has neither a mode nor a top count
        has_levels = unique_count > 0
        top_level = modal_values[0] if len(modal_values) > 0 else 'N/A'
        top_level_count = level_counts.iloc[0] if has_levels else 0
        top_level_pct = (level_counts.iloc[0] / len(df) * 100) if has_levels else 0

        cat_summary.append({
            'Column': col,
            'Unique Values': unique_count,
            'Most Frequent': top_level,
            'Most Frequent Count': top_level_count,
            'Most Frequent %': top_level_pct
        })

        # More than 50 levels is noted as high cardinality
        if unique_count > 50:
            findings['key_insights'].append(
                f"{col}: High cardinality categorical variable ({unique_count} unique values)"
            )

    cat_df = pd.DataFrame(cat_summary)
    display(cat_df)

# 3.5 Distribution Visualizations - Numerical
if numeric_cols:
    print("\n" + "-" * 80)
    print("3.5 DISTRIBUTION VISUALIZATIONS (NUMERICAL)")
    print("-" * 80)

    # Grid of at most 3 columns, with as many rows as needed (ceiling division)
    n_cols = min(3, len(numeric_cols))
    n_rows = -(-len(numeric_cols) // n_cols)

    # squeeze=False always returns a 2-D array of axes, so one flatten call
    # covers the single-plot case too
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # One histogram per numerical column
    for ax, col in zip(axes, numeric_cols):
        df[col].hist(bins=50, ax=ax, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col, fontsize=10)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.grid(True, alpha=0.3)

    # Hide the grid cells that received no histogram
    for spare_ax in axes[len(numeric_cols):]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

# 3.6 Value Counts - Categorical
if categorical_cols:
    print("\n" + "-" * 80)
    print("3.6 VALUE COUNTS (CATEGORICAL - TOP 10)")
    print("-" * 80)

    # Only the first 5 categorical columns, to keep the output manageable
    for col in categorical_cols[:5]:
        print(f"\n{col}:")
        value_counts = df[col].value_counts().head(10)
        display(value_counts.to_frame('Count'))

        # Skip the chart for columns with more than 20 distinct levels
        if df[col].nunique() > 20:
            continue

        plt.figure(figsize=(10, 6))
        value_counts.plot(kind='bar')
        plt.title(f'Value Counts: {col}', fontsize=12, fontweight='bold')
        plt.xlabel(col, fontsize=10)
        plt.ylabel('Count', fontsize=10)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

print("\n✓ Section 3: Univariate Analysis completed")


## Section 4: Bivariate Analysis

**Assigned to: Member 2**  
**Time: 15-30 minutes**


In [None]:
print("=" * 80)
print("BIVARIATE ANALYSIS")
print("=" * 80)

# 4.1 Correlation Matrix
# A correlation needs at least two numerical columns
if len(numeric_cols) > 1:
    print("\n" + "-" * 80)
    print("4.1 CORRELATION MATRIX")
    print("-" * 80)

    correlation_matrix = df[numeric_cols].corr()

    # Tabular view
    display(correlation_matrix)

    # Heatmap view: the figure grows with the number of features, and the
    # upper triangle (diagonal included) is masked because it mirrors the lower
    feature_count = len(numeric_cols)
    plt.figure(figsize=(max(10, feature_count), max(8, feature_count)))
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Correlation Matrix (Numerical Features)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# 4.2 High Correlation Detection
if len(numeric_cols) > 1:
    print("\n" + "-" * 80)
    print("4.2 HIGH CORRELATION PAIRS (|r| > 0.7)")
    print("-" * 80)

    high_corr_pairs = []
    corr_columns = correlation_matrix.columns

    # Visit each unordered pair once (strictly above the diagonal)
    for i, col1 in enumerate(corr_columns):
        for j in range(i + 1, len(corr_columns)):
            col2 = corr_columns[j]
            corr_val = correlation_matrix.iloc[i, j]

            # Written as "not >" so NaN correlations are skipped as well
            if not abs(corr_val) > 0.7:
                continue

            high_corr_pairs.append({
                'Feature 1': col1,
                'Feature 2': col2,
                'Correlation': corr_val
            })
            findings['key_insights'].append(
                f"High correlation between {col1} and {col2}: {corr_val:.3f}"
            )

    if not high_corr_pairs:
        print("✓ No highly correlated pairs found (|r| > 0.7)")
    else:
        high_corr_df = pd.DataFrame(high_corr_pairs)
        display(high_corr_df.sort_values('Correlation', key=abs, ascending=False))

# 4.3 Target Variable Relationship Analysis
# Point target_col at the column you want to explain or predict. This block is
# live code: if the column is absent from df, a warning is printed instead.

target_col = 'Visit ID'  # SET YOUR TARGET COLUMN HERE

if target_col in df.columns:
    print("\n" + "-" * 80)
    print(f"4.3 TARGET VARIABLE ANALYSIS: {target_col}")
    print("-" * 80)

    target_values = df[target_col]

    # Classify the target with the pandas dtype helpers. Comparing a dtype with
    # np.number, or with the strings 'object'/'category', is unreliable: a
    # concrete dtype such as int64 is not equal to the abstract np.number type.
    target_is_numeric = (
        pd.api.types.is_numeric_dtype(target_values)
        and not pd.api.types.is_bool_dtype(target_values)
    )
    target_is_categorical = (
        pd.api.types.is_object_dtype(target_values)
        or isinstance(target_values.dtype, pd.CategoricalDtype)
        or pd.api.types.is_string_dtype(target_values)
    )

    # If target is numerical
    if target_is_numeric:
        # Exclude the target itself; otherwise it is always the top "feature"
        # with r = 1.0.
        feature_cols = [col for col in numeric_cols if col != target_col]

        if feature_cols:
            # Correlation with target, strongest (by absolute value) first
            target_corr = (
                df[feature_cols]
                .corrwith(target_values)
                .sort_values(key=abs, ascending=False)
            )
            print("\nCorrelation with target:")
            display(target_corr.to_frame('Correlation'))

            # Top correlated features
            top_features = target_corr.abs().nlargest(10).index.tolist()
            findings['key_insights'].append(f"Top features correlated with {target_col}: {top_features[:5]}")

            # Scatter plots for up to six of the top features on a 2x3 grid
            n_top = min(6, len(top_features))
            if n_top > 0:
                fig, axes = plt.subplots(2, 3, figsize=(18, 12))
                axes = axes.flatten()

                for ax, feature in zip(axes, top_features[:n_top]):
                    ax.scatter(df[feature], target_values, alpha=0.5, s=20)
                    ax.set_xlabel(feature, fontsize=10)
                    ax.set_ylabel(target_col, fontsize=10)
                    ax.set_title(f'{feature} vs {target_col}\n(r={target_corr[feature]:.3f})', fontsize=11)
                    ax.grid(True, alpha=0.3)

                # Hide grid cells that received no plot
                for unused_ax in axes[n_top:]:
                    unused_ax.axis('off')

                plt.tight_layout()
                plt.show()
        else:
            print("\n⚠ No other numerical features available to correlate with the target")

    # If target is categorical
    elif target_is_categorical:
        # Class distribution
        print("\nTarget class distribution:")
        target_dist = target_values.value_counts()
        display(target_dist.to_frame('Count'))

        # Visualize class distribution
        plt.figure(figsize=(10, 6))
        target_dist.plot(kind='bar')
        plt.title(f'Distribution of {target_col}', fontsize=12, fontweight='bold')
        plt.xlabel(target_col, fontsize=10)
        plt.ylabel('Count', fontsize=10)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        # Box plots for up to six numerical features, split by target class
        if len(numeric_cols) > 0:
            n_features = min(6, len(numeric_cols))
            fig, axes = plt.subplots(2, 3, figsize=(18, 12))
            axes = axes.flatten()

            for ax, feature in zip(axes, numeric_cols[:n_features]):
                df.boxplot(column=feature, by=target_col, ax=ax)
                ax.set_title(f'{feature} by {target_col}', fontsize=11)
                ax.set_xlabel(target_col, fontsize=10)
                ax.set_ylabel(feature, fontsize=10)
                plt.suptitle('')  # Remove the "Boxplot grouped by ..." title pandas adds

            # Hide grid cells that received no plot
            for unused_ax in axes[n_features:]:
                unused_ax.axis('off')

            plt.tight_layout()
            plt.show()

    # Any other dtype (e.g. boolean or datetime): say so instead of staying silent
    else:
        print(f"\n⚠ Target column '{target_col}' has dtype {target_values.dtype}; no target analysis is defined for it")
else:
    print(f"\n⚠ Target column '{target_col}' not found in dataset")

print("\n✓ Section 4: Bivariate Analysis completed")


## Section 5: Multivariate Analysis & Patterns

**Assigned to: Member 3**  
**Time: 15-30 minutes**


In [None]:
print("=" * 80)
print("MULTIVARIATE ANALYSIS & PATTERNS")
print("=" * 80)

# 5.1 Pairplot for Pattern Detection
# Drawn only for 2-8 numerical columns; larger grids become unreadable and slow
numeric_feature_count = len(numeric_cols)
if 1 < numeric_feature_count <= 8:
    print("\n" + "-" * 80)
    print("5.1 PAIRPLOT (PATTERN DETECTION)")
    print("-" * 80)
    print("\nGenerating pairplot (this may take a moment)...")

    # Frames above 1000 rows are reduced to a reproducible 1000-row sample
    sample_size = min(1000, len(df))
    if len(df) > 1000:
        df_sample = df[numeric_cols].sample(n=sample_size, random_state=42)
    else:
        df_sample = df[numeric_cols]

    # Scatter plots off the diagonal, KDE curves on it
    sns.pairplot(df_sample, diag_kind='kde', plot_kws={'alpha': 0.6, 's': 20})
    plt.suptitle('Pairplot of Numerical Features', y=1.02, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    findings['key_insights'].append("Pairplot generated to identify multivariate patterns and clusters")
elif numeric_feature_count > 8:
    print("\n⚠ Too many numerical columns for pairplot. Consider selecting key features.")
    findings['questions_for_team'].append("Which numerical features should we focus on for multivariate analysis?")

# 5.2 Class Imbalance Detection (if target exists)
# Uncomment if you have a target variable
# if 'target_col' in locals() and target_col in df.columns:
#     if df[target_col].dtype in ['object', 'category']:
#         print("\n" + "-" * 80)
#         print("5.2 CLASS IMBALANCE DETECTION")
#         print("-" * 80)
#         
#         class_counts = df[target_col].value_counts()
#         class_proportions = class_counts / len(df)
#         
#         print("\nClass distribution:")
#         display(class_proportions.to_frame('Proportion'))
#         
#         # Check for imbalance (threshold: any class < 10%)
#         min_proportion = class_proportions.min()
#         if min_proportion < 0.1:
#             findings['data_quality_issues'].append(
#                 f"Class imbalance detected: Minority class represents {min_proportion*100:.2f}% of data"
#             )
#             findings['next_steps'].append("Consider using class balancing techniques (SMOTE, undersampling, etc.)")

# 5.3 Time-based Pattern Analysis
# NOTE: only columns that already have a datetime dtype are picked up here.
# Dates that were read from CSV as plain strings must be converted with
# pd.to_datetime beforehand, otherwise this section reports no datetime columns.
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_cols:
    print("\n" + "-" * 80)
    print("5.3 TIME-BASED PATTERN ANALYSIS")
    print("-" * 80)

    for col in datetime_cols[:2]:  # Limit to first 2 datetime columns
        print(f"\nAnalyzing {col}...")

        # Parse once and reuse; the column itself is not modified below
        timestamps = pd.to_datetime(df[col])

        # Extract time components as new columns on df
        df[f'{col}_year'] = timestamps.dt.year
        df[f'{col}_month'] = timestamps.dt.month
        df[f'{col}_day'] = timestamps.dt.day
        df[f'{col}_dayofweek'] = timestamps.dt.dayofweek

        # The time features exist whether or not anything is plotted, so the
        # idea is recorded unconditionally.
        findings['feature_ideas'].append(f"Extract time features from {col}: year, month, day, dayofweek, hour, etc.")

        # Time series plot if we have a numerical target or feature
        if len(numeric_cols) > 0:
            # Daily mean of the first numerical column
            plt.figure(figsize=(14, 6))
            time_series = df.groupby(timestamps.dt.date)[numeric_cols[0]].mean()
            time_series.plot()
            plt.title(f'{numeric_cols[0]} Over Time ({col})', fontsize=12, fontweight='bold')
            plt.xlabel('Date', fontsize=10)
            plt.ylabel(numeric_cols[0], fontsize=10)
            plt.xticks(rotation=45)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
else:
    print("\n✓ No datetime columns found for time-based analysis")

# 5.4 Cluster and Grouping Identification
print("\n" + "-" * 80)
print("5.4 CLUSTER IDENTIFICATION PREPARATION")
print("-" * 80)
print("\nReview pairplot and correlation matrix for potential clusters or groups.")
print("Consider using dimensionality reduction (PCA, t-SNE) if needed.")

# With three or more numerical features, raise the question for the team
if len(numeric_cols) > 2:
    findings['questions_for_team'].append("Should we apply dimensionality reduction techniques (PCA, t-SNE) for visualization?")

# 5.5 Preparation for Modeling Insights
print("\n" + "-" * 80)
print("5.5 MODELING PREPARATION INSIGHTS")
print("-" * 80)

# Feature counts are always reported
modeling_insights = [
    f"Total features available: {df.shape[1]}",
    f"Numerical features: {len(numeric_cols)}",
    f"Categorical features: {len(categorical_cols)}",
]

# Size-based advice for small (< 1,000 rows) or large (> 100,000 rows) data
row_count = df.shape[0]
if row_count < 1000:
    modeling_insights.append("Small dataset: Consider simpler models or data augmentation")
elif row_count > 100000:
    modeling_insights.append("Large dataset: Can support complex models, consider sampling for faster iteration")

# Remind the team of any data-quality issues recorded so far
issue_count = len(findings['data_quality_issues'])
if issue_count > 0:
    modeling_insights.append(f"Data quality issues to address: {issue_count}")

# Print each insight, then carry them all over as next steps
for insight in modeling_insights:
    print(f"• {insight}")
findings['next_steps'].extend(modeling_insights)

print("\n✓ Section 5: Multivariate Analysis & Patterns completed")


## Section 6: Feature Engineering Ideas

**Team Activity:** Brainstorm together


In [None]:
# Section banner and instructions for the brainstorming session.
print("=" * 80, "FEATURE ENGINEERING IDEAS", "=" * 80, sep="\n")
print(
    "\nUse this section to brainstorm and document feature engineering ideas.",
    "Add your ideas to the findings dictionary as you discuss.",
    sep="\n",
)

# 6.1 Numerical Feature Transformations
print("\n" + "-" * 80, "6.1 NUMERICAL FEATURE TRANSFORMATIONS", "-" * 80, sep="\n")

# Generic menu of transformations, printed below as a bulleted list.
numerical_transforms = [
    "Polynomial features (x², x³) for non-linear relationships",
    "Binning/bucketing for continuous variables",
    "Log/Box-Cox transformations for skewed distributions",
    "Standardization/Normalization (StandardScaler, MinMaxScaler)",
    "Robust scaling for outlier-resistant normalization",
    "Power transforms (square root, cube root)",
]

print("\nPotential transformations:")
for transform in numerical_transforms:
    print("  • " + transform)

# Data-driven ideas: among the first five numerical columns, record a
# transform suggestion for every column whose skewness lies outside [-1, 1].
# The two-sided comparison is False for NaN, so such columns are skipped.
for col in numeric_cols[:5]:
    skew_val = df[col].skew()
    if skew_val > 1 or skew_val < -1:
        findings['feature_ideas'].append(
            f"Apply log/Box-Cox transform to {col} (skewness={skew_val:.2f})"
        )

# 6.2 Categorical Encoding Strategies
print("\n" + "-" * 80, "6.2 CATEGORICAL ENCODING STRATEGIES", "-" * 80, sep="\n")

# Generic menu of encodings, printed below as a bulleted list.
categorical_encodings = [
    "One-hot encoding for low cardinality (< 10 unique values)",
    "Label encoding for ordinal categories",
    "Target encoding for high cardinality categories",
    "Frequency encoding (replace with value counts)",
    "Binary encoding for very high cardinality",
    "Embedding for deep learning models",
]

print("\nPotential encoding strategies:")
for encoding in categorical_encodings:
    print("  • " + encoding)

# Data-driven ideas: pick a suggestion from each categorical column's number
# of distinct values. Columns with 10 to 50 distinct values get no entry.
for col in categorical_cols:
    unique_count = df[col].nunique()
    if unique_count < 10:
        suggestion = f"One-hot encode {col} ({unique_count} categories)"
    elif unique_count > 50:
        suggestion = f"Consider target/frequency encoding for {col} (high cardinality: {unique_count})"
    else:
        continue
    findings['feature_ideas'].append(suggestion)

# 6.3 Datetime Feature Extraction
# The whole subsection (heading included) is skipped when datetime_cols is empty.
if len(datetime_cols) != 0:
    print("\n" + "-" * 80, "6.3 DATETIME FEATURE EXTRACTION", "-" * 80, sep="\n")

    # Generic menu of datetime-derived features, printed as a bulleted list.
    datetime_features = [
        "Extract: year, month, day, hour, minute, second",
        "Extract: day of week, day of year, week of year",
        "Extract: is_weekend, is_month_start, is_month_end",
        "Extract: quarter, semester",
        "Time since reference date",
        "Cyclical encoding (sin/cos) for periodic features",
    ]

    print("\nPotential datetime features:")
    for feature in datetime_features:
        print("  • " + feature)

    # Record one extraction idea per datetime column.
    findings['feature_ideas'].extend(
        [
            f"Extract time components from {col}: year, month, day, hour, dayofweek, etc."
            for col in datetime_cols
        ]
    )

# 6.4 Domain-Specific Features
# Static prompts only: nothing here reads the data or updates findings.
print("\n" + "-" * 80, "6.4 DOMAIN-SPECIFIC FEATURES", "-" * 80, sep="\n")
print(
    "\nDiscuss domain knowledge and create relevant features.",
    "Examples:",
    "  • Ratio features (e.g., price per unit, density)",
    "  • Interaction features (e.g., product of two features)",
    "  • Aggregate features (e.g., mean, max, min by group)",
    "  • Distance/Similarity features",
    "  • Text features (if applicable): word count, sentiment, etc.",
    sep="\n",
)

# 6.5 Aggregate and Rolling Window Features
print("\n" + "-" * 80, "6.5 AGGREGATE & ROLLING WINDOW FEATURES", "-" * 80, sep="\n")

# Generic menu of aggregate-style features, printed as a bulleted list.
aggregate_features = [
    "Group-based aggregations (mean, median, std, min, max, count)",
    "Rolling window statistics (moving average, rolling std)",
    "Lag features (previous values)",
    "Cumulative statistics",
    "Rank-based features",
]

print("\nPotential aggregate features:")
for feature in aggregate_features:
    print("  • " + feature)

print(
    "\n✓ Section 6: Feature Engineering Ideas completed",
    "\n💡 TIP: Document your specific feature engineering ideas in the findings dictionary above.",
    sep="\n",
)


## Section 7: Summary & Action Items

**Team Activity:** Review together and plan next steps


In [None]:
print("=" * 80, "SUMMARY & ACTION ITEMS", "=" * 80, sep="\n")

# Sections 7.1 to 7.4 share one layout: a ruled heading followed by either a
# numbered list of the recorded findings or a placeholder line when the
# corresponding findings list is empty.
summary_sections = [
    # (heading, findings key, placeholder printed when the list is empty)
    ("7.1 DATA QUALITY ISSUES", 'data_quality_issues', "  ✓ No major data quality issues identified"),
    ("7.2 KEY INSIGHTS", 'key_insights', "  (No insights documented yet)"),
    ("7.3 FEATURE ENGINEERING IDEAS", 'feature_ideas', "  (No feature ideas documented yet)"),
    ("7.4 QUESTIONS FOR TEAM DISCUSSION", 'questions_for_team', "  (No questions documented yet)"),
]

for heading, findings_key, placeholder in summary_sections:
    print("\n" + "-" * 80, heading, "-" * 80, sep="\n")
    recorded = findings[findings_key]
    if len(recorded) == 0:
        print(placeholder)
        continue
    for i, entry in enumerate(recorded, 1):
        print(f"  {i}. {entry}")

# 7.5 Next Steps
print("\n" + "-" * 80, "7.5 NEXT STEPS", "-" * 80, sep="\n")

# Generic checklist that always follows the steps recorded during the EDA.
next_steps_default = [
    "Address data quality issues (missing values, duplicates, outliers)",
    "Implement feature engineering based on insights",
    "Split data into train/validation/test sets",
    "Select baseline models to try",
    "Set up cross-validation strategy",
    "Define evaluation metrics",
    "Create modeling pipeline",
]

# Recorded steps first, then the defaults; the export step reuses this list.
all_next_steps = findings['next_steps'] + next_steps_default
for i, step in enumerate(all_next_steps, 1):
    print(f"  {i}. {step}")

# 7.6 Export Findings to Text File
# Builds a plain-text report from the findings dictionary and the combined
# next-steps list, then writes it to `output_file` in the working directory.
print("\n" + "-" * 80)
print("7.6 EXPORTING FINDINGS")
print("-" * 80)

timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Report header. The leading blank line is kept from the original layout.
report_header = (
    "\n"
    "EDA FINDINGS REPORT\n"
    "===================\n"
    f"Generated: {timestamp}\n"
    f"Dataset: {df.shape[0]:,} rows × {df.shape[1]} columns\n"
)

# (title, entries, line written when there are no entries)
report_sections = [
    ("DATA QUALITY ISSUES", findings['data_quality_issues'], "No major data quality issues identified."),
    ("KEY INSIGHTS", findings['key_insights'], "No insights documented."),
    ("FEATURE ENGINEERING IDEAS", findings['feature_ideas'], "No feature ideas documented."),
    ("QUESTIONS FOR TEAM DISCUSSION", findings['questions_for_team'], "No questions documented."),
    ("NEXT STEPS", all_next_steps, "No next steps documented."),
]

report_parts = [report_header]
for section_title, section_items, empty_message in report_sections:
    # Every section heading is underlined with dashes matching its length, so
    # section headings stay visually distinct from the '='-underlined title.
    section_lines = [section_title, "-" * len(section_title)]
    if len(section_items) > 0:
        section_lines.extend(
            f"{number}. {item}" for number, item in enumerate(section_items, 1)
        )
    else:
        section_lines.append(empty_message)
    report_parts.append("\n".join(section_lines) + "\n")

# Joining with a newline leaves one blank line between consecutive parts.
export_content = "\n".join(report_parts)

# Write to file. The encoding is explicit because the report contains
# non-ASCII characters (e.g. '×'); relying on the locale default can fail or
# produce different bytes depending on the platform.
output_file = 'eda_findings.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(export_content)

print(f"\n✓ Findings exported to: {output_file}")
print("\n" + "=" * 80)
print("EDA COMPLETE!")
print("=" * 80)
print("\nNext: Review findings with team and proceed to feature engineering and modeling.")
