In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # House Price Prediction - Data Exploration
# MAGIC ## MLflow Production Pipeline - Step 1
# MAGIC 
# MAGIC **Objective**: Load, explore, and validate the house price dataset
# MAGIC 
# MAGIC **Author**: Satish  
# MAGIC **Date**: 2026-01-17
# MAGIC 
# MAGIC ---
# MAGIC 
# MAGIC ### What This Notebook Does:
# MAGIC - ✅ Load and validate raw data
# MAGIC - ✅ Perform exploratory data analysis (EDA)
# MAGIC - ✅ Check data quality (missing values, duplicates, outliers)
# MAGIC - ✅ Analyze feature distributions and correlations
# MAGIC - ✅ Generate data quality report
# MAGIC - ✅ Save summary statistics

In [0]:
# MAGIC %md
# MAGIC ## 1. Setup and Imports

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Setup and Imports

# COMMAND ----------

# Environment bootstrap: make the project package importable, evict stale
# copies of it from the module cache, then import everything the notebook uses.
import sys
import importlib

# Add project path
project_path = '/Workspace/COMM - Commercial Analytics (CMAN)/MMM Quattro 2025/Satish/MLFLOW_sample'
if project_path not in sys.path:
    sys.path.append(project_path)

# Remove cached src modules so edits to the project package are picked up on
# the next import. Only the package itself ('src') and its submodules
# ('src.*') are evicted; a bare startswith('src') would also drop unrelated
# modules whose names merely begin with "src".
modules_to_clear = [
    name for name in list(sys.modules)
    if name == 'src' or name.startswith('src.')
]
for module in modules_to_clear:
    # pop() tolerates a module that disappeared between listing and removal.
    sys.modules.pop(module, None)

print(f"üîÑ Cleared {len(modules_to_clear)} cached modules")

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import warnings
# NOTE(review): this silences *all* warnings for the whole session, including
# deprecation and numerical warnings from pandas/numpy - confirm that is intended.
warnings.filterwarnings('ignore')

# Custom imports (resolved through project_path added above)
from src.utils import (
    ConfigLoader, 
    DataLoader, 
    DataValidator,
    safe_display,
    setup_mlflow_databricks,
    log_dataset_summary
)

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("‚úÖ All imports successful!")
print(f"üì¶ Pandas version: {pd.__version__}")
print(f"üì¶ NumPy version: {np.__version__}")
print(f"üì¶ MLflow version: {mlflow.__version__}")

In [0]:
# %restart_python

In [0]:
# MAGIC %md
# MAGIC ## 2. Load Configuration

In [0]:
# Parse the project YAML configuration and point MLflow at the experiment it names.
config_path = '/Workspace/COMM - Commercial Analytics (CMAN)/MMM Quattro 2025/Satish/MLFLOW_sample/config/config.yaml'
config = ConfigLoader.load_config(config_path)

# MLflow is configured from the same config object.
setup_mlflow_databricks(config)

# Confirmation banner: project identity and the experiment in use.
print(
    "‚úÖ Configuration & MLflow ready!",
    f"üìÅ Project: {config['project']['name']} v{config['project']['version']}",
    f"üî¨ MLflow Experiment: {config['mlflow']['experiment_name']}",
    sep="\n",
)


In [0]:
# MAGIC %md
# MAGIC ## 3. Load Data

In [0]:
# Read the raw dataset from the location named in the configuration.
data_path = config['data']['raw_path']
df = DataLoader.load_csv(data_path)

# Quick sanity check: dimensions plus a preview of the first rows.
print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst few rows:")
display(df.head(10))


In [0]:
# MAGIC %md
# MAGIC ## 4. Data Overview


In [0]:
# High-level description of the loaded frame: size, columns, dtypes, memory.
banner = "=" * 60
print(banner, "DATASET OVERVIEW", banner, sep="\n")

row_count, column_count = df.shape
print(f"\nShape: {row_count} rows √ó {column_count} columns")
print(f"\nColumn Names:\n{list(df.columns)}")
print(f"\nData Types:\n{df.dtypes}")

# deep=True asks pandas to include the contents of object columns in the total.
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nMemory Usage: {total_bytes / 1024:.2f} KB")


In [0]:
# Descriptive statistics as produced by DataFrame.describe().
banner = "=" * 60
print(banner, "STATISTICAL SUMMARY", banner, sep="\n")
display(df.describe())


In [0]:
# MAGIC %md
# MAGIC ## 5. Data Quality Check


In [0]:
# Preview the first rows again before the quality checks (rendered as the cell output).
df.head()

In [0]:
# COMMAND ----------

# Missing-value audit: tabulate per-column gaps, chart them when any exist,
# and record the totals on the active MLflow run.
banner = "=" * 60
print(banner)
print("MISSING VALUES ANALYSIS")
print(banner)

validator = DataValidator()
# Used below through .index / .values / .sum(), i.e. one count per column.
missing_values = validator.check_missing_values(df)

# One row per column: absolute count plus the share of rows affected.
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percent': (missing_values.values / len(df)) * 100
})

# Keep only the columns that actually have gaps.
has_gaps = missing_df['Missing_Count'] > 0
missing_filtered = missing_df[has_gaps]

if missing_filtered.empty:
    print("\n‚úÖ No missing values found!")
    print("\nAll columns are complete:")
    print(missing_df.to_string(index=False))
else:
    print("\n‚ö†Ô∏è Columns with missing values:")
    safe_display(missing_filtered)  # Use safe_display instead of display

    # Bar chart of the percentage missing in each affected column.
    plt.figure(figsize=(10, 6))
    plt.bar(
        missing_filtered['Column'],
        missing_filtered['Missing_Percent'],
        color='coral',
        edgecolor='black',
        alpha=0.7,
    )
    plt.xlabel('Column', fontsize=12)
    plt.ylabel('Missing %', fontsize=12)
    plt.title('Missing Values by Column', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

# Log to MLflow: the percentage is relative to all cells (df.size), not rows.
total_missing = missing_values.sum()
mlflow.log_metric("missing_values_total", int(total_missing))
mlflow.log_metric("missing_values_pct", float((total_missing / df.size) * 100))

In [0]:

# Count fully duplicated rows and, when present, show every member of each duplicate group.
banner = "=" * 60
print(banner)
print("DUPLICATE CHECK")
print(banner)

# duplicated() marks the 2nd and later occurrences, so this is the number of surplus rows.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates == 0:
    print("‚úÖ No duplicates found!")
else:
    print(f"‚ö†Ô∏è Found {duplicates} duplicate rows")
    # keep=False also flags the first occurrence, so whole groups are displayed.
    every_copy = df.duplicated(keep=False)
    display(df[every_copy])

In [0]:
# Data types validation: compare the feature lists declared in the config
# against the columns and dtypes actually present in the loaded frame.
print("="*60)
print("DATA TYPE VALIDATION")
print("="*60)

numerical_features = config['preprocessing']['numerical_features']
categorical_features = config['preprocessing']['categorical_features']
target = config['preprocessing']['target']

print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")
print(f"Target Variable: {target}")

# Validate numerical features
for col in numerical_features:
    if col in df.columns:
        is_numeric = pd.api.types.is_numeric_dtype(df[col])
        print(f"  {col}: {'‚úÖ Numeric' if is_numeric else '‚ùå Not Numeric'}")
    else:
        # Previously skipped silently, which hid config/data mismatches.
        print(f"  {col}: MISSING from dataset")

# Validate categorical features
for col in categorical_features:
    if col in df.columns:
        unique_values = df[col].nunique()
        print(f"  {col}: {unique_values} unique values - {df[col].unique()}")
    else:
        print(f"  {col}: MISSING from dataset")

# The target is required by every later cell, so flag its absence here too.
if target not in df.columns:
    print(f"  {target}: MISSING from dataset (target variable)")


### 5. Data Overview


In [0]:

# Itemised dataset overview: numbered column list, dtypes and per-column memory.
banner = "=" * 60
print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"\nüìä Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")

print(f"\nüìã Column Names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position}. {column_name}")

print(f"\nüî¢ Data Types:")
print(df.dtypes.to_string())

print(f"\nüíæ Memory Usage:")
# deep=True includes the contents of object columns; the result also carries an entry for the index.
memory_usage = df.memory_usage(deep=True)
for label, n_bytes in memory_usage.items():
    print(f"  {label}: {n_bytes / 1024:.2f} KB")
print(f"  TOTAL: {memory_usage.sum() / 1024:.2f} KB")


In [0]:
# Plain-text statistical summary: describe() for numeric columns and value
# counts for every object-dtype column.
banner = "=" * 60
print(banner)
print("STATISTICAL SUMMARY")
print(banner)

print("\nüìä Numerical Features:")
print(df.describe().to_string())

print("\nüìä Categorical Features:")
# Only object-dtype columns are treated as categorical here.
categorical_cols = df.select_dtypes(include=['object']).columns
for column_name in categorical_cols:
    frequency_table = df[column_name].value_counts()
    print(f"\n{column_name}:")
    print(frequency_table.to_string())


 ## 6. Data Quality Check


In [0]:

# MAGIC %md
# MAGIC ### 6.1 Missing Values


In [0]:

# Missing-value analysis (text output): list the affected columns, plot their
# missing percentage, and log the totals to the active MLflow run.
banner = "=" * 60
print(banner)
print("MISSING VALUES ANALYSIS")
print(banner)

validator = DataValidator()
# Consumed below via .index / .values / .sum(), i.e. one count per column.
missing_values = validator.check_missing_values(df)

row_total = len(df)
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percent': (missing_values.values / row_total) * 100
})

# Restrict the report to columns that have at least one gap.
missing_filtered = missing_df.loc[missing_df['Missing_Count'] > 0]

if missing_filtered.empty:
    print("\n‚úÖ No missing values found!")
    print("\nAll columns are complete:")
    print(missing_df.to_string(index=False))
else:
    print("\n‚ö†Ô∏è Columns with missing values:")
    print(missing_filtered.to_string(index=False))

    # Bar chart of the percentage missing in each affected column.
    plt.figure(figsize=(10, 6))
    plt.bar(
        missing_filtered['Column'],
        missing_filtered['Missing_Percent'],
        color='coral',
        edgecolor='black',
        alpha=0.7,
    )
    plt.xlabel('Column', fontsize=12)
    plt.ylabel('Missing %', fontsize=12)
    plt.title('Missing Values by Column', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

# Log to MLflow: the percentage is measured against every cell in the frame (df.size).
total_missing = missing_values.sum()
mlflow.log_metric("missing_values_total", int(total_missing))
mlflow.log_metric("missing_values_pct", float((total_missing / df.size) * 100))


In [0]:

# MAGIC %md
# MAGIC ## 7. Exploratory Data Analysis (EDA)



In [0]:
# MAGIC %md
# MAGIC ### 7.1 Target Variable Distribution


In [0]:

# Home Price distribution: histogram, box plot and normal Q-Q plot of the
# target, followed by summary statistics that are also logged to MLflow.
from scipy import stats

target_col = config['preprocessing']['target']

# Drop missing prices once, up front. matplotlib's boxplot and scipy's
# probplot do not skip NaN (they yield an empty box / a NaN fit), whereas the
# pandas statistics below skip NaN by default - so their values are unchanged.
target_values = df[target_col].dropna()

# Compute each statistic once and reuse it for the plots, the printout and MLflow.
target_mean = target_values.mean()
target_median = target_values.median()
target_std = target_values.std()
target_min = target_values.min()
target_max = target_values.max()
target_skew = target_values.skew()
target_kurtosis = target_values.kurtosis()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Histogram with mean / median markers
axes[0].hist(target_values, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
axes[0].axvline(target_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: ${target_mean:,.0f}')
axes[0].axvline(target_median, color='green', linestyle='--', linewidth=2, label=f'Median: ${target_median:,.0f}')
axes[0].set_xlabel('Home Price', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Distribution of Home Prices', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Box plot
axes[1].boxplot(target_values, vert=True)
axes[1].set_ylabel('Home Price', fontsize=12)
axes[1].set_title('Home Price Box Plot', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='y')

# Q-Q plot against a normal distribution
stats.probplot(target_values, dist="norm", plot=axes[2])
axes[2].set_title('Q-Q Plot (Normality Check)', fontsize=14, fontweight='bold')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistics
print("="*60)
print("TARGET VARIABLE STATISTICS")
print("="*60)
print(f"Mean:     ${target_mean:,.2f}")
print(f"Median:   ${target_median:,.2f}")
print(f"Std Dev:  ${target_std:,.2f}")
print(f"Min:      ${target_min:,.2f}")
print(f"Max:      ${target_max:,.2f}")
print(f"Range:    ${target_max - target_min:,.2f}")
print(f"Skewness: {target_skew:.4f}")
print(f"Kurtosis: {target_kurtosis:.4f}")

# Log to MLflow
mlflow.log_metrics({
    "target_mean": float(target_mean),
    "target_median": float(target_median),
    "target_std": float(target_std),
    "target_skewness": float(target_skew),
    "target_kurtosis": float(target_kurtosis)
})


In [0]:
# MAGIC %md
# MAGIC ### 7.2 Numerical Features Analysis


In [0]:

# Distribution of numerical features: one histogram per configured numerical
# feature that exists in the frame, followed by a text summary.
numerical_features = config['preprocessing']['numerical_features']

# Plot only the features that are really present, so no panel is left blank.
present_numerical = [col for col in numerical_features if col in df.columns]

if not present_numerical:
    # plt.subplots(0, 2) would raise; report instead of crashing.
    print("No configured numerical features are present in the dataset - nothing to plot.")
else:
    n_features = len(present_numerical)
    n_cols = 2
    n_rows = (n_features + 1) // 2

    # squeeze=False always returns a 2-D array of Axes. The previous
    # `[axes]` fallback wrapped the (2,) array returned for a single feature,
    # so axes[0].hist raised AttributeError.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, present_numerical):
        col_mean = df[col].mean()
        ax.hist(df[col], bins=20, edgecolor='black', alpha=0.7, color='coral')
        ax.axvline(col_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {col_mean:.1f}')
        ax.set_xlabel(col, fontsize=11)
        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide unused subplots (the trailing panel when the feature count is odd)
    for ax in axes[n_features:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Summary statistics
print("="*60)
print("NUMERICAL FEATURES SUMMARY")
print("="*60)
for col in present_numerical:
    print(f"\n{col}:")
    print(f"  Mean:   {df[col].mean():.2f}")
    print(f"  Median: {df[col].median():.2f}")
    print(f"  Std:    {df[col].std():.2f}")
    print(f"  Min:    {df[col].min():.2f}")
    print(f"  Max:    {df[col].max():.2f}")


In [0]:

# MAGIC %md
# MAGIC ### 7.3 Categorical Features Analysis


In [0]:

# Categorical features distribution: one bar chart of category counts per
# configured categorical feature that exists in the frame, plus a text summary.
categorical_features = config['preprocessing']['categorical_features']

# Plot only the features that are really present, so no panel is left blank.
present_categorical = [col for col in categorical_features if col in df.columns]

if not present_categorical:
    # plt.subplots(1, 0) would raise; report instead of crashing.
    print("No configured categorical features are present in the dataset - nothing to plot.")
else:
    n_cat_features = len(present_categorical)
    # squeeze=False always returns a 2-D array, so no single-feature special case is needed.
    fig, axes = plt.subplots(1, n_cat_features, figsize=(7 * n_cat_features, 5), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, present_categorical):
        value_counts = df[col].value_counts()
        # Use string labels so every bar sits at position 0..k-1. With numeric
        # categories matplotlib would place bars at their numeric value while
        # the count labels below are placed by position, so they would not line up.
        category_labels = value_counts.index.astype(str)
        ax.bar(category_labels, value_counts.values,
               edgecolor='black', alpha=0.7, color='lightgreen')
        ax.set_xlabel(col, fontsize=11)
        ax.set_ylabel('Count', fontsize=11)
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        for position, count in enumerate(value_counts.values):
            ax.text(position, count + 0.5, str(count), ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

# Summary
print("="*60)
print("CATEGORICAL FEATURES SUMMARY")
print("="*60)
for col in present_categorical:
    print(f"\n{col}:")
    print(df[col].value_counts().to_string())
    print(f"  Unique values: {df[col].nunique()}")



In [0]:

# MAGIC %md
# MAGIC ### 7.4 Correlation Analysis


In [0]:

# Correlation analysis: heatmap of all numeric columns, correlation of every
# numeric feature with the target, and the strongest three logged to MLflow.
import re  # local import: only needed to sanitise MLflow metric names below

# Correlation matrix (numeric columns only)
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, center=0,
            vmin=-1, vmax=1)
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Correlation with target (signed, descending) - relies on target_col being numeric.
print("="*60)
print(f"CORRELATION WITH TARGET ({target_col})")
print("="*60)
target_corr = correlation_matrix[target_col].sort_values(ascending=False)
print(target_corr.to_string())

# Visualize correlation with target
plt.figure(figsize=(10, 6))
target_corr_filtered = target_corr[target_corr.index != target_col]  # Exclude target itself
colors = ['green' if x > 0 else 'red' for x in target_corr_filtered.values]
plt.barh(target_corr_filtered.index, target_corr_filtered.values, color=colors, alpha=0.7, edgecolor='black')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title(f'Feature Correlation with {target_col}', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# Log top correlations to MLflow. "Top" means strongest by magnitude: taking
# the head of the signed ordering would never report a strong negative
# correlation. Undefined (NaN) correlations are excluded.
ranked_corr = target_corr_filtered.dropna()
ranked_corr = ranked_corr.iloc[np.argsort(-ranked_corr.abs().to_numpy(), kind="stable")]
top_corr = ranked_corr.head(3)
for idx, (feature, corr_value) in enumerate(top_corr.items(), 1):
    # MLflow restricts the characters allowed in metric names; replace anything
    # outside alphanumerics, '_', '-', '.', ' ' and '/' so logging cannot fail
    # on an unusual column name.
    safe_feature = re.sub(r"[^0-9A-Za-z_\-. /]", "_", str(feature))
    mlflow.log_metric(f"top_corr_{idx}_{safe_feature}", float(corr_value))


In [0]:

# MAGIC %md
# MAGIC ### 7.5 Relationship with Target Variable


In [0]:

# Scatter plots: each configured numerical feature against the target, with a
# least-squares trend line and the Pearson correlation annotated on the panel.
numerical_features = config['preprocessing']['numerical_features']

# Plot only the features that are really present, so no panel is left blank.
present_numerical = [col for col in numerical_features if col in df.columns]

if not present_numerical:
    # plt.subplots(0, 2) would raise; report instead of crashing.
    print("No configured numerical features are present in the dataset - nothing to plot.")
else:
    n_features = len(present_numerical)
    n_cols = 2
    n_rows = (n_features + 1) // 2

    # squeeze=False always returns a 2-D array of Axes. The previous
    # `[axes]` fallback wrapped the (2,) array returned for a single feature,
    # so axes[0].scatter raised AttributeError.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, present_numerical):
        ax.scatter(df[col], df[target_col], alpha=0.6, color='purple', edgecolors='black', linewidth=0.5)
        ax.set_xlabel(col, fontsize=11)
        ax.set_ylabel(target_col, fontsize=11)
        ax.set_title(f'{col} vs {target_col}', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)

        # Add trend line. np.polyfit cannot handle NaN and is ill-conditioned
        # for a constant x, so fit only on complete pairs with at least two
        # distinct x values.
        complete = df[col].notna() & df[target_col].notna()
        x_vals = df.loc[complete, col]
        y_vals = df.loc[complete, target_col]
        if len(x_vals) >= 2 and x_vals.nunique() > 1:
            slope, intercept = np.polyfit(x_vals, y_vals, 1)
            # Two end points are enough for a straight line and avoid
            # retracing the dashed line through unsorted x values.
            x_line = np.array([x_vals.min(), x_vals.max()])
            ax.plot(x_line, slope * x_line + intercept, "r--", alpha=0.8, linewidth=2, label='Trend')
            ax.legend()

        # Add correlation coefficient (pandas ignores incomplete pairs)
        corr = df[col].corr(df[target_col])
        ax.text(0.05, 0.95, f'r = {corr:.3f}', 
                transform=ax.transAxes, 
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Hide unused subplots (the trailing panel when the feature count is odd)
    for ax in axes[n_features:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


In [0]:

# MAGIC %md
# MAGIC ### 7.6 Categorical Features vs Target

In [0]:

# Box plots: target distribution within each category of every configured
# categorical feature, followed by per-category summary statistics.
categorical_features = config['preprocessing']['categorical_features']

# Plot only the features that are really present, so no panel is left blank.
present_categorical = [col for col in categorical_features if col in df.columns]

if not present_categorical:
    # plt.subplots(1, 0) would raise; report instead of crashing.
    print("No configured categorical features are present in the dataset - nothing to plot.")
else:
    n_cat_features = len(present_categorical)
    # squeeze=False always returns a 2-D array, so no single-feature special case is needed.
    fig, axes = plt.subplots(1, n_cat_features, figsize=(7 * n_cat_features, 6), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, present_categorical):
        df.boxplot(column=target_col, by=col, ax=ax)
        ax.set_xlabel(col, fontsize=11)
        ax.set_ylabel(target_col, fontsize=11)
        ax.set_title(f'{target_col} by {col}', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)
        ax.get_figure().suptitle('')  # Remove default title

    plt.tight_layout()
    plt.show()

# Statistical comparison
print("="*60)
print("CATEGORICAL FEATURES VS TARGET")
print("="*60)
for col in present_categorical:
    print(f"\n{col}:")
    grouped = df.groupby(col)[target_col].agg(['mean', 'median', 'std', 'count'])
    print(grouped.to_string())

In [0]:

# MAGIC %md
# MAGIC ## 8. Outlier Detection


In [0]:

# Detect outliers using IQR method: a value is flagged when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
print("="*60)
print("OUTLIER DETECTION (IQR Method)")
print("="*60)

# Read the column lists from the config here instead of relying on the
# `target` variable created by the distant data-type-validation cell.
outlier_candidates = list(config['preprocessing']['numerical_features']) + [config['preprocessing']['target']]

# Fixed column order; also guarantees the columns exist when nothing was analysed.
summary_columns = ['Feature', 'Lower_Bound', 'Upper_Bound', 'Outlier_Count', 'Outlier_Percent']
outlier_summary = []

for col in outlier_candidates:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Count flagged rows straight from the boolean mask; NaN compares
        # False on both sides, so missing values are never counted.
        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
        outlier_count = int(is_outlier.sum())
        outlier_pct = (outlier_count / len(df)) * 100

        outlier_summary.append({
            'Feature': col,
            'Lower_Bound': lower_bound,
            'Upper_Bound': upper_bound,
            'Outlier_Count': outlier_count,
            'Outlier_Percent': outlier_pct
        })

        print(f"\n{col}:")
        print(f"  Q1: {Q1:.2f}")
        print(f"  Q3: {Q3:.2f}")
        print(f"  IQR: {IQR:.2f}")
        print(f"  Lower Bound: {lower_bound:.2f}")
        print(f"  Upper Bound: {upper_bound:.2f}")
        print(f"  Outliers: {outlier_count} ({outlier_pct:.2f}%)")

# Summary DataFrame. Passing the columns explicitly keeps
# outlier_df['Outlier_Count'] valid when no configured column was found
# (an empty record list would otherwise give a frame with no columns).
outlier_df = pd.DataFrame(outlier_summary, columns=summary_columns)
print("\n" + "="*60)
print("OUTLIER SUMMARY")
print("="*60)
print(outlier_df.to_string(index=False))

# Log total outliers to MLflow (sum over features; a row that is an outlier in
# several features is counted once per feature).
total_outliers = outlier_df['Outlier_Count'].sum()
mlflow.log_metric("total_outliers", int(total_outliers))


In [0]:

# MAGIC %md
# MAGIC ## 9. Data Summary Report


In [0]:

# Generate comprehensive data summary: print the dictionary returned by
# DataProcessor.get_data_summary section by section.
from src.data_processing import DataProcessor

processor = DataProcessor(config)
# The keys read below are 'shape', 'columns', 'dtypes', 'missing_values',
# 'numeric_summary' and 'categorical_summary'.
data_summary = processor.get_data_summary(df)

print("="*60)
print("COMPREHENSIVE DATA SUMMARY REPORT")
print("="*60)

print(f"\nüìä DATASET DIMENSIONS")
print(f"  Rows: {data_summary['shape'][0]}")
print(f"  Columns: {data_summary['shape'][1]}")

print(f"\nüìã COLUMNS")
for i, col in enumerate(data_summary['columns'], 1):
    print(f"  {i}. {col}")

print(f"\nüî¢ DATA TYPES")
for col, dtype in data_summary['dtypes'].items():
    print(f"  {col}: {dtype}")

print(f"\n‚ùì MISSING VALUES")
has_missing = False
for col, missing in data_summary['missing_values'].items():
    if missing > 0:
        has_missing = True
        print(f"  {col}: {missing}")
if not has_missing:
    print("  None ‚úÖ")

print(f"\nüìä NUMERICAL SUMMARY")
# The loop variable is named col_stats, not `stats`: the target-distribution
# cell imports scipy's `stats` module under that name and this loop used to
# overwrite it with a dict.
# NOTE(review): the `col != 'count'` filter skips an outer key named 'count';
# confirm the orientation of numeric_summary against DataProcessor.
for col, col_stats in data_summary['numeric_summary'].items():
    if col != 'count':
        print(f"\n  {col}:")
        for stat, value in col_stats.items():
            print(f"    {stat}: {value:.2f}")

print(f"\nüìù CATEGORICAL SUMMARY")
for col, values in data_summary['categorical_summary'].items():
    print(f"\n  {col}:")
    for value, count in values.items():
        print(f"    {value}: {count}")


In [0]:

# MAGIC %md
# MAGIC ## 10. Save Exploration Results


In [0]:

# Save summary statistics, the correlation matrix and the outlier /
# missing-value summaries as CSV files under the configured output location.
import os  # local import: used only for path handling in this cell

processed_path = config['data']['processed_path']

# Create the output directory when it is a plain filesystem path. URL-style
# targets (anything containing '://') are left to pandas' own I/O layer.
if processed_path and '://' not in processed_path:
    os.makedirs(processed_path, exist_ok=True)


def _output_file(file_name):
    """Return file_name inside processed_path, inserting a separator if needed.

    Plain string concatenation produced paths such as
    'data/processedsummary_statistics.csv' whenever the configured path had no
    trailing slash; os.path.join gives the same result as before when the
    slash is present.
    """
    return os.path.join(processed_path, file_name)


# Summary statistics
summary_stats = df.describe()
summary_stats.to_csv(_output_file('summary_statistics.csv'))
print("‚úÖ Summary statistics saved!")

# Correlation matrix
correlation_matrix.to_csv(_output_file('correlation_matrix.csv'))
print("‚úÖ Correlation matrix saved!")

# Outlier summary
outlier_df.to_csv(_output_file('outlier_summary.csv'), index=False)
print("‚úÖ Outlier summary saved!")

# Missing values summary
missing_df.to_csv(_output_file('missing_values_summary.csv'), index=False)
print("‚úÖ Missing values summary saved!")

print(f"\nüìÅ All files saved to: {processed_path}")


In [0]:

# MAGIC %md
# MAGIC ## 11. Log to MLflow


In [0]:

# Record the dataset summary on the MLflow run, then attach the exploration
# tables as CSV artifacts.
log_dataset_summary(df, config)

# Log artifacts
from src.utils import MLflowLogger

# (frame, artifact file name) pairs, logged in this order.
artifact_frames = (
    (summary_stats, "summary_statistics.csv"),
    (correlation_matrix, "correlation_matrix.csv"),
    (outlier_df, "outlier_summary.csv"),
)
for frame, artifact_name in artifact_frames:
    MLflowLogger.log_dataframe_as_artifact(frame, artifact_name)

print("‚úÖ All artifacts logged to MLflow!")


In [0]:

# MAGIC %md
# MAGIC ## 12. Key Insights & Recommendations


In [0]:

# Generate insights: condensed recap of the exploration results plus static
# recommendations for the next notebook.

# Rank features by the *magnitude* of their correlation with the target: the
# head of the signed ordering would never surface a strong negative
# correlation as "strongest". Undefined (NaN) correlations are excluded.
corr_without_target = target_corr[target_corr.index != target_col].dropna()
corr_by_strength = corr_without_target.iloc[
    np.argsort(-corr_without_target.abs().to_numpy(), kind="stable")
]
target_corr_top = corr_by_strength.head(3)

if len(target_corr_top) > 0:
    strongest_feature = target_corr_top.index[0]
    strongest_corr = target_corr_top.values[0]
else:
    # No other numeric feature: avoid the IndexError from index[0].
    strongest_feature = "n/a (no other numeric features)"
    strongest_corr = float("nan")

print("="*60)
print("KEY INSIGHTS")
print("="*60)

print(f"""
üìä DATASET OVERVIEW
  ‚Ä¢ Total Records: {df.shape[0]:,}
  ‚Ä¢ Total Features: {df.shape[1]}
  ‚Ä¢ Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB

‚úÖ DATA QUALITY
  ‚Ä¢ Missing Values: {missing_values.sum()} ({(missing_values.sum() / df.size) * 100:.2f}%)
  ‚Ä¢ Duplicate Rows: {duplicates} ({duplicates/len(df)*100:.2f}%)
  ‚Ä¢ Total Outliers: {total_outliers}

üéØ TARGET VARIABLE ({target_col})
  ‚Ä¢ Mean: ${df[target_col].mean():,.2f}
  ‚Ä¢ Median: ${df[target_col].median():,.2f}
  ‚Ä¢ Std Dev: ${df[target_col].std():,.2f}
  ‚Ä¢ Skewness: {df[target_col].skew():.4f}
  ‚Ä¢ Distribution: {'Normal' if abs(df[target_col].skew()) < 0.5 else 'Skewed'}

üîó STRONGEST CORRELATIONS WITH TARGET
""")

for i, (feature, corr_value) in enumerate(target_corr_top.items(), 1):
    print(f"  {i}. {feature}: {corr_value:.4f}")

print(f"""
üìù CATEGORICAL FEATURES
""")
for col in categorical_features:
    if col in df.columns:
        print(f"  ‚Ä¢ {col}: {df[col].nunique()} unique values")

print("\n" + "="*60)
print("RECOMMENDATIONS")
print("="*60)
# NOTE(review): the recommendations are static text; "Data quality is good"
# is printed regardless of the missing-value / duplicate / outlier counts above.
print("""
‚úì Data quality is good - ready for feature engineering
‚úì Consider feature scaling for numerical features
‚úì Encode categorical variables before modeling
‚úì Monitor outliers during model training
‚úì Strong correlation found with: {}
‚úì Consider creating interaction features

NEXT STEPS:
‚Üí Feature Engineering (Notebook 02)
  ‚Ä¢ Create derived features
  ‚Ä¢ Encode categorical variables
  ‚Ä¢ Scale numerical features
  ‚Ä¢ Handle outliers if needed
  ‚Ä¢ Prepare train/test split
""".format(strongest_feature))

print("="*60)


In [0]:

# MAGIC %md
# MAGIC ## 13. End MLflow Run


In [0]:

# Close the active MLflow run and print the completion banner.
mlflow.end_run()

banner = "=" * 60
print(banner)
print("‚úÖ DATA EXPLORATION COMPLETE!")
print(banner)
print("\nüìä MLflow run ended successfully")
print("üéØ Ready for Feature Engineering (Notebook 02)")
print("\n" + banner)
print("\n" + "="*60)