# 📊 تحلیل اکتشافی داده‌های خانه‌های بوستون

این نوت‌بوک شامل تحلیل جامع و اکتشافی مجموعه داده Boston Housing است.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys

# Register the `src` directory that sits next to the current working
# directory's parent on the import path.
src_dir = Path.cwd().parent / 'src'
sys.path.append(str(src_dir))

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

## 📥 بارگذاری داده‌ها

In [None]:
# Fetch the dataset through the project's own loader class.
from data_loader import BostonHousingDataLoader

loader = BostonHousingDataLoader()
features, target, feature_names = loader.load_data()

# Report the headline facts about what the loader returned.
load_report = (
    "📊 Dataset loaded successfully!",
    f"Shape: {features.shape}",
    f"Features: {len(feature_names)}",
    f"Target: {target.name}",
)
print("\n".join(load_report))

## 🔍 بررسی اطلاعات پایه داده‌ها

In [None]:
# Ask the loader for its dataset summary and list each entry on its own line.
info = loader.get_data_info()
print("📋 Dataset Information:")
for field, detail in info.items():
    print(f"  {field}: {detail}")

In [None]:
# Preview the first rows of the feature table and of the target.
preview_size = 5  # same as the default used by `head()`

print("📋 First 5 rows of features:")
display(features.head(preview_size))

print("\n📋 First 5 rows of target:")
display(target.head(preview_size))

## 📊 آمار توصیفی

In [None]:
# Descriptive statistics: the feature table first, then the target.
print("📊 Statistical Summary of Features:")
feature_summary = features.describe()
display(feature_summary)

print("\n📊 Statistical Summary of Target:")
target_summary = target.describe()
display(target_summary)

## 🔍 بررسی انواع داده‌ها

In [None]:
# Report the dtype of every feature column, followed by the target's dtype.
print("🔍 Data Types:")
feature_dtypes = features.dtypes
print(feature_dtypes)

print("\n🔍 Target Data Type:")
target_dtype = target.dtype
print(target_dtype)

## 🚨 بررسی مقادیر گم‌شده

In [None]:
# Per-column count of null entries and the share of rows they represent.
missing_values = features.isnull().sum()
missing_percentage = (missing_values / len(features)) * 100

# Put both measures side by side, one row per feature column.
missing_df = pd.DataFrame(
    dict(
        Missing_Count=missing_values,
        Missing_Percentage=missing_percentage,
    )
)

print("🚨 Missing Values Analysis:")
has_missing = missing_df['Missing_Count'] > 0
display(missing_df.loc[has_missing])

# Confirm explicitly when the whole table is free of nulls.
total_missing = missing_df['Missing_Count'].sum()
if total_missing == 0:
    print("✅ No missing values found in the dataset!")

## 📈 توزیع متغیر هدف

In [None]:
# Two side-by-side views of the target: histogram (left) and box plot (right).
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: 30-bin frequency histogram.
hist_ax.hist(target, bins=30, alpha=0.7, edgecolor='black', color='skyblue')
hist_ax.set(
    title='Distribution of House Prices (MEDV)',
    xlabel='Price (in $1000s)',
    ylabel='Frequency',
)
hist_ax.grid(True, alpha=0.3)

# Right panel: filled box plot of the same values.
box_style = dict(facecolor='lightgreen')
box_ax.boxplot(target, patch_artist=True, boxprops=box_style)
box_ax.set(
    title='Box Plot of House Prices',
    ylabel='Price (in $1000s)',
)
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric companions to the plots, each rounded to two decimals.
print("📊 Target Variable Statistics:")
target_stats = (
    ("Mean", target.mean()),
    ("Median", target.median()),
    ("Std", target.std()),
    ("Min", target.min()),
    ("Max", target.max()),
)
for label, stat in target_stats:
    print(f"  {label}: {stat:.2f}")

## 📊 توزیع ویژگی‌ها

In [None]:
# Histogram grid: one panel per feature, three panels per row.
n_features = len(feature_names)
n_cols = 3
n_rows, leftover = divmod(n_features, n_cols)
if leftover:
    n_rows += 1  # a partially filled last row still needs its own grid row

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
axes = axes.flatten()

# zip stops at the shorter sequence, so only panels with a feature are drawn.
for panel, feature in zip(axes, feature_names):
    panel.hist(features[feature], bins=20, alpha=0.7, edgecolor='black')
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

# Hide the unused panels at the end of the grid.
for spare_panel in axes[n_features:]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.show()

## 🔗 ماتریس همبستگی

In [None]:
# Correlate every pair of columns, with the target appended as the last column.
data_with_target = pd.concat([features, target], axis=1)
correlation_matrix = data_with_target.corr()

# Heatmap of the matrix. The mask is True on the diagonal and above it, so
# only the lower triangle is drawn.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.show()

# Take the target's column of the matrix and order it from highest to lowest.
target_column = correlation_matrix[target.name]
target_correlations = target_column.sort_values(ascending=False)
print("🔗 Top correlations with target variable:")
display(target_correlations)

## 📊 تحلیل ویژگی‌های مهم

In [None]:
# Look up the loader's human-readable description for each feature.
descriptions = loader.get_feature_descriptions()

# Pick the five features most strongly related to the target.
#
# `target_correlations` is ordered by *signed* value, so slicing it positionally
# would keep only the most positive correlations and skip strong negative ones,
# and would rely on the target's own entry being first. Instead:
#   1. drop the target's self-correlation by label,
#   2. rank the remaining features by absolute correlation,
#   3. keep the signed values so the direction of each relationship is visible.
feature_correlations = target_correlations.drop(labels=target.name)
strongest_first = feature_correlations.abs().sort_values(ascending=False).index
top_features = feature_correlations.loc[strongest_first].head(5)

print("🏆 Top 5 Most Important Features:")
for i, (feature, corr) in enumerate(top_features.items(), 1):
    print(f"{i}. {feature} (Correlation: {corr:.3f})")
    # Fall back to a placeholder when the loader has no text for this feature.
    print(f"   Description: {descriptions.get(feature, 'No description available')}")
    print()

## 📈 نمودارهای پراکندگی

In [None]:
# Scatter grid: each selected feature against the target, two panels per row.
top_5_features = top_features.index[:5]
n_cols = 2
n_rows, leftover = divmod(len(top_5_features), n_cols)
if leftover:
    n_rows += 1  # an odd number of features leaves a half-filled last row

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
axes = axes.flatten()

# zip stops at the shorter sequence, so only panels with a feature are drawn.
for panel, feature in zip(axes, top_5_features):
    panel.scatter(features[feature], target, alpha=0.6, color='blue')
    panel.set_xlabel(feature)
    panel.set_ylabel('House Price (MEDV)')
    panel.set_title(f'{feature} vs House Price')
    panel.grid(True, alpha=0.3)

# Hide the unused panels at the end of the grid.
for spare_panel in axes[len(top_5_features):]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.show()

## 🚨 شناسایی Outliers

In [None]:
# One box plot per feature on a shared axis, for a visual check of outliers.
plt.figure(figsize=(15, 8))
features.boxplot(figsize=(15, 8))
plt.title('Box Plots of All Features (Outlier Detection)')
plt.xticks(rotation=45)
plt.ylabel('Feature Values')
plt.tight_layout()
plt.show()

# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
outlier_counts = {}
for feature in feature_names:
    column = features[feature]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    lower_bound = q1 - fence
    upper_bound = q3 + fence
    is_outlier = (column < lower_bound) | (column > upper_bound)
    # Cast to a plain int so the stored count has the same type len() would give.
    outlier_counts[feature] = int(is_outlier.sum())

print("🚨 Outlier Counts (IQR method):")
n_samples = len(features)
ranked_counts = sorted(outlier_counts.items(), key=lambda pair: pair[1], reverse=True)
for feature, count in ranked_counts:
    if count <= 0:
        continue  # features without outliers are not listed
    percentage = (count / n_samples) * 100
    print(f"  {feature}: {count} outliers ({percentage:.1f}%)")

## 📋 خلاصه تحلیل

In [None]:
# Recap of the exploration: dataset facts, strongest correlations, target spread.
summary_header = [
    "📋 Data Exploration Summary:",
    "=" * 50,
    f"Dataset Shape: {features.shape}",
    f"Number of Features: {len(feature_names)}",
    f"Number of Samples: {len(features)}",
    f"Target Variable: {target.name}",
    f"Missing Values: {features.isnull().sum().sum()}",
    f"Data Types: {features.dtypes.unique()}",
    "\nTop 3 Features by Correlation:",
]
print("\n".join(summary_header))

# First three entries of the previously selected top features.
for rank, (feature, corr) in enumerate(top_features.head(3).items(), start=1):
    print(f"  {rank}. {feature}: {corr:.3f}")

print("\nTarget Statistics:")
target_spread = target.max() - target.min()
print(f"  Mean: {target.mean():.2f}")
print(f"  Std: {target.std():.2f}")
print(f"  Range: {target_spread:.2f}")

## 💾 ذخیره نتایج

In [None]:
# Persist the key findings of this exploration as a JSON file.
import json
from pathlib import Path

# Everything placed in this dict must be JSON-serialisable, so aggregate values
# are converted to plain Python numbers where they may be NumPy scalars.
exploration_results = {
    'dataset_shape': features.shape,
    'feature_names': list(feature_names),
    'target_name': target.name,
    # Cast each count explicitly: NumPy integers are rejected by json.dump.
    'missing_values': {
        column: int(count) for column, count in features.isnull().sum().items()
    },
    'target_correlations': target_correlations.to_dict(),
    'outlier_counts': outlier_counts,
    'target_statistics': {
        'mean': float(target.mean()),
        'std': float(target.std()),
        'min': float(target.min()),
        'max': float(target.max()),
        'median': float(target.median())
    }
}

# Create the output directory first; opening the file for writing would
# otherwise raise FileNotFoundError when the directory does not exist yet.
results_path = Path('../results/exploration_results.json')
results_path.parent.mkdir(parents=True, exist_ok=True)

with results_path.open('w', encoding='utf-8') as f:
    json.dump(exploration_results, f, indent=2)

print("💾 Exploration results saved to 'results/exploration_results.json'")