# 🔧 پیش‌پردازش داده‌های خانه‌های بوستون

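این نوت‌بوک مراحل پیش‌پردازش داده‌ها را به ترتیب پوشش می‌دهد: بررسی مقادیر گمشده، شناسایی و مدیریت داده‌های پرت (outliers)، تقسیم داده‌ها به مجموعه‌های آموزش و آزمون، مقیاس‌بندی ویژگی‌ها، ساخت ویژگی‌های چندجمله‌ای و ذخیره‌ی داده‌های پیش‌پردازش‌شده.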

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys

# Make the `src` folder that sits next to the current working directory importable
src_dir = Path.cwd().parent / 'src'
sys.path.append(str(src_dir))

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's seaborn-v0_8 style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

## 📥 بارگذاری داده‌ها

In [None]:
# Project-local helpers (presumably resolved through the `src` entry on sys.path)
from data_loader import BostonHousingDataLoader
from preprocessing import DataPreprocessor

# load_data() yields three values: the feature table, the target and the feature names
loader = BostonHousingDataLoader()
dataset = loader.load_data()
features, target, feature_names = dataset

print(f"📊 Dataset loaded successfully!")
print(f"Original shape: {features.shape}")

## 🔍 بررسی کیفیت داده‌ها

In [None]:
# Preprocessor instance reused by the following cells (standard scaler selected)
preprocessor = DataPreprocessor(scaler_type='standard')

# Collect the missing-value report for the feature table
missing_info = preprocessor.check_missing_values(features)
print("🚨 Missing Values Analysis:")
print(f"Total missing: {missing_info['total_missing']}")

if not missing_info['total_missing'] > 0:
    print("✅ No missing values found!")
else:
    print("\nMissing values per column:")
    per_column = missing_info['missing_per_column']
    for col, count in per_column.items():
        # Only columns that actually have gaps are listed
        if not count > 0:
            continue
        print(f"  {col}: {count} ({missing_info['missing_percentage'][col]:.2f}%)")

## 🚨 شناسایی و مدیریت Outliers

In [None]:
# Per-feature outlier report using method='iqr' with a 1.5 threshold
outlier_info = preprocessor.check_outliers(features, method='iqr', threshold=1.5)
print("🚨 Outlier Analysis (IQR method):")
print(f"Features with outliers: {sum(1 for v in outlier_info.values() if v['count'] > 0)}")

# (feature, count, percentage) for every feature that has at least one outlier,
# ordered from the most to the least affected
outlier_summary = [
    (name, report['count'], report['percentage'])
    for name, report in outlier_info.items()
    if report['count'] > 0
]
outlier_summary.sort(key=lambda row: row[1], reverse=True)

if not outlier_summary:
    print("✅ No outliers detected!")
else:
    print("\nTop features with outliers:")
    for feature, count, percentage in outlier_summary[:5]:
        print(f"  {feature}: {count} outliers ({percentage:.2f}%)")

In [None]:
# Box plots for the (up to) six features with the most outliers
if outlier_summary:
    top_outlier_features = [entry[0] for entry in outlier_summary[:6]]
    n_cols = 2
    n_rows = -(-len(top_outlier_features) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
    axes = axes.flatten()

    # The grid always holds at least as many panels as features, so zip covers them all
    for ax, feature in zip(axes, top_outlier_features):
        ax.boxplot(
            features[feature],
            patch_artist=True,
            boxprops=dict(facecolor='lightblue'),
        )
        ax.set_title(f'Outliers in {feature}')
        ax.set_ylabel(feature)
        ax.grid(True, alpha=0.3)

    # An odd number of features leaves one trailing panel unused; hide it
    for ax in axes[len(top_outlier_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 🧹 تمیز کردن داده‌ها

In [None]:
# Treat outliers with the preprocessor's 'clip' strategy
print("🧹 Handling outliers using clipping strategy...")
features_clean = preprocessor.handle_outliers(features, strategy='clip')

# Shapes before and after the treatment
print(f"\n📊 Data shape comparison:")
print(f"  Before cleaning: {features.shape}")
print(f"  After cleaning: {features_clean.shape}")

# Re-run the same IQR check on the cleaned table and compare total outlier counts
outlier_info_after = preprocessor.check_outliers(features_clean, method='iqr', threshold=1.5)
outliers_after = sum(report['count'] for report in outlier_info_after.values())
outliers_before = sum(report['count'] for report in outlier_info.values())
print(f"  Outliers before: {outliers_before}")
print(f"  Outliers after: {outliers_after}")

## 📊 تحلیل توزیع قبل و بعد از تمیز کردن

In [None]:
# Overlay the distribution of the (up to) four most outlier-heavy features
# before and after cleaning.
if outlier_summary:
    top_features = [feature for feature, _, _ in outlier_summary[:4]]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    for ax, feature in zip(axes, top_features):
        # Both histograms share one set of bin edges, computed over the combined
        # values, so bars at the same position cover the same value range and
        # their heights can be compared directly.
        bin_edges = np.histogram_bin_edges(
            np.concatenate([features[feature], features_clean[feature]]),
            bins=20,
        )
        # Before cleaning
        ax.hist(features[feature], bins=bin_edges, alpha=0.7,
                label='Before', color='red', edgecolor='black')
        # After cleaning
        ax.hist(features_clean[feature], bins=bin_edges, alpha=0.7,
                label='After', color='green', edgecolor='black')
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Fewer than four features with outliers leaves panels unused; hide them
    for ax in axes[len(top_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 📈 تقسیم داده‌ها

In [None]:
# Train/test split of the cleaned features and the target (split settings are the preprocessor's defaults)
print("📈 Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = preprocessor.split_data(features_clean, target)

# Report each subset's size as a share of the cleaned table
total_rows = len(features_clean)
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print(f"\n📊 Split results:")
print(f"  Training set: {n_train} samples ({n_train/total_rows*100:.1f}%)")
print(f"  Testing set: {n_test} samples ({n_test/total_rows*100:.1f}%)")
print(f"  Features: {X_train.shape[1]}")

## 🔄 مقیاس‌بندی ویژگی‌ها

In [None]:
# Scale both subsets with the notebook's preprocessor (created with scaler_type='standard')
print("🔄 Scaling features using StandardScaler...")
scaled_pair = preprocessor.scale_features(X_train, X_test)
X_train_scaled, X_test_scaled = scaled_pair

# Shapes of the scaled outputs
print(f"✅ Scaling completed!")
print(f"  Training set shape: {X_train_scaled.shape}")
print(f"  Testing set shape: {X_test_scaled.shape}")

In [None]:
# Overlay the distribution of a few training features before and after scaling
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Select a few features for visualization
sample_features = feature_names[:4]

# Column positions are taken from X_train itself, because the scaled matrix was
# produced from that frame; this stays correct even if feature_names is ordered
# differently. Built once instead of on every iteration.
column_order = list(X_train.columns)

# np.asarray makes positional slicing work whether scale_features returned an
# ndarray or a DataFrame.
scaled_values = np.asarray(X_train_scaled)

for ax, feature in zip(axes, sample_features):
    feature_idx = column_order.index(feature)
    # Before scaling
    ax.hist(X_train[feature], bins=20, alpha=0.7,
            label='Before Scaling', color='blue', edgecolor='black')
    # After scaling
    ax.hist(scaled_values[:, feature_idx], bins=20, alpha=0.7,
            label='After Scaling', color='orange', edgecolor='black')
    ax.set_title(f'Scaling Effect on {feature}')
    ax.set_xlabel('Feature Value')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels left unused when fewer than four features are available
for ax in axes[len(sample_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 🔧 تست انواع مختلف Scaler

In [None]:
# Run each scaler type on the same split and record summary statistics
scaler_types = ['standard', 'robust', 'minmax']
scaled_data = {}

for scaler_type in scaler_types:
    print(f"\n🔧 Testing {scaler_type.title()} Scaler...")
    temp_preprocessor = DataPreprocessor(scaler_type=scaler_type)
    X_train_temp, X_test_temp = temp_preprocessor.scale_features(X_train, X_test)

    # Each statistic is taken along axis 0 of the scaled training data, then averaged
    train_stats = {
        'mean': X_train_temp.mean(axis=0).mean(),
        'std': X_train_temp.std(axis=0).mean(),
        'min': X_train_temp.min(axis=0).mean(),
        'max': X_train_temp.max(axis=0).mean(),
    }

    print(f"  Mean: {train_stats['mean']:.4f}")
    print(f"  Std: {train_stats['std']:.4f}")
    print(f"  Min: {train_stats['min']:.4f}")
    print(f"  Max: {train_stats['max']:.4f}")

    # Keep the scaled arrays and their statistics for the comparison plots
    scaled_data[scaler_type] = {
        'X_train': X_train_temp,
        'X_test': X_test_temp,
        'stats': train_stats,
    }

## 📊 مقایسه انواع Scaler

In [None]:
# One bar chart per statistic, comparing the scalers recorded in scaled_data
scaler_names = list(scaled_data.keys())
stats_names = ['mean', 'std', 'min', 'max']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, stat in zip(axes, stats_names):
    values = [scaled_data[scaler]['stats'][stat] for scaler in scaler_names]

    bars = ax.bar(scaler_names, values, color=['skyblue', 'lightgreen', 'lightcoral'], alpha=0.7)
    ax.set_title(f'{stat.upper()} Comparison Across Scalers')
    ax.set_ylabel(stat.upper())
    ax.grid(True, alpha=0.3)

    # bar_label puts the value above positive bars and below negative ones, so
    # labels for negative statistics (e.g. the standard-scaled minimum) no longer
    # overlap their bars. Requires matplotlib >= 3.4, which the 'seaborn-v0_8'
    # style used by this notebook already implies.
    ax.bar_label(bars, fmt='%.3f', padding=2)

plt.tight_layout()
plt.show()

## 🎯 ویژگی‌های چندجمله‌ای

In [None]:
# Degree-2 polynomial expansion of both subsets (applied to the unscaled splits)
print("🎯 Creating polynomial features (degree 2)...")
X_train_poly = preprocessor.create_polynomial_features(X_train, degree=2)
X_test_poly = preprocessor.create_polynomial_features(X_test, degree=2)

# Column counts before and after the expansion
n_original = X_train.shape[1]
n_expanded = X_train_poly.shape[1]

print(f"✅ Polynomial features created!")
print(f"  Original features: {n_original}")
print(f"  Polynomial features: {n_expanded}")
print(f"  Additional features: {n_expanded - n_original}")

# List the first ten generated column names, numbered from 1
print(f"\n📋 Sample polynomial feature names:")
for position, feature in enumerate(X_train_poly.columns[:10], start=1):
    print(f"  {position}. {feature}")

## 📋 خلاصه پیش‌پردازش

In [None]:
# Ask the preprocessor for its summary of the raw vs. cleaned feature tables
summary = preprocessor.get_preprocessing_summary(features, features_clean)

print("📋 Preprocessing Summary:")
print("=" * 50)
# One "key: value" line per summary entry
for entry in summary.items():
    key, value = entry
    print(f"{key}: {value}")

# Scaler choice and the shapes of the final model inputs and targets
print(f"\n🔧 Scaler used: {preprocessor.scaler_type}")
print(f"📊 Final data shapes:")
print(f"  Training: {X_train_scaled.shape}")
print(f"  Testing: {X_test_scaled.shape}")
print(f"  Target training: {y_train.shape}")
print(f"  Target testing: {y_test.shape}")

## 💾 ذخیره داده‌های پیش‌پردازش شده

In [None]:
# Save preprocessed data
import joblib
import json
from pathlib import Path


def _to_jsonable(obj):
    """Convert NumPy / pandas objects into plain Python values for json.

    Used as the ``default`` hook of ``json.dumps``; it is only called for
    objects the encoder cannot serialize on its own.

    Raises:
        TypeError: if ``obj`` is of a type this hook does not know how to convert.
    """
    if isinstance(obj, np.generic):  # NumPy scalars such as int64 / float64 / bool_
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        return obj.to_dict()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Make sure the output folder exists; a fresh checkout may not have it yet and
# the dumps below would otherwise fail with FileNotFoundError.
Path('../results').mkdir(parents=True, exist_ok=True)

# Save scaled data
joblib.dump(X_train_scaled, '../results/X_train_scaled.pkl')
joblib.dump(X_test_scaled, '../results/X_test_scaled.pkl')
joblib.dump(y_train, '../results/y_train.pkl')
joblib.dump(y_test, '../results/y_test.pkl')

# Save preprocessing info
preprocessing_info = {
    'scaler_type': preprocessor.scaler_type,
    'feature_names': list(feature_names),
    'original_shape': features.shape,
    'cleaned_shape': features_clean.shape,
    'training_samples': X_train_scaled.shape[0],
    'testing_samples': X_test_scaled.shape[0],
    'features_count': X_train_scaled.shape[1],
    'outlier_info': outlier_info,
    'missing_values': missing_info
}

# NOTE(review): outlier_info / missing_info come from project helpers and may hold
# NumPy scalars or pandas objects, which the stock encoder rejects - hence the
# `default` hook. Serializing to a string first means a failure cannot leave a
# truncated JSON file behind.
payload = json.dumps(preprocessing_info, indent=2, default=_to_jsonable)
with open('../results/preprocessing_info.json', 'w', encoding='utf-8') as f:
    f.write(payload)

print("💾 Preprocessed data saved successfully!")
print("  - X_train_scaled.pkl")
print("  - X_test_scaled.pkl")
print("  - y_train.pkl")
print("  - y_test.pkl")
print("  - preprocessing_info.json")