In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC # Fix Numerical Values - DAYS_EMPLOYED Anomaly

# COMMAND ----------

from pyspark.sql.functions import col, when

# Switch the session to the credit_risk database and read the encoded table.
spark.sql("USE credit_risk")
df_encoded = spark.table("application_train_encoded")

# Report the table dimensions; the row count triggers a Spark job.
encoded_row_total = df_encoded.count()
encoded_column_total = len(df_encoded.columns)
print(f"Loaded data: {encoded_row_total:,} rows √ó {encoded_column_total} columns")

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 1: Verify the DAYS_EMPLOYED Anomaly

# COMMAND ----------

# Count the rows whose DAYS_EMPLOYED equals the value 365243 that this
# notebook treats as an anomaly, and express it as a share of all rows.
anomaly_count = df_encoded.filter(col("DAYS_EMPLOYED") == 365243).count()
total_count = df_encoded.count()
# Guard the division so an empty table reports 0.00% instead of raising
# ZeroDivisionError.
anomaly_pct = (anomaly_count / total_count) * 100 if total_count else 0.0

print("=== DAYS_EMPLOYED Anomaly Check ===")
print(f"Total rows: {total_count:,}")
print(f"Rows with DAYS_EMPLOYED = 365243: {anomaly_count:,}")
print(f"Percentage: {anomaly_pct:.2f}%")
print()

# Show the ten most frequent DAYS_EMPLOYED values before any change is made.
print("DAYS_EMPLOYED distribution (top 10 values):")
df_encoded.groupBy("DAYS_EMPLOYED").count().orderBy(col("count").desc()).show(10)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 2: Create Unemployment Flag

# COMMAND ----------

print("Creating IS_UNEMPLOYED flag...")

# Rows whose DAYS_EMPLOYED equals 365243 receive 1.0; every other row gets 0.0.
is_sentinel_row = col("DAYS_EMPLOYED") == 365243
unemployed_indicator = when(is_sentinel_row, 1.0).otherwise(0.0)
df_fixed = df_encoded.withColumn("IS_UNEMPLOYED", unemployed_indicator)

# Count the flagged rows and derive the remainder from the earlier total.
flagged_rows = df_fixed.filter(col("IS_UNEMPLOYED") == 1.0)
unemployed_flag_count = flagged_rows.count()
employed_flag_count = total_count - unemployed_flag_count
print(f"‚úÖ IS_UNEMPLOYED flag created")
print(f"   Unemployed (IS_UNEMPLOYED = 1): {unemployed_flag_count:,}")
print(f"   Employed (IS_UNEMPLOYED = 0): {employed_flag_count:,}")
print()

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 3: Replace 365243 with 0

# COMMAND ----------

print("Replacing DAYS_EMPLOYED = 365243 with 0...")

# Overwrite DAYS_EMPLOYED in place: 365243 becomes 0, all other values pass through.
days_employed_column = col("DAYS_EMPLOYED")
days_employed_cleaned = when(days_employed_column == 365243, 0).otherwise(days_employed_column)
df_fixed = df_fixed.withColumn("DAYS_EMPLOYED", days_employed_cleaned)

print("‚úÖ DAYS_EMPLOYED anomaly replaced with 0")
print()

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 4: Verify the Fix

# COMMAND ----------

print("=== Verification ===\n")

# The value 365243 must no longer appear in DAYS_EMPLOYED.
anomaly_after = df_fixed.filter(col("DAYS_EMPLOYED") == 365243).count()
print(f"Rows with DAYS_EMPLOYED = 365243 after fix: {anomaly_after}")

# Total number of zeros: the replaced rows plus any rows that were already 0.
zeros_count = df_fixed.filter(col("DAYS_EMPLOYED") == 0).count()
print(f"Rows with DAYS_EMPLOYED = 0: {zeros_count:,}")

# Check that the flag is consistent with the replacement. Comparing the flag
# count with zeros_count would report a false mismatch whenever some rows had
# DAYS_EMPLOYED == 0 before the fix, so instead require that
#   (a) the number of flagged rows equals the anomaly count measured in Step 1,
#   (b) every flagged row now carries DAYS_EMPLOYED == 0.
# The flag count is computed once and reused for both printed lines.
is_flagged = col("IS_UNEMPLOYED") == 1.0
flagged_count = df_fixed.filter(is_flagged).count()
flagged_zero_count = df_fixed.filter(is_flagged & (col("DAYS_EMPLOYED") == 0)).count()
flag_is_consistent = flagged_count == anomaly_count and flagged_zero_count == flagged_count
print(f"IS_UNEMPLOYED = 1 count: {flagged_count:,}")
print(f"Match: {flag_is_consistent}")
print()

# Show the ten most frequent DAYS_EMPLOYED values after the replacement.
print("DAYS_EMPLOYED distribution after fix (top 10 values):")
df_fixed.groupBy("DAYS_EMPLOYED").count().orderBy(col("count").desc()).show(10)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 5: Check Statistics

# COMMAND ----------

# Summary statistics of DAYS_EMPLOYED after the fix, for all rows and for
# the rows not flagged as unemployed.
print("=== DAYS_EMPLOYED Statistics ===\n")

# Select employed applicants through the IS_UNEMPLOYED flag rather than with
# DAYS_EMPLOYED != 0: after the replacement, 0 is shared by flagged rows and by
# any row whose DAYS_EMPLOYED was already 0, so filtering on the value would
# drop the latter from the "employed" statistics.
employed_only = df_fixed.filter(col("IS_UNEMPLOYED") == 0.0)

stats = df_fixed.select(
    col("DAYS_EMPLOYED")
).summary("min", "25%", "50%", "75%", "max", "mean", "stddev")

print("Statistics (including unemployed = 0):")
stats.show()

# Label updated to describe the flag-based filter actually applied.
print("\nStatistics (employed only, IS_UNEMPLOYED = 0):")
employed_only.select("DAYS_EMPLOYED").summary("min", "25%", "50%", "75%", "max", "mean").show()

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 6: Verify Column Count

# COMMAND ----------

# Compare the column counts of the input and the fixed DataFrame.
original_column_total = len(df_encoded.columns)
fixed_column_total = len(df_fixed.columns)

print("=== Column Count ===")
print(f"Original columns: {original_column_total}")
print(f"After adding IS_UNEMPLOYED: {fixed_column_total}")
print(f"New columns added: {fixed_column_total - original_column_total}")
print()

# Report whether the new flag column is present in the fixed DataFrame.
flag_column_present = "IS_UNEMPLOYED" in df_fixed.columns
print(
    "‚úÖ IS_UNEMPLOYED column successfully added"
    if flag_column_present
    else "‚ùå ERROR: IS_UNEMPLOYED column not found"
)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 7: Save Fixed Dataset

# COMMAND ----------

# Persist the fixed DataFrame, replacing any existing table of the same name.
fixed_table_name = "credit_risk.application_train_fixed"
df_fixed.write.mode("overwrite").saveAsTable(fixed_table_name)

print("‚úÖ Fixed dataset saved as 'credit_risk.application_train_fixed'")
print()
print("=== Summary ===")
# Bullet lines of the summary, printed in order below.
summary_lines = [
    f"‚Ä¢ Fixed DAYS_EMPLOYED anomaly (365243 ‚Üí 0)",
    f"‚Ä¢ Created IS_UNEMPLOYED flag",
    f"‚Ä¢ Affected rows: {anomaly_count:,} ({anomaly_pct:.2f}%)",
    f"‚Ä¢ Total rows: {df_fixed.count():,}",
    f"‚Ä¢ Total columns: {len(df_fixed.columns)}",
]
for summary_line in summary_lines:
    print(summary_line)
print()
print("üéâ Ready for feature scaling!")

# COMMAND ----------

# Show the first 20 rows of the key columns for a visual check.
print("Sample of fixed data:")
sample_columns = ["SK_ID_CURR", "DAYS_EMPLOYED", "IS_UNEMPLOYED", "TARGET"]
display(df_fixed.select(*sample_columns).limit(20))

# COMMAND ----------

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Apply StandardScaler to Numerical Features

# COMMAND ----------

from pyspark.sql.functions import col
import pandas as pd
from sklearn.preprocessing import StandardScaler

print("=== Starting Feature Scaling ===\n")

# Switch the session to the credit_risk database and read the fixed table.
spark.sql("USE credit_risk")
df_fixed = spark.table("application_train_fixed")

# Report the table dimensions; the row count triggers a Spark job.
fixed_row_total = df_fixed.count()
fixed_column_total = len(df_fixed.columns)
print(f"Loaded data: {fixed_row_total:,} rows √ó {fixed_column_total} columns")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 1: Identify Columns to Scale vs Not Scale

# COMMAND ----------

# Get all column names, in table order
all_columns = df_fixed.columns

# Columns to EXCLUDE from scaling
columns_to_exclude = [
    'SK_ID_CURR',  # ID column - don't scale
    'TARGET'       # Target variable - don't scale
]

# Spark SQL type names, as reported by DataFrame.dtypes, that can be compared
# with the literals 0 and 1. Decimal types carry a "(p,s)" suffix and are
# matched by prefix below.
_NUMERIC_TYPE_NAMES = {"tinyint", "smallint", "int", "bigint", "float", "double"}


def _is_numeric_type(type_name):
    """Return True when a DataFrame.dtypes type string names a numeric type."""
    return type_name in _NUMERIC_TYPE_NAMES or type_name.startswith("decimal")


def _zero_one_check_expr(column_name):
    """Build a Spark SQL aggregate that is true when a column holds only 0 and 1.

    The expression requires the minimum to be 0, the maximum to be 1 and no
    non-null value other than 0 or 1. Nulls are ignored by all three
    aggregates; a column that is entirely null evaluates to NULL (not binary).
    The result is aliased to the column's own name.
    """
    quoted = "`" + column_name.replace("`", "``") + "`"
    return (
        f"(min({quoted}) = 0 AND max({quoted}) = 1 "
        f"AND count(CASE WHEN {quoted} NOT IN (0, 1) THEN 1 END) = 0) AS {quoted}"
    )


# Binary/one-hot encoded columns (0/1 values) - don't scale these either.
# These are columns created from categorical encoding.
#
# Numeric columns are checked for actual 0/1 content in ONE aggregation pass
# instead of one distinct-count job per column. Compared with a plain
# "exactly two distinct values" rule this no longer treats two-valued columns
# such as {5, 7} or {1, null} as binary, and it does recognise 0/1 columns
# that also contain nulls.
column_types = dict(df_fixed.dtypes)
candidate_columns = [name for name in all_columns if name not in columns_to_exclude]
numeric_candidates = [name for name in candidate_columns if _is_numeric_type(column_types[name])]
numeric_candidate_set = set(numeric_candidates)

zero_one_columns = set()
if numeric_candidates:
    # A global aggregate always yields exactly one row; values line up
    # positionally with numeric_candidates.
    check_row = df_fixed.selectExpr(
        *[_zero_one_check_expr(name) for name in numeric_candidates]
    ).first()
    zero_one_columns = {
        name for name, is_zero_one in zip(numeric_candidates, check_row) if is_zero_one
    }

# Non-numeric columns (if any) cannot be compared with 0/1, so they keep the
# two-distinct-values rule; this costs one Spark job per such column.
binary_columns = [
    name for name in candidate_columns
    if name in zero_one_columns
    or (name not in numeric_candidate_set
        and df_fixed.select(name).distinct().count() == 2)
]

print(f"Total columns: {len(all_columns)}")
print(f"Binary/one-hot encoded columns (will NOT scale): {len(binary_columns)}")
print(f"\nFirst 20 binary columns:")
for col_name in binary_columns[:20]:
    print(f"  ‚Ä¢ {col_name}")

# COMMAND ----------

# Every column that is neither excluded nor binary will be scaled.
# Table order is preserved; the set is only used for membership tests.
columns_not_scaled = set(columns_to_exclude) | set(binary_columns)
numerical_columns_to_scale = [
    column_name for column_name in all_columns
    if column_name not in columns_not_scaled
]

PREVIEW_LIMIT = 30  # number of column names listed before truncating
scale_column_total = len(numerical_columns_to_scale)

print(f"\n=== Columns to Scale ===")
print(f"Total numerical columns to scale: {scale_column_total}")
print(f"\nNumerical columns that will be scaled:")
for col_name in numerical_columns_to_scale[:PREVIEW_LIMIT]:
    print(f"  ‚Ä¢ {col_name}")
if scale_column_total > PREVIEW_LIMIT:
    print(f"  ... and {scale_column_total - PREVIEW_LIMIT} more")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 2: Convert to Pandas for Scaling

# COMMAND ----------

print("\nConverting to Pandas (this may take 2-3 minutes)...")
# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
df_pandas = df_fixed.toPandas()

pandas_row_total, pandas_column_total = df_pandas.shape
print(f"‚úÖ Converted to Pandas: {pandas_row_total:,} rows √ó {pandas_column_total} columns")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 3: Apply StandardScaler

# COMMAND ----------

print("\nApplying StandardScaler to numerical columns...")

# Work on a copy so df_pandas keeps the unscaled values for later comparison.
df_scaled = df_pandas.copy()

# Fit the scaler on the selected columns and write the transformed values
# back into the copy; all other columns are left as they are.
scaler = StandardScaler()
features_before_scaling = df_pandas[numerical_columns_to_scale]
features_after_scaling = scaler.fit_transform(features_before_scaling)
df_scaled[numerical_columns_to_scale] = features_after_scaling

scaled_column_total = len(numerical_columns_to_scale)
print(f"‚úÖ StandardScaler applied to {scaled_column_total} columns")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 4: Verify Scaling

# COMMAND ----------

print("\n=== Verification: Scaled vs Original ===\n")

# Columns to spot-check; any that were not scaled are skipped.
example_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DAYS_BIRTH', 'DAYS_EMPLOYED']

for col_name in example_cols:
    if col_name not in numerical_columns_to_scale:
        continue
    before_series = df_pandas[col_name]
    after_series = df_scaled[col_name]
    print(f"{col_name}:")
    print(f"  Original - Mean: {before_series.mean():.2f}, Std: {before_series.std():.2f}")
    print(f"  Scaled   - Mean: {after_series.mean():.2e}, Std: {after_series.std():.2f}")
    print(f"  Original range: [{before_series.min():.2f}, {before_series.max():.2f}]")
    print(f"  Scaled range:   [{after_series.min():.2f}, {after_series.max():.2f}]")
    print()

# COMMAND ----------

# List the distinct values of the first five binary columns in the scaled frame.
print("=== Binary Columns (Should NOT be scaled) ===")
example_binary = binary_columns[:5]

for col_name in example_binary:
    unique_values = list(df_scaled[col_name].unique())
    unique_values.sort()
    print(f"{col_name}: unique values = {unique_values}")

print("\n‚úÖ Binary columns preserved (still 0 and 1)")

# COMMAND ----------

# Print the distinct values and value counts of TARGET in the scaled frame.
target_series = df_scaled['TARGET']
target_unique_values = sorted(target_series.unique())

print("\n=== Target Variable ===")
print(f"TARGET unique values: {target_unique_values}")
print("TARGET distribution:")
print(target_series.value_counts())
print("\n‚úÖ TARGET column preserved")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 5: Summary Statistics

# COMMAND ----------

print("\n=== Summary Statistics for Scaled Numerical Columns ===\n")

# describe() yields one row per statistic and one column per scaled feature.
scaled_stats = df_scaled[numerical_columns_to_scale].describe()
print(scaled_stats)

# Average the per-column absolute means and standard deviations into two numbers.
average_abs_mean = scaled_stats.loc['mean'].abs().mean()
average_std = scaled_stats.loc['std'].mean()

print("\nKey observations:")
print(f"‚Ä¢ Mean of scaled columns should be ~0: {average_abs_mean:.6f}")
print(f"‚Ä¢ Std of scaled columns should be ~1: {average_std:.6f}")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 6: Convert Back to Spark and Save

# COMMAND ----------

print("\nConverting back to Spark DataFrame...")

# Build a Spark DataFrame from the scaled pandas DataFrame.
df_scaled_spark = spark.createDataFrame(df_scaled)

print(f"‚úÖ Converted back to Spark")

# Persist the scaled data, replacing any existing table of the same name.
print("Saving to table...")
scaled_table_name = "credit_risk.application_train_scaled"
scaled_writer = df_scaled_spark.write.mode("overwrite")
scaled_writer.saveAsTable(scaled_table_name)

print("‚úÖ Scaled dataset saved as 'credit_risk.application_train_scaled'")

# COMMAND ----------
# MAGIC %md
# MAGIC ### Step 7: Final Summary

# COMMAND ----------

# Closing report: banner, dataset facts, scaling method, output table, next steps.
banner_rule = "=" * 60
scaled_row_total, scaled_column_total = df_scaled.shape

print("\n" + banner_rule)
print("SCALING COMPLETE!")
print(banner_rule)

print("\nüìä Dataset Summary:")
dataset_facts = [
    ("Total rows", f"{scaled_row_total:,}"),
    ("Total columns", f"{scaled_column_total}"),
    ("Columns scaled", f"{len(numerical_columns_to_scale)}"),
    ("Binary columns (not scaled)", f"{len(binary_columns)}"),
    ("Excluded columns", f"{len(columns_to_exclude)}"),
]
for fact_label, fact_value in dataset_facts:
    print(f"  ‚Ä¢ {fact_label}: {fact_value}")

print("\n‚úÖ Scaling Method: StandardScaler")
for method_note in ("Formula: (x - mean) / std", "Result: Mean ‚âà 0, Std ‚âà 1"):
    print(f"  ‚Ä¢ {method_note}")

print("\nüìÅ Saved as: credit_risk.application_train_scaled")

print("\nüéâ Dataset is now ready for:")
for next_step in ("Correlation analysis", "Feature selection", "Logistic regression modeling"):
    print(f"  ‚Ä¢ {next_step}")

# COMMAND ----------

# Render the first ten rows of the scaled pandas DataFrame.
print("\n=== Sample of Scaled Data ===")
scaled_preview = df_scaled.head(10)
display(scaled_preview)

# COMMAND ----------