In [0]:
# Switch the session to the credit_risk database so that unqualified table
# names in later queries resolve against it.
spark.sql("USE credit_risk")

# Read every column of application_train into the working DataFrame `df`.
application_query = "SELECT * FROM application_train"
df = spark.sql(application_query)

# Render a 10-row preview of the loaded table.
preview = df.limit(10)
display(preview)

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Missing Values Analysis

# COMMAND ----------

from pyspark.sql.functions import col, count, when, lit

# Total row count; this is the denominator for every missing-value percentage.
total_rows = df.count()

# Count nulls for ALL columns in a single aggregation job instead of launching
# one filter+count job per column. when() without otherwise() evaluates to
# null for non-matching rows and count() skips nulls, so each aggregate counts
# exactly the rows in which that column is null.
null_counts_row = df.select(
    [count(when(col(column).isNull(), 1)).alias(column) for column in df.columns]
).first()

# Keep only the columns that actually have missing values.
missing_data = []
for column, null_count in zip(df.columns, null_counts_row):
    if null_count > 0:
        # null_count > 0 implies total_rows > 0, so this division cannot hit
        # ZeroDivisionError even when the source table is empty.
        null_percentage = (null_count / total_rows) * 100
        missing_data.append((column, null_count, null_percentage))

# Build the report DataFrame with an explicit schema: Spark cannot infer a
# schema from an empty list, which is what we get when no column has nulls.
missing_df = spark.createDataFrame(
    missing_data,
    "Column_Name string, Missing_Count bigint, Missing_Percentage double",
)

# Sort by missing percentage (highest first)
missing_df = missing_df.orderBy(col("Missing_Percentage").desc())

# Display results. len(missing_data) equals the report's row count, so there
# is no need to run another Spark job just to count it.
print("=== Missing Values Report ===")
print(f"Total Rows: {total_rows:,}")
print(f"Columns with Missing Values: {len(missing_data)} out of {len(df.columns)}")
print()

display(missing_df)

# COMMAND ----------

# Categorized view of the missing-values report, bucketed by severity.
print("=== Missing Values by Severity ===\n")

# Tier bounds are half-open [lower, upper), so every percentage falls into
# exactly one tier.
missing_pct = col("Missing_Percentage")
critical = missing_df.filter(missing_pct >= 70)
high = missing_df.filter((missing_pct >= 40) & (missing_pct < 70))
medium = missing_df.filter((missing_pct >= 10) & (missing_pct < 40))
low = missing_df.filter(missing_pct < 10)

# (label, tier DataFrame, recommended action). The labels previously contained
# mis-encoded bytes (UTF-8 read as cp1252); they are restored to the intended
# emoji and the "≥" sign so the printed report is readable.
severity_tiers = [
    ("🔴 CRITICAL (≥70% missing)", critical, "RECOMMEND DROP"),
    ("🟠 HIGH (40-69% missing)", high, "CONSIDER DROP OR CAREFUL IMPUTATION"),
    ("🟡 MEDIUM (10-39% missing)", medium, "IMPUTE"),
    ("🟢 LOW (<10% missing)", low, "IMPUTE"),
]

for position, (tier_label, tier_df, tier_action) in enumerate(severity_tiers):
    # Every tier after the first is preceded by a blank line.
    separator = "" if position == 0 else "\n"
    print(f"{separator}{tier_label}: {tier_df.count()} columns - {tier_action}")
    display(tier_df)

# COMMAND ----------

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Impute Missing Values - Low Missing Percentage Columns (<10%)

# COMMAND ----------

from pyspark.sql.functions import col, when, lit

print("=== Starting Imputation Process ===\n")

# Bind a second name to the loaded DataFrame. This is an alias, not a physical
# copy: the imputation steps rebind `df_imputed` to the new DataFrames they
# produce, while `df` keeps referring to the data as loaded.
df_imputed = df

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 1: Calculate Medians for Numerical Columns

# COMMAND ----------

# Calculate approximate medians (0.5 quantile, 1% relative error) for all four
# numeric columns in a single approxQuantile call, so the data is scanned once
# rather than once per column. Results come back as one list per column, in the
# same order as `median_columns`.
median_columns = [
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "EXT_SOURCE_2",
]
median_results = df.approxQuantile(median_columns, [0.5], 0.01)

# A column with no non-null values yields an empty result list; fail with a
# clear message instead of an opaque IndexError on `[0]`.
columns_without_median = [
    name for name, quantiles in zip(median_columns, median_results) if not quantiles
]
if columns_without_median:
    raise ValueError(
        f"Cannot compute a median; no non-null values in: {columns_without_median}"
    )

(
    median_days_phone,
    median_annuity,
    median_goods_price,
    median_ext_source_2,
) = [quantiles[0] for quantiles in median_results]

print("Calculated Medians:")
print(f"  DAYS_LAST_PHONE_CHANGE: {median_days_phone}")
print(f"  AMT_ANNUITY: {median_annuity}")
print(f"  AMT_GOODS_PRICE: {median_goods_price}")
print(f"  EXT_SOURCE_2: {median_ext_source_2}")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Impute with Medians

# COMMAND ----------

# Column -> median used to replace its nulls. Keeping the mapping in one place
# replaces four copy-pasted withColumn stanzas and keeps the count in the
# status message in sync with what was actually imputed.
median_fill_values = {
    "DAYS_LAST_PHONE_CHANGE": median_days_phone,
    "AMT_ANNUITY": median_annuity,
    "AMT_GOODS_PRICE": median_goods_price,
    "EXT_SOURCE_2": median_ext_source_2,
}

for column_name, median_value in median_fill_values.items():
    # Replace nulls with the median; non-null values pass through unchanged.
    df_imputed = df_imputed.withColumn(
        column_name,
        when(col(column_name).isNull(), median_value).otherwise(col(column_name)),
    )

# The check mark was previously stored as mis-encoded bytes; restored here.
print(f"✅ Median imputation complete for {len(median_fill_values)} columns")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 3: Impute with Direct Values

# COMMAND ----------

# Column -> constant used to replace its nulls: 1 for CNT_FAM_MEMBERS and 0
# for the four social-circle counters. One mapping plus a loop replaces five
# copy-pasted withColumn stanzas.
direct_fill_values = {
    "CNT_FAM_MEMBERS": 1,
    "OBS_30_CNT_SOCIAL_CIRCLE": 0,
    "DEF_30_CNT_SOCIAL_CIRCLE": 0,
    "OBS_60_CNT_SOCIAL_CIRCLE": 0,
    "DEF_60_CNT_SOCIAL_CIRCLE": 0,
}

for column_name, fill_value in direct_fill_values.items():
    # Replace nulls with the constant; non-null values pass through unchanged.
    df_imputed = df_imputed.withColumn(
        column_name,
        when(col(column_name).isNull(), fill_value).otherwise(col(column_name)),
    )

# The check mark was previously stored as mis-encoded bytes; restored here.
print(f"✅ Direct value imputation complete for {len(direct_fill_values)} columns")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 4: Impute Categorical Column

# COMMAND ----------

# Replace nulls in the categorical NAME_TYPE_SUITE column with the literal
# category "Unaccompanied"; non-null values pass through unchanged.
suite_column = "NAME_TYPE_SUITE"
suite_or_default = when(col(suite_column).isNull(), "Unaccompanied").otherwise(
    col(suite_column)
)
df_imputed = df_imputed.withColumn(suite_column, suite_or_default)

# The check mark was previously stored as mis-encoded bytes; restored here.
print("✅ Categorical imputation complete for 1 column")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 5: Verify Imputation Results

# COMMAND ----------

# Columns that the imputation steps are expected to have left without nulls.
columns_imputed = [
    "DAYS_LAST_PHONE_CHANGE",
    "CNT_FAM_MEMBERS",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "EXT_SOURCE_2",
    "NAME_TYPE_SUITE",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE"
]

# Count remaining nulls for all listed columns in ONE aggregation job rather
# than one filter+count job per column: turn each column into a 0/1 "is null"
# flag, then sum the flags. The sums come back in the order the columns were
# passed to sum().
null_flags = df_imputed.select(
    [col(column).isNull().cast("int").alias(column) for column in columns_imputed]
)
null_totals = null_flags.groupBy().sum(*columns_imputed).first()

print("\n=== Verification: Missing Values After Imputation ===")
for column, missing_count in zip(columns_imputed, null_totals):
    # Summing over zero rows yields null (None here); report that as 0.
    missing_count = missing_count or 0
    print(f"  {column}: {missing_count} missing values")

# COMMAND ----------

# Summarize the imputed dataset (row count, column count, number of columns
# imputed). Note: this reports the post-imputation state only.
print("\n=== Summary ===")
print(f"Total rows: {df_imputed.count():,}")
print(f"Total columns: {len(df_imputed.columns)}")
# The column count is derived from `columns_imputed` so the message cannot
# drift from the list; the check mark was previously mis-encoded bytes.
print(
    f"\n✅ Successfully imputed {len(columns_imputed)} columns "
    "with low missing percentages!"
)

# Display a 20-row sample restricted to the imputed columns
display(df_imputed.select(columns_imputed).limit(20))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 6: Save Imputed Dataset (Optional)

# COMMAND ----------

# Persist the imputed DataFrame as a table. NOTE: this runs unconditionally
# whenever the cell executes, and mode("overwrite") replaces any existing
# credit_risk.application_train_imputed table.
df_imputed.write.mode("overwrite").saveAsTable("credit_risk.application_train_imputed")

# The check mark was previously stored as mis-encoded bytes; restored here.
print("✅ Imputed dataset saved as 'credit_risk.application_train_imputed'")

# COMMAND ----------