# Step 2: Missing Value Imputation

Section: Step 2: Missing Value Imputation

**Part of:** [marketing_campaign_082825_working.ipynb](./marketing_campaign_082825_working.ipynb)

In [None]:
# Setup and data loading
from utils import ProjectConfig, load_intermediate_results, save_intermediate_results
import pandas as pd

# Build the project configuration, then restore the DataFrame that the
# previous notebook (Step 1) persisted as an intermediate result.
config = ProjectConfig()
step_1_results_file = 'data_from_03_step_1.pkl'
df = load_intermediate_results(step_1_results_file, config)


In [None]:
# Measure how asymmetric the Income distribution is before choosing an
# imputation statistic.
income_column = df["Income"]
skewness = income_column.skew()
rounded_skewness = round(skewness, 2)
print(f"\nSkewness of Income distribution: {rounded_skewness}")
# Interpretation (a value of 6.76 was observed for this dataset): the Income
# distribution is extremely positively skewed, which implies that
# - the mean income sits substantially above the median income;
# - the standard deviation may not represent typical variation well;
# - most data points cluster at the lower income levels;
# - the median, not the mean, should be used as the measure of central tendency.

In [None]:
# Impute missing Income values with the median Income of the row's
# (Education, Marital_Status) group. When that group has no usable median,
# fall back to the overall Income median.
income_median = df.groupby(['Education', 'Marital_Status'])['Income'].median()
# Computed once up front so the fallback is not re-evaluated for every row.
overall_income_median = df['Income'].median()


def impute_income(row):
    """Return the Income value to store for a single row of ``df``.

    Args:
        row: A row (pandas Series) providing 'Income', 'Education' and
            'Marital_Status'.

    Returns:
        The row's own Income when it is present. Otherwise the median Income
        of the row's (Education, Marital_Status) group, or the overall Income
        median when that group is absent from ``income_median`` or when the
        group's median is itself NaN (every Income in the group is missing).
    """
    if not pd.isna(row['Income']):
        return row['Income']
    try:
        group_median = income_median[row['Education'], row['Marital_Status']]
    except KeyError:
        # The combination is not present in the lookup table.
        return overall_income_median
    # A group whose Income values are all missing has a NaN median; the lookup
    # succeeds without a KeyError, so it must be checked explicitly or the
    # value would remain missing after imputation.
    if pd.isna(group_median):
        return overall_income_median
    return group_median


df['Income'] = df.apply(impute_income, axis=1)

In [None]:
# Re-check the Income column now that imputation has run.
remaining_missing = df['Income'].isna().sum()
print("Missing values:", remaining_missing)

income_summary = df['Income'].describe().round(2)
print("Basic Statistical Data for Income column after imputation:")
print(income_summary)
# Observation recorded for this dataset: the count rose from 2216 to 2240,
# which has also altered the overall mean and median.

In [None]:
# Median Income for every Marital_Status (rows) x Education (columns) pair,
# rounded to two decimals for display.
raw_median_pivot = pd.pivot_table(
    df,
    values='Income',
    index='Marital_Status',
    columns='Education',
    aggfunc='median',
)
median_pivot = raw_median_pivot.round(2)

print("\nMedian Income Pivot Table:")
print(median_pivot)
# Observation: the per-category medians are retained in the updated pivot table.

In [None]:

# Persist the imputed DataFrame so the next notebook can pick it up.
step_2_results_file = 'data_from_04_step_2.pkl'
save_intermediate_results(df, step_2_results_file, config)
print('✓ Results saved for next notebook')