# CSC3831 Final Assessment - Part I: Data Engineering


In [None]:
# Loading in standard packages for analysis, feel free to add any extra packages you'd like to use here
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import missingno as msno

from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, root_mean_squared_error

# Download the corrupted housing CSV that is analysed and imputed below
CORRUPTED_HOUSES_URL = (
    'https://raw.githubusercontent.com/PaoloMissier/CSC3831-2021-22/main/'
    'IMPUTATION/TARGET-DATASETS/CORRUPTED/HOUSES/houses_0.1_MAR.csv'
)
houses_corrupted = pd.read_csv(CORRUPTED_HOUSES_URL, header=0)
# "Unnamed: 0" is an artifact column of the file, not a feature; discard it
houses_corrupted = houses_corrupted.drop(columns=["Unnamed: 0"])

Above we've loaded in a corrupted version of a housing dataset. The anomalies need to be dealt with and missing values imputed.

### 1. Data Understanding [7]
- Perform ad hoc EDA to understand and describe what you see in the raw dataset
  - Include graphs, statistics, and written descriptions as appropriate
  - Any extra information about the data you can provide here is useful, think about performing an analysis (ED**A**), what would you find interesting or useful?
- Identify features with missing records, outlier records


In [None]:
# First look at the raw data: its dimensions and the dtype of every column
dataset_shape = houses_corrupted.shape
print("Dataset Shape:", dataset_shape)
print("\nFeature Types:", houses_corrupted.dtypes, sep="\n")

# describe() reports count, mean, std, min, quartiles and max per numeric column
print("\nStatistical Summary:", houses_corrupted.describe(), sep="\n")

**Summary:** I printed the size of the dataset, its types of data, and basic statistics for each feature.

**Reasoning:**
- Provided an overview of the dataset's size and structure by checking its shape
- Examined data types to understand and categorize the different features
- Used statistical summaries to identify which features need attention, through their center, spread, and missing values [1], [2]

**Results Analysis:**
* There are 20,640 rows, with 9 columns each.
* The features can be divided into three categories:
  1. House Features
    * median_house_value
    * housing_median_age
    * total_rooms
    * total_bedrooms
  2. Demographic Features
    * median_income
    * population
    * households
  3. Geographic Features
    * latitude
    * longitude
* Three columns have missing data: median_income, housing_median_age, and population



In [None]:
# Distribution analysis with skewness
# One histogram (with a KDE curve) per column on a 3x3 grid; each panel title
# reports that column's sample skewness.
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
axes = axes.ravel()

for idx, col in enumerate(houses_corrupted.columns):
    panel = axes[idx]
    skewness = houses_corrupted[col].skew()
    sns.histplot(data=houses_corrupted, x=col, ax=panel, kde=True)
    panel.set_title(f'{col}\nSkewness: {skewness:.2f}')

plt.tight_layout()
plt.show()

# Compare classical vs robust statistics for skewed variables
print("\nComparison of Classical vs Robust Statistics:")
non_normal_cols = ['median_income', 'total_rooms', 'total_bedrooms',
                   'population', 'households', 'latitude', 'longitude']

for col in non_normal_cols:
    series = houses_corrupted[col]
    centre = series.median()
    # Median absolute deviation, scaled by 1.4826 (the conventional factor
    # that puts MAD on the same scale as a standard deviation)
    scaled_mad = 1.4826 * abs(series - centre).median()

    print(f"\n{col}:")
    print("Classical measures:")
    print(f"Mean: {series.mean():.2f}")
    print(f"Standard Deviation: {series.std():.2f}")
    print("Robust measures:")
    print(f"Median: {centre:.2f}")
    print(f"MAD: {scaled_mad:.2f}")

**Summary:**
I created histogram plots for each feature to see their distributions and calculated skewness values. I also compared classic statistics (mean, standard deviation) with robust statistics (median, MAD) for features that weren't normally distributed.

**Reasoning:**
- Plotted distributions and calculated skewness to further understand feature distributions [2]
- Compared classical and robust statistics to see which will be most reliable for each feature [3]

**Results Analysis:**

- Non-normal features can be grouped into three types:
  - Very skewed features (skewness > 3)
    * total_rooms, total_bedrooms, population, and households
  - Somewhat skewed features (skewness 0.5-3)
    * median_house_value and median_income are less skewed but still not normal
  - Bimodal features
    * latitude and longitude show two distinct groups, likely showing different city clusters

- There are some extreme high values pulling the means and standard deviations up
  * The means are always higher than the medians
  * Standard deviations are much larger than MAD values


In [None]:
def calculate_iqr_outliers(data, multiplier=1.5):
    """Flag the values of ``data`` that fall outside its IQR fences.

    The fences sit ``multiplier`` interquartile ranges below the first
    quartile and above the third quartile.

    Args:
        data: pandas Series (or DataFrame) of numeric values. Missing values
            are ignored when the quartiles are computed and are never flagged,
            because comparisons against NaN evaluate to False.
        multiplier: Width of the fences in units of IQR. The default of 1.5
            reproduces the original fixed behaviour.

    Returns:
        Boolean object of the same shape and index as ``data``; True marks an
        outlier.
    """
    q1, q3 = data.quantile(0.25), data.quantile(0.75)
    fence_width = multiplier * (q3 - q1)
    return (data < q1 - fence_width) | (data > q3 + fence_width)

# Box plots for initial outlier visualization: one panel per column on a
# 3x3 grid (subplot positions are 1-based)
plt.figure(figsize=(15, 10))
for position, col in enumerate(houses_corrupted.columns, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=houses_corrupted[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

# IQR-based outlier detection
print("IQR-based Outlier Analysis:")
for col in houses_corrupted.columns:
    column = houses_corrupted[col]
    outliers = houses_corrupted[calculate_iqr_outliers(column)]
    n_outliers = len(outliers)

    # Percentage is taken over the non-missing values of the column
    outlier_percentage = (n_outliers / len(column.dropna())) * 100

    # Skip columns where outliers are not a meaningful share (>1%)
    if not outlier_percentage > 1:
        continue

    print(f"\n{col}:")
    print(f"Number of outliers: {n_outliers}")
    print(f"Percentage of outliers: {outlier_percentage:.2f}%")

**Summary:**

I created box plots for each feature to visually spot outliers and then used the IQR method to find outliers statistically. I focused on features where outliers made up more than 1% of the data.

**Reasoning:**
- The IQR method is good for a first check because of its simplicity [1]
- Setting a 1% threshold focuses on features where outliers might actually be a problem

**Results Analysis:**

- Size features had the most outliers
  * total_rooms: 6.24%
  * total_bedrooms: 6.21%
  * households: 5.91%
  * population: 5.80%
- Value features also had outliers
  * median_house_value: 5.19%
  * median_income: 3.03%
- Most features have outliers making up more than 5% of their data

In [None]:
# Try different transformations on skewed columns
skewed_cols = ['total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
print("Skewness Comparison Across Different Transformations:")

# Work on a copy so the raw data stays untouched; only the skewed columns
# are overwritten below
transformed_data = houses_corrupted.copy()
for col in skewed_cols:
    raw = houses_corrupted[col]
    cube_rooted = np.cbrt(raw)

    # Skewness of the raw column and of each candidate transformation
    original_skew = raw.skew()
    log_skew = np.log(raw).skew()
    sqrt_skew = np.sqrt(raw).skew()
    cbrt_skew = cube_rooted.skew()

    print(f"\n{col}:")
    print(f"Original skewness: {original_skew:.3f}")
    print(f"Log transform skewness: {log_skew:.3f}")
    print(f"Square root transform skewness: {sqrt_skew:.3f}")
    print(f"Cube root transform skewness: {cbrt_skew:.3f}")

    # The "best" transformation is the one whose skewness is closest to zero
    skews = {'log': abs(log_skew), 'sqrt': abs(sqrt_skew), 'cbrt': abs(cbrt_skew)}
    best_transform = min(skews, key=skews.get)
    print(f"Best transformation: {best_transform}")

    # The cube root is applied to every skewed column, whichever one won above
    transformed_data[col] = cube_rooted

# Visualize original vs transformed distributions: one row per column,
# raw data on the left and cube-rooted data on the right
fig, axes = plt.subplots(len(skewed_cols), 2, figsize=(15, 4*len(skewed_cols)))
for (raw_ax, transformed_ax), col in zip(axes, skewed_cols):
    sns.histplot(data=houses_corrupted, x=col, ax=raw_ax, kde=True)
    raw_ax.set_title(f'Original {col}\nSkewness: {houses_corrupted[col].skew():.3f}')

    sns.histplot(data=transformed_data[col], ax=transformed_ax, kde=True)
    transformed_ax.set_title(f'Cube Root Transformed {col}\nSkewness: {transformed_data[col].skew():.3f}')

plt.tight_layout()
plt.show()

**Summary:**

I tested three different ways to transform the skewed data:
- Log transformation
- Square root transformation
- Cube root transformation

Then I compared how well each method worked by looking at the skewness before and after. I chose the transformation that performed best on average and applied it to all of the skewed features.

Finally, I made plots to see the differences between the best transformed distributions and the original distributions visually.

**Reasoning:**
- Finding correlations between features is easier when they have normal distributions [1]
- Using the same transformation for most features maintains consistency
- I didn't transform latitude and longitude because transformations had little effect on their bimodality

**Results Analysis:**
- median_income
  * Started with skewness of 1.588
  * Log transformation worked best, reducing skewness to 0.210
- All other skewed features
  * Started with high skewness (3.41 - 4.83)
  * Cube root transformation worked best
- The histograms show the data looks much more normal after transformation



In [None]:
# Correlation Analysis
# Pairwise correlations across every column of the raw (corrupted) data
correlation_matrix = houses_corrupted.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Original Feature Correlations")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

# Same view restricted to the cube-root transformed skewed columns
transformed_correlation = transformed_data[skewed_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(transformed_correlation, annot=True, cmap='coolwarm', center=0)
plt.title("Correlations of Transformed Features")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

# Thematic feature groups, each paired with the title of its heatmap
size_cols = ['total_rooms', 'total_bedrooms', 'households', 'population']
geo_cols = ['latitude', 'longitude']
value_cols = ['median_house_value', 'median_income']

feature_groups = [
    (size_cols, "Size-related Features"),
    (geo_cols, "Geographic Features"),
    (value_cols, "Value-related Features"),
]

for cols, title in feature_groups:
    group_correlation = houses_corrupted[cols].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(group_correlation, annot=True, cmap='coolwarm', center=0)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# Scatter plots for one key pair per group, annotated with the coefficient
# taken from the full correlation matrix
important_pairs = [
    ('total_rooms', 'total_bedrooms'),
    ('latitude', 'longitude'),
    ('median_house_value', 'median_income'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel, (feat1, feat2) in zip(axes, important_pairs):
    sns.scatterplot(data=houses_corrupted, x=feat1, y=feat2, ax=panel, alpha=0.5)
    corr = correlation_matrix.loc[feat1, feat2]
    panel.set_title(f'{feat1} vs {feat2}\nCorrelation: {corr:.3f}')

plt.tight_layout()
plt.show()

**Summary:**

I created several correlation analyses using heatmaps and scatter plots:
- A correlation heatmap for all features
- A heatmap just for the transformed features
- Separate heatmaps for related feature groups (size, location, value)
- Scatter plots for some key relationships

**Reasoning:**
- Created correlation heatmaps to understand relationships between features [1], [2], [3]
- Grouped transformed features to demonstrate the effect of transforming the data on correlation
- Grouped related features together to identify strong group relationships
- Used scatter plots to visualize the strongest correlations and verify their patterns [1], [2]

**Results Analysis:**
- Size features are strongly related
  * total_rooms and total_bedrooms are the most highly correlated (0.98)
- Geographic features
  * latitude and longitude have a strong negative correlation (-0.92)
- Value features
  * median_income and median_house_value are moderately correlated (0.69)



In [None]:
# Missing Value Analysis

# Basic missing value summary: per-column count and share of missing entries
missing_counts = houses_corrupted.isnull().sum()
missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing Percentage': (missing_counts / len(houses_corrupted)) * 100
})
print("Missing Value Summary:")
print(missing_summary)

# Visualize missing patterns
plt.figure(figsize=(12, 6))
msno.matrix(houses_corrupted)
plt.title("Missing Value Pattern")
plt.show()

# Analyze overlap between features with missing values
missing_cols = ['median_income', 'housing_median_age', 'population']
print("\nMissing Value Overlap Analysis:")
for i, col1 in enumerate(missing_cols):
    for col2 in missing_cols[i + 1:]:
        both_missing = (houses_corrupted[col1].isnull() &
                        houses_corrupted[col2].isnull()).sum()
        # Express the overlap as a share of col1's missing rows, computed from
        # the data instead of a hard-coded row count
        col1_missing = missing_counts[col1]
        overlap_pct = (both_missing / col1_missing) * 100 if col1_missing else 0.0
        print(f"{col1} and {col2} overlap: {both_missing} rows ({overlap_pct:.2f}%)")

# Compare medians between missing and present values
complete_cols = ['median_house_value', 'households', 'total_rooms', 'total_bedrooms', 'latitude', 'longitude']
print("\nSignificant Patterns in Missingness (>10% difference):")

for missing_col in missing_cols:
    missing_mask = houses_corrupted[missing_col].isnull()

    for complete_col in complete_cols:
        missing_median = houses_corrupted.loc[missing_mask, complete_col].median()
        present_median = houses_corrupted.loc[~missing_mask, complete_col].median()

        # A relative difference is undefined against a zero baseline
        if present_median == 0:
            continue

        # Divide by the magnitude of the baseline so the sign of the result
        # always follows (missing - present), even for negative-valued
        # columns such as longitude
        diff_percent = ((missing_median - present_median) / abs(present_median)) * 100

        if abs(diff_percent) > 10:
            print(f"\n{complete_col} when {missing_col} is missing vs present:")
            print(f"Missing: {missing_median:.2f}, Present: {present_median:.2f}")
            print(f"Difference: {diff_percent:.2f}%")

**Summary:**
- Calculated basic missing value stats
- Created a visual pattern of missing values using missingno
- Checked how missing values overlap between features
- Compared values in rows with and without missing data

**Reasoning:**
- Generated summary statistics to measure the scale of the missing data
- Visualized missing patterns to check for any consistent gaps in the data [3]
- Analyzed missing value overlap to determine if missingness is related between incomplete records
- Compared statistics between complete and incomplete records to classify the missing data type (MCAR, MAR, or MNAR) [4]

**Results Analysis:**
- Three features have exactly 10% missing values:
  * median_income
  * housing_median_age
  * population
- The missing values don't have much overlap (9-12% overlap)
  - The low overlap in missing values suggests missing values occur independently from one another
- When rows with and without missing values are compared, patterns emerge
  * Removing missing median_income rows results in higher median house values
  * Removing missing housing_median_age rows results in larger houses
  * Removing missing population rows results in much larger houses and higher values
- The relationship between missing values and other features suggests this is Missing at Random (MAR) data


### 2. Outlier Identification [10]
- Utilise a statistical outlier detection approach (i.e., **no** KNN, LOF, 1Class SVM)
- Utilise an algorithmic outlier detection method of your choice
- Compare results and decide what to do with identified outliers
  - Include graphs, statistics, and written descriptions as appropriate
- Explain what you are doing, and why your analysis is appropriate
- Comment on benefits/detriments of statistical and algorithmic outlier detection approaches


In [None]:
# Statistical Outlier Detection
def calculate_robust_zscore_outliers(data, threshold=3):
    """Flag values whose robust z-score exceeds ``threshold`` in magnitude.

    The robust z-score replaces mean and standard deviation with the median
    and the scaled median absolute deviation (MAD * 1.483).

    Args:
        data: pandas Series of numeric values. Missing values are skipped when
            the median and MAD are computed and are never flagged.
        threshold: Absolute robust z-score above which a value counts as an
            outlier.

    Returns:
        Boolean Series aligned with ``data``; True marks an outlier.

    Note:
        If the MAD is zero (more than half the values equal the median), the
        division yields infinite scores, so every value that differs from the
        median is flagged.
    """
    median = data.median()
    deviations = data - median
    # Use the pandas median here: unlike np.median it skips NaN, so a column
    # that still contains missing values no longer produces a NaN MAD and an
    # all-False result
    mad = deviations.abs().median() * 1.483
    rob_z_scores = deviations / mad
    return rob_z_scores.abs() > threshold

# Per-column non-missing values, kept for reuse by the later outlier cells
data = {}
# Robust z-score mask and outlier percentage for every column
robust_results = {}

print("Outlier Detection Results:\n")
for col in houses_corrupted.columns:
    col_data = houses_corrupted[col].dropna()
    data[col] = col_data
    n_present = len(col_data)

    # Run both statistical detectors on the same non-missing values
    robust_outliers = calculate_robust_zscore_outliers(col_data)
    iqr_outliers = calculate_iqr_outliers(col_data)

    robust_pct = (robust_outliers.sum() / n_present) * 100
    iqr_pct = (iqr_outliers.sum() / n_present) * 100
    robust_results[col] = {
        'outliers': robust_outliers,
        'percentage': robust_pct,
    }

    # Report and plot only when at least one method flags more than 1%
    if not (robust_pct > 1 or iqr_pct > 1):
        continue

    print(f"\n{col}:")
    print(f"Robust Z-score outliers: {robust_pct:.2f}%")
    print(f"IQR outliers: {iqr_pct:.2f}%")

    # All points in grey, then each method's outliers drawn on top; the
    # x-axis is the position within the non-missing values
    plt.figure(figsize=(10, 4))
    plt.scatter(range(n_present), col_data, c='lightgray', s=10, label='Normal')
    overlays = (
        (robust_outliers, 'red', 0.4, 'Robust Z-score'),
        (iqr_outliers, 'blue', 0.2, 'IQR'),
    )
    for mask, colour, opacity, label in overlays:
        plt.scatter(np.where(mask)[0], col_data[mask],
                    c=colour, s=10, alpha=opacity, label=label)
    plt.title(f'Outlier Detection: {col}')
    plt.ylabel('Value')
    plt.xlabel('Index')
    plt.legend()
    plt.show()

**Summary:**

I used two statistical methods to find outliers:
- Robust z-score method using median and MAD
- IQR method using 1.5 times the interquartile range

I calculated the percentage of outliers for each method and made plots showing where these outliers appear.

**Reasoning:**
- Used robust z-score to handle skewed distributions since it uses median/MAD instead of mean/standard deviation [5]
- Used IQR method as a comparison point since it's simple [1]
- Visualized results to intuitively check outliers and compare method agreement [6]
- Focused only on significant outlier percentages (>1%) to identify notable features


**Results Analysis:**
- Size features had the most outliers (~6%)
- Value features had fewer outliers
  * median_house_value: ~5%
  * median_income: ~3%
- Geographic features had the least outliers
  * Only latitude showed significant outliers at 2.5% with the robust z-score method
- The plots show both methods usually agreed on what was an outlier
- The robust z-score method is the most appropriate choice because it uses median and MAD, which works better with the skewed features




In [None]:
# KNN (k=30) mask and percentage per column, reused by the later comparisons
knn_results = {}
print("Algorithmic Outlier Detection Results:\n")
for col in houses_corrupted:
    # Each detector is run on one column at a time, as an (n, 1) array of its
    # non-missing values
    col_data = data[col].values.reshape(-1, 1)
    n_points = len(col_data)

    # KNN global detection: score each point by its mean distance to the 30
    # nearest neighbours and flag the points above the 95th percentile
    nbrs = NearestNeighbors(n_neighbors=30).fit(col_data)
    avg_distances = nbrs.kneighbors(col_data)[0].mean(axis=1)
    knn_outliers = avg_distances > np.percentile(avg_distances, 95)
    knn_pct = (knn_outliers.sum() / n_points) * 100

    if knn_pct > 1:
        knn_results[col] = {
            'outliers': knn_outliers,
            'percentage': knn_pct,
        }

    # Local Outlier Factor; fit_predict labels outliers as -1
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.025)
    lof_outliers = lof.fit_predict(col_data) == -1
    lof_pct = (lof_outliers.sum() / n_points) * 100

    # Isolation Forest with a fixed seed for repeatable results
    iso_f = IsolationForest(random_state=42)
    iso_outliers = iso_f.fit_predict(col_data) == -1
    iso_pct = (iso_outliers.sum() / n_points) * 100

    # One-Class SVM
    ocsvm = OneClassSVM(nu=0.03)
    svm_outliers = ocsvm.fit_predict(col_data) == -1
    svm_pct = (svm_outliers.sum() / n_points) * 100

    # Report and plot only when some method flags more than 1%
    if not max(knn_pct, lof_pct, iso_pct, svm_pct) > 1:
        continue

    print(f"\n{col}:")
    print(f"KNN Outliers: {knn_pct:.2f}%")
    print(f"LOF Outliers: {lof_pct:.2f}%")
    print(f"Isolation Forest Outliers: {iso_pct:.2f}%")
    print(f"One-Class SVM Outliers: {svm_pct:.2f}%")

    # One panel per method: all points in grey, that method's outliers on top
    panels = (
        ('KNN', knn_outliers, 'red'),
        ('LOF', lof_outliers, 'purple'),
        ('Isolation Forest', iso_outliers, 'orange'),
        ('SVM', svm_outliers, 'green'),
    )
    fig, axes = plt.subplots(1, 4, figsize=(20, 5))
    for panel, (method, outlier_mask, color) in zip(axes, panels):
        panel.scatter(range(n_points), col_data, c='lightgray', s=10, label='Normal')
        panel.scatter(np.where(outlier_mask)[0], col_data[outlier_mask],
                      c=color, s=10, label='Outliers')
        panel.set_title(f'{method} Outliers: {col}')
        panel.set_ylabel('Value')
        panel.set_xlabel('Index')
        panel.legend()

    plt.tight_layout()
    plt.show()

**Summary:**

I used four different algorithmic methods to find outliers:
- KNN Anomaly Detection
- Local Outlier Factor
- Isolation Forest
- One-Class SVM

I calculated the percentage of outliers for each method and made plots showing where these outliers appear.

**Reasoning:**
- Implemented multiple algorithms to compare their performance on different feature types [5]
- Used KNN and LOF to identify global and local outliers respectively [5] [8]
- Used Isolation Forest as a cutting-edge tree-based method [5]
- Used One-Class SVM to test a boundary-based method [5], [8]
- Visualized results to intuitively check outliers and compare method agreement [6]


**Results Analysis:**
- KNN was most consistent
  * Found ~5% outliers in most features
  * Showed clear separation between normal and outlier points
- LOF was more conservative
  * Found 2-2.5% outliers
  * Sometimes marked points in dense areas as outliers
- Isolation Forest found too many outliers
  * Found 17-63% outliers
  * Seemed especially sensitive with normally distributed features
- One-Class SVM was inconsistent
  * Found between 2-13% outliers
  * Showed strange strip patterns in its detections
- KNN anomaly detection is the best performing algorithmic method
  - KNN works best with global anomalies [5]
  - It avoids the density issues seen with LOF
  - It's more reliable than Isolation Forest, which found too many outliers
  - It's simpler to understand and tune than One-Class SVM

In [None]:
# Compare k=20 configuration with stored k=30 results
print("KNN Parameter Comparison Results:\n")
for col, k30 in knn_results.items():
    col_data = data[col].values.reshape(-1, 1)
    n_points = len(col_data)

    # Same scoring as the stored k=30 run, but over 20 neighbours: mean
    # neighbour distance, flagged above its 95th percentile
    neighbour_dists, _ = NearestNeighbors(n_neighbors=20).fit(col_data).kneighbors(col_data)
    avg_distances = neighbour_dists.mean(axis=1)
    k20_outliers = avg_distances > np.percentile(avg_distances, 95)
    k20_pct = (k20_outliers.sum() / n_points) * 100

    print(f"\n{col}:")
    print(f"k=20: {k20_pct:.2f}%")
    print(f"k=30: {k30['percentage']:.2f}%")

    # All points in grey, then the outliers of each setting drawn on top
    plt.figure(figsize=(10, 4))
    plt.scatter(range(n_points), col_data, c='lightgray', s=10, label='Normal')
    overlays = (
        (k20_outliers, 'red', 0.4, 'k=20'),
        (k30['outliers'], 'blue', 0.2, 'k=30'),
    )
    for mask, colour, opacity, label in overlays:
        plt.scatter(np.where(mask)[0], col_data[mask],
                    c=colour, s=10, alpha=opacity, label=label)
    plt.title(f'Outlier Detection: {col}')
    plt.ylabel('Value')
    plt.xlabel('Index')
    plt.legend()
    plt.show()

**Summary:**

I compared two different hyperparameters for KNN outlier detection:
- Using 20 nearest neighbors (k=20)
- Using 30 nearest neighbors (k=30)

I calculated the percentage of outliers for each method and made plots showing where these outliers appear.


**Reasoning:**
- Tested different k values to find the optimal neighborhood size [5]
- Used k=20 as a lower bound to avoid false positives [5]
- Used k=30 as an upper bound to avoid false negatives [5]
- Visualized results to intuitively check outliers and compare method agreement [6]

**Results Analysis:**
- Both settings found similar percentages of outliers
- k=30 is the best performing final hyperparameter
  - k=20 cons:
    * Sometimes marked points in dense areas as outliers
    * Missed some obvious outliers in the tails
    * Was too sensitive to lower-end outliers
  - k=30 pros:
    * Clearer separation between normal and outlier points
    * More consistent in finding tail outliers

In [None]:
# Compare KNN outliers with stored robust z-score outliers
print("Algorithmic vs Statistical Method Comparison:\n")
for col, knn in knn_results.items():
    col_data = data[col]
    robust = robust_results[col]

    print(f"\n{col}:")
    print(f"KNN Outliers: {knn['percentage']:.2f}%")
    print(f"Robust Z-score Outliers: {robust['percentage']:.2f}%")

    # All points in grey, then each method's stored outlier mask on top
    plt.figure(figsize=(10, 4))
    plt.scatter(range(len(col_data)), col_data, c='lightgray', s=10, label='Normal')
    overlays = (
        (knn['outliers'], 'red', 0.4, 'KNN'),
        (robust['outliers'], 'blue', 0.2, 'Robust Z-score'),
    )
    for mask, colour, opacity, label in overlays:
        plt.scatter(np.where(mask)[0], col_data[mask],
                    c=colour, s=10, alpha=opacity, label=label)
    plt.title(f'Outlier Detection: {col}')
    plt.ylabel('Value')
    plt.xlabel('Index')
    plt.legend()
    plt.show()

**Summary:**

I compared the best statistical method (robust z-score) with the best algorithmic method (KNN with k=30)
- Calculated outlier percentages for both methods
- Made plots showing where these outliers appear

**Reasoning:**
* Compared best-performing statistical and algorithmic methods to validate outlier detection
- Visualized results to intuitively check outliers and compare method agreement [6]

**Results Analysis:**

- Normal features:
  * Robust z-score found fewer outliers (2.66%)
  * KNN consistently found about 5%
- Skewed features:
  * Robust z-score found more outliers (5.85-6.42%)
  * KNN consistently found about 5%
- Geographic features:
  * Robust z-score was more conservative (2.52%)
  * KNN consistently found about 5%
- Robust z-score is the most appropriate method for this dataset
  * It adapts better to each feature's distribution instead of forcing a fixed percentage like KNN
  * It's more conservative with normal and bimodal data, which makes sense because these distributions naturally have fewer true outliers
  * It's more aggressive with highly skewed features like room counts, where more extreme values are expected



After implementing both statistical and algorithmic outlier detection methods, I will determine the most appropriate treatment for the identified outliers.

Looking at the characteristics of outliers across different feature types, the outliers show clear patterns:
- Size Features
  * Mostly big properties like apartment complexes
  * When rooms are high, bedrooms and population are also high
- Value Features
  * Represent luxury homes
  * Match with high-income areas
- Geographic Features
  * Outliers identify properties in unique locations


Based on these findings I decided to keep all outliers in the dataset
- They represent contextual anomalies that are unusual but valid - like luxury homes or apartment complexes [5]
- The relationships between features make sense (e.g., more rooms = more bedrooms)
- Removing them would hide important parts of the housing market we're trying to analyze

### 3. Imputation [10]
- Identify which features should be imputed and which should be removed
  - Provide a written rationale for this decision
- Impute the missing records using KNN imputation
- Impute the missing records using MICE imputation
- Compare both imputed datasets feature distributions against each other and the non-imputed data
- Build a regressor on all three datasets
  - Use regression models to predict house median price
  - Compare regressors of non-imputed data against those of the imputed data
  - **Note**: If you're struggling to compare against the original dataset focus on comparing the two imputed datasets against each other


In [None]:
# Uncorrupted reference data, used as the baseline that the imputed
# datasets are compared against
ORIGINAL_HOUSES_URL = (
    'https://raw.githubusercontent.com/PaoloMissier/CSC3831-2021-22/main/'
    'IMPUTATION/TARGET-DATASETS/ORIGINAL/houses.csv'
)
houses = pd.read_csv(ORIGINAL_HOUSES_URL, header=0)

According to my missing values analysis:
- Three features each have 10% missing:
  * median_income
  * housing_median_age
  * population
- The values aren't missing randomly:
  * Missing median_income relates to house values
  * Missing housing_age relates to house size
  * Missing population relates to both size and value

Based on these findings, I decided to impute all three features
- Features should only be removed if more than 30% of values are missing - the 10% is well below this [4]
- The earlier analysis showed this is Missing at Random (MAR) data, which means other features can make good predictions [4]
- Removing any features would make the analysis less complete [4]



In [None]:
def perform_imputation(data, method):
    """Impute the missing values of ``data`` and return a new DataFrame.

    Args:
        data: DataFrame to impute; it is passed whole to the imputer's
            ``fit_transform``.
        method: ``'knn'`` to use ``KNNImputer(n_neighbors=20)`` or ``'mice'``
            to use ``IterativeImputer(random_state=42)``.

    Returns:
        A DataFrame holding the imputer's output, with the same column
        labels and the same index as ``data``.

    Raises:
        ValueError: If ``method`` is neither ``'knn'`` nor ``'mice'``.
    """
    if method == 'knn':
        imputer = KNNImputer(n_neighbors=20)
    elif method == 'mice':
        imputer = IterativeImputer(random_state=42)
    else:
        # Fail loudly: silently falling back to MICE would let a typo such as
        # 'KNN' produce results that are labelled with the wrong method.
        raise ValueError(
            f"Unknown imputation method {method!r}; expected 'knn' or 'mice'"
        )

    # fit_transform returns a bare array, so restore both the column labels
    # and the row index to keep the result aligned with the input frame.
    imputed_data = pd.DataFrame(
        imputer.fit_transform(data),
        columns=data.columns,
        index=data.index,
    )

    return imputed_data

# Impute the corrupted data once with each method
houses_knn = perform_imputation(houses_corrupted, 'knn')
houses_mice = perform_imputation(houses_corrupted, 'mice')


def _scaled_mad(values):
    """Return the median absolute deviation of ``values`` scaled by 1.483."""
    centre = values.median()
    return 1.483 * abs(values - centre).median()


# Report, feature by feature, how each imputed column compares with the original
print("\nImputation Results Comparison:")

for col in missing_cols:
    print(f"\n{col}:")

    # (label, column values, histogram colour), in reporting order
    variants = [
        ('Original', houses[col], 'blue'),
        ('KNN', houses_knn[col], 'red'),
        ('MICE', houses_mice[col], 'yellow'),
    ]

    # Centre and spread: median/MAD for skewed features, mean/std otherwise
    print("\nDistribution Metrics:")
    use_robust = col in skewed_cols
    if use_robust:
        print("Center and Spread (Median, MAD):")
    else:
        print("Center and Spread (Mean, Std):")
    for label, values, _ in variants:
        if use_robust:
            centre, spread = values.median(), _scaled_mad(values)
        else:
            centre, spread = values.mean(), values.std()
        print(f"{label}: {centre:.3f}, {spread:.3f}")

    print("\nSkewness:")
    for label, values, _ in variants:
        print(f"{label}: {values.skew():.3f}")

    # Overlay the three histograms on 30 shared bins spanning the pooled range
    plt.figure(figsize=(10, 4))
    pooled = np.concatenate([entry[1] for entry in variants])
    bin_edges = np.linspace(pooled.min(), pooled.max(), 31)
    for label, values, colour in variants:
        plt.hist(values, bins=bin_edges, alpha=0.5, label=label, color=colour)
    plt.title(f'{col} Distribution Comparison')
    plt.legend()
    plt.show()

**Summary:**

I tested two imputation methods for the missing values:
- KNN imputation using 20 nearest neighbors [9]
- MICE (Multiple Imputation by Chained Equations) [9], [10]

Compared the center, spread, and skew before and after imputation for each method


**Reasoning:**
* Compared results to evaluate how well each method preserved core data characteristics [1]
* Used a non-default number of neighbors for KNN (20 rather than scikit-learn's default of 5) because it achieved better performance

**Results Analysis:**

- For median_income (Right-skewed distribution):
  * Center: MICE (3.504) deviates less from the original (3.535) than KNN (3.486)
  * Spread: MICE (1.499) deviates less from the original (1.578) than KNN (1.429)
  * Skewness: MICE (1.692) deviates less from the original (1.647) than KNN (1.733).
  * Visualization: KNN slightly overestimates the peak frequency
- For housing_median_age (normal distribution):
  * Center: KNN (28.636) deviates less from the original (28.639) than MICE (28.481).
  * Spread: KNN (12.036) deviates less from the original (12.586) than MICE (12.025).
  * Skewness: MICE (0.052) deviates less from the original (0.060) than KNN (0.019).
  * Visualization: KNN and MICE overly concentrate imputations at the center
- For population (Highly right-skewed distribution):
  * Center: MICE (1152.000) deviates less from the original (1166.000) than KNN (1149.000).
  * Spread: MICE (615.445) deviates less from the original (652.520) than KNN (585.785).
  * Skewness: MICE (4.971) deviates less from the original (4.936) than KNN (5.026).
  * Visualization: Both methods almost exactly mirror the distribution

Based on these findings, I recommend using MICE imputation
- The best method more closely preserves data characteristics [4]
- MICE was consistently better at maintaining the skewness of the distributions, and better at maintaining the center for two of the three features
- While KNN was better on the center and spread of housing_median_age, MICE performed better on average

In [None]:
# Predict the target from every other column, identically for each dataset
target = 'median_house_value'
features = [col for col in houses.columns if col != target]


def _fit_and_score(frame):
    """Fit a degree-3 polynomial regression on an 80/20 split of ``frame``.

    Returns ``(y_test, y_pred, r2, rmse)`` measured on the held-out 20%.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        frame[features].values,
        frame[target].values,
        test_size=0.2,
        random_state=42,
    )

    # Learn the polynomial expansion on the training split only, then reuse it
    expander = PolynomialFeatures(degree=3, include_bias=False)
    regressor = LinearRegression()
    regressor.fit(expander.fit_transform(X_train), y_train)

    y_pred = regressor.predict(expander.transform(X_test))
    return (
        y_test,
        y_pred,
        r2_score(y_test, y_pred),
        root_mean_squared_error(y_test, y_pred),
    )


# Datasets whose regression performance is compared
datasets = {
    'Original': houses,
    'KNN Imputed': houses_knn,
    'MICE Imputed': houses_mice
}

print("Regression Results:\n")
for name, frame in datasets.items():
    actual, predicted, r2, rmse = _fit_and_score(frame)

    print(f"\n{name} Dataset:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")

    # Predicted vs actual, with the y = x line as the perfect-prediction reference
    plt.figure(figsize=(10, 4))
    plt.scatter(actual, predicted, alpha=0.5)
    value_range = [actual.min(), actual.max()]
    plt.plot(value_range, value_range, 'r--', lw=2)
    plt.xlabel('Actual House Value')
    plt.ylabel('Predicted House Value')
    plt.title(f'{name} Dataset: Predicted vs Actual Values')
    plt.show()

**Summary:**
- Built regression models to predict house values
- Used the same features for each dataset (original, KNN, and MICE)
- Created polynomial features to catch non-linear relationships
- Compared model performance using R² and RMSE


**Reasoning:**
* Used regression analysis as a test of imputation quality
* Added polynomial features to consider non-linear patterns in the distribution [6], [12]
* Compared both R² and RMSE to evaluate model performance across different value ranges [3]

**Results Analysis**
- Original data:
  * R² = 0.6565, RMSE = 67096.22
  * Sets baseline performance
- KNN imputed data:
  * R² = 0.6390, RMSE = 68779.92
  * Best imputation performance overall
- MICE imputed data:
  * R² = 0.6374, RMSE = 68930.63
  * Slight decline from the original
- All models:
  * Predicted well for average houses
  * Had trouble with very expensive houses

Based on these findings I continue to recommend using MICE imputation
- While the KNN-imputed data gave slightly better regression results, the difference was negligible and MICE also preserved distributions better
- The similar prediction plots show both methods kept the important relationships between features


### 4. Conclusions & Thoughts [3]
- Discuss methods used for anomaly detection, pros/cons of each method
- Discuss challenges/difficulties in anomaly detection implementation
- Discuss methods used for imputation, pros/cons of each method
- Discuss challenges/difficulties in imputation implementation

My analysis of the housing dataset revealed important discoveries about outlier detection and data imputation. I found statistical methods offered significant advantages in outlier detection despite their simplicity. The robust z-score method was particularly effective. Its use of median and MAD made it especially appropriate for my skewed housing data [1]. However, it did show some weakness with my bimodal geographic features.

While algorithmic methods like LOF, SVM, and Isolation Forest could theoretically capture more complex patterns, they presented several practical challenges. LOF struggled with high-density regions, SVM showed problematic strip patterns, and Isolation Forest was generally oversensitive to outliers. These methods also required careful parameter tuning and significant computational resources, making them less practical for my particular dataset [4].

For imputation, both KNN and MICE showed distinct strengths and weaknesses. KNN imputation proved more intuitive to implement and gave slightly better regression results. However, it was less consistent at maintaining the center and skew of my features. MICE was more complex to implement and computationally intensive. However, it better preserved the center and skew of my distributions overall, and its regression results were almost as good as KNN's. Its main drawback was that it did not preserve spread as well as KNN for every feature.

Several challenges emerged during implementation. For outlier detection, I struggled with handling different distribution types among my features. Imputation challenged me with feature selection and parameter tuning, particularly in balancing computational efficiency with accuracy. The visualization and comparison of results across multiple methods also proved difficult.

#### Citations

[1] I. Dixon. (2024). Lecture 1b: Exploratory Data Analysis [PowerPoint slides]. Available: https://ncl.instructure.com/courses/55046/pages/lecture?module_item_id=3467277

[2] I. Dixon. (2024). Practical I [Google Colab notebook]. Available: https://github.com/iaindixon/CSC3831_Part_I/blob/main/CSC3831_Partical_I.ipynb

[3] P. Missier. (2021). Beer Imputation Example [Google Colab notebook]. Available: https://github.com/PaoloMissier/CSC3831-2021-22/blob/main/IMPUTATION/regression-example/Beer-imputation-example.ipynb

[4] I. Dixon. (2024). Lecture 2b: Data Imputation [PowerPoint slides]. Available: https://ncl.instructure.com/courses/55046/pages/lecture-2?module_item_id=3503618

[5] I. Dixon. (2024). Lecture 3a: Tabular Anomaly Detection [PowerPoint slides]. Available: https://ncl.instructure.com/courses/55046/pages/lecture-3?module_item_id=3511620

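[6] GeeksforGeeks. "Detect and Remove the Outliers using Python." geeksforgeeks.org. [Online.] Available: https://www.geeksforgeeks.org/detect-and-remove-the-outliers-using-python/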

[7] Scikit Learn. "SVM." scikit-learn.org. Accessed: Dec. 3, 2024. [Online.] Available: https://scikit-learn.org/stable/modules/svm.html


[8] Scikit Learn. "Neighbors." scikit-learn.org. Accessed: Dec. 3, 2024. [Online.] Available: https://scikit-learn.org/stable/modules/neighbors.html

[9] Scikit Learn. "Impute." scikit-learn.org. Accessed: Dec. 4, 2024. [Online.] Available: https://scikit-learn.org/stable/api/sklearn.impute.html

[10] Azur, M.J., Stuart, E.A., Frangakis, C. and Leaf, P.J., "Multiple imputation by chained equations: what is it and how does it work?." Int. J. Methods Psychiatr. Res., 20: 40-49. 2011. [Online.] Available: https://doi.org/10.1002/mpr.329

[11] V. Gonzalez-Zelaya. (2024). Introduction to Machine Learning / Regression Problems [PowerPoint slides]. Available: https://ncl.instructure.com/courses/55046/files/8847813?module_item_id=3517175

[12] Scikit Learn. "Preprocessing." scikit-learn.org. Accessed: Dec. 4, 2024. [Online.] Available: https://scikit-learn.org/stable/modules/preprocessing.html