In [75]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df_hou = pd.read_csv('Housing.csv')

def get_numeric(df):
    return df.select_dtypes(include=[np.number]).columns.tolist()

def fill_missing(df, strat):
    df0 = df.copy()
    for c in get_numeric(df0):
        if strat == 'zero':
            df0[c].fillna(0, inplace=True)
        elif strat == 'mean':
            df0[c].fillna(df0[c].mean(), inplace=True)
        elif strat == 'median':
            df0[c].fillna(df0[c].median(), inplace=True)
    return df0

def minmax_scale(arr):
    return (arr - arr.min()) / (arr.max() - arr.min())

def standard_scale(arr):
    return (arr - arr.mean()) / arr.std()

def scale_df(df, method):
    df0 = df.copy()
    for c in get_numeric(df0):
        if method == 'minmax':
            df0[c] = minmax_scale(df0[c])
        elif method == 'standard':
            df0[c] = standard_scale(df0[c])
    return df0

def remove_outliers(df):
    df0 = df.copy()
    for c in get_numeric(df0):
        Q1, Q3 = df0[c].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        mask = ~((df0[c] < Q1 - 1.5 * IQR) | (df0[c] > Q3 + 1.5 * IQR))
        df0 = df0.loc[mask]
    return df0


def evaluate(df, target_col):
    features = [c for c in get_numeric(df) if c != target_col]
    variants = {}
    for strat in ['zero', 'mean', 'median']:
        filled = fill_missing(df, strat)
        for scale in ['minmax', 'standard']:
            name = f"{strat}_{scale}"
            scaled = scale_df(filled, scale)
            variants[name] = scaled
            variants[name + '_no_out'] = remove_outliers(scaled)
    rows = []
    for name, dfv in variants.items():
        if dfv.shape[0] < 10:
            continue
        X = dfv[features].values
        y = dfv[target_col].values
        m = LinearRegression().fit(X, y)
        ypred = m.predict(X)
        rows.append((name, dfv.shape[0],
                     mean_squared_error(y, ypred),
                     r2_score(y, ypred)))

    return pd.DataFrame(rows, columns=['Variant','N','MSE','R2']) \
             .sort_values('R2', ascending=False) \
             .reset_index(drop=True)

target_col = 'price'   
housing_results = evaluate(df_hou, target_col)
print("Housing preprocessing results:")
print(housing_results.to_string(index=False))


Housing preprocessing results:
               Variant   N      MSE       R2
           zero_minmax 545 0.011477 0.561583
         zero_standard 545 0.437613 0.561583
           mean_minmax 545 0.011477 0.561583
         mean_standard 545 0.437613 0.561583
         median_minmax 545 0.011477 0.561583
       median_standard 545 0.437613 0.561583
    zero_minmax_no_out 365 0.007382 0.365161
  zero_standard_no_out 365 0.281477 0.365161
    mean_minmax_no_out 365 0.007382 0.365161
  mean_standard_no_out 365 0.281477 0.365161
  median_minmax_no_out 365 0.007382 0.365161
median_standard_no_out 365 0.281477 0.365161


In [76]:

df_hou = pd.read_csv('Housing.csv')

numeric_cols = df_hou.select_dtypes(include=[np.number]).columns.tolist()

outlier_counts = pd.DataFrame(columns=['Column', 'Outlier_Count'])

for col in numeric_cols:
    Q1 = df_hou[col].quantile(0.25)
    Q3 = df_hou[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_mask = (df_hou[col] < lower_bound) | (df_hou[col] > upper_bound)
    count = outlier_mask.sum()
    outlier_counts = outlier_counts.append({
        'Column': col,
        'Outlier_Count': count
    }, ignore_index=True)

outlier_counts.sort_values('Outlier_Count', ascending=False, inplace=True)

print("Outlier count by column:")
print(outlier_counts.to_string(index=False))


Outlier count by column:
   Column Outlier_Count
  stories            41
    price            15
     area            12
 bedrooms            12
  parking            12
bathrooms             1


  outlier_counts = outlier_counts.append({
  outlier_counts = outlier_counts.append({
  outlier_counts = outlier_counts.append({
  outlier_counts = outlier_counts.append({
  outlier_counts = outlier_counts.append({
  outlier_counts = outlier_counts.append({
