In [1]:
import pandas as pd
import  numpy as np
df = pd.read_csv('Handling Outliers.csv')
# Focus on the two numeric columns
cols = ['SalePrice', 'LotArea']

# Function to compute IQR bounds
def get_iqr_bounds(series, factor=1.5):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    return lower, upper

# Detecting & printing out how many outliers for each column
for col in cols:
    lower, upper = get_iqr_bounds(df[col])
    print(f"For {col}: Q1, Q3, IQR:")
    print(f"  Lower bound = {lower:.2f}")
    print(f"  Upper bound = {upper:.2f}")
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    print(f"  Number of outliers = {outliers.shape[0]}")
    # Optionally see some of them
    print(outliers[[col]].describe())
    print('---')

# Handling outliers

# Option A: Remove outlier rows entirely
df_no_outliers = df.copy()
for col in cols:
    lower, upper = get_iqr_bounds(df_no_outliers[col])
    df_no_outliers = df_no_outliers[(df_no_outliers[col] >= lower) & (df_no_outliers[col] <= upper)]

print("Shape before:", df.shape, "after removing outliers:", df_no_outliers.shape)

# Option B: Cap (clip) the outliers instead of removing
df_capped = df.copy()
for col in cols:
    lower, upper = get_iqr_bounds(df_capped[col])
    df_capped[col] = df_capped[col].clip(lower=lower, upper=upper)

print("Capping done.  Stats after capping:")
print(df_capped[cols].describe())


For SalePrice: Q1, Q3, IQR:
  Lower bound = 3937.50
  Upper bound = 340037.50
  Number of outliers = 61
           SalePrice
count      61.000000
mean   425954.180328
std     89557.255611
min    341000.000000
25%    372500.000000
50%    394617.000000
75%    440000.000000
max    755000.000000
---
For LotArea: Q1, Q3, IQR:
  Lower bound = 1481.50
  Upper bound = 17673.50
  Number of outliers = 69
             LotArea
count      69.000000
mean    35601.014493
std     35475.698864
min      1300.000000
25%     20544.000000
50%     23595.000000
75%     35133.000000
max    215245.000000
---
Shape before: (1460, 81) after removing outliers: (1326, 81)
Capping done.  Stats after capping:
           SalePrice       LotArea
count    1460.000000   1460.000000
mean   177331.526370   9647.388014
std     67205.835915   3594.356399
min     34900.000000   1481.500000
25%    129975.000000   7553.500000
50%    163000.000000   9478.500000
75%    214000.000000  11601.500000
max    340037.500000  17673.5000

In [2]:
df.to_csv('output3.csv', index=False)
print("CSV file created: output3.csv")

CSV file created: output3.csv
