### a) Mean and Standard Deviation

##### Outlier Detection:

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
df = pd.read_csv("house_price.csv")
mean = df["price_per_sqft"].mean()
std_dev = df["price_per_sqft"].std()
lower_bound = mean - 3 * std_dev
upper_bound = mean + 3 * std_dev

outliers = df[(df["price_per_sqft"] < lower_bound) | (df["price_per_sqft"] > upper_bound)]
print(outliers)
df

             location       size  total_sqft  bath  price  bhk  price_per_sqft
345             other  3 Bedroom        11.0   3.0   74.0    3          672727
1106            other  5 Bedroom        24.0   2.0  150.0    5          625000
4044   Sarjapur  Road  4 Bedroom         1.0   4.0  120.0    4        12000000
4924            other      7 BHK         5.0   7.0  115.0    7         2300000
11447      Whitefield  4 Bedroom        60.0   4.0  218.0    4          363333


Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


##### Removal Methods:

In [62]:
# Trimming:
df_trimming = df[(df["price_per_sqft"] >= lower_bound) & (df["price_per_sqft"] <= upper_bound)]

print("Before trimming outliers: ", len(df))
print("After trimming outliers: ", len(df_trimming))
print("Total outliers found: ", len(df) - len(df_trimming))

Before trimming outliers:  13200
After trimming outliers:  13200
Total outliers found:  0


In [51]:
# Capping:
print("Dataset before capping:")
print("____________________________________________________\n")
print(df.describe())
df["price_per_sqft"] = np.clip(df["price_per_sqft"], lower_bound, upper_bound)
print("\nDataset after capping:")
print("____________________________________________________\n")
print(df.describe())

Dataset before capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.000000  13200.000000    13200.000000
mean    1555.302783      2.691136    112.276178      2.800833     6835.446522
std     1237.323445      1.338915    149.175995      1.292843     7930.115195
min        1.000000      1.000000      8.000000      1.000000      267.000000
25%     1100.000000      2.000000     50.000000      2.000000     4267.000000
50%     1275.000000      2.000000     71.850000      3.000000     5438.000000
75%     1672.000000      3.000000    120.000000      3.000000     7317.000000
max    52272.000000     40.000000   3600.000000     43.000000   328101.817727

Dataset after capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.000000  13200.000000   

### b) Percentile Method

##### Outlier Detection:

In [81]:
df = pd.read_csv("house_price.csv")
low_percentile = df["price_per_sqft"].quantile(0.01)
high_percentile = df["price_per_sqft"].quantile(0.99)

outliers = df[(df["price_per_sqft"] < low_percentile) | (df["price_per_sqft"] > high_percentile)]
print(outliers)

              location       size  total_sqft  bath   price  bhk  \
9                other  6 Bedroom      1020.0   6.0   370.0    6   
20             Kengeri      1 BHK       600.0   1.0    15.0    1   
45          HSR Layout  8 Bedroom       600.0   9.0   200.0    8   
130    Electronic City      2 BHK       880.0   1.0    16.5    2   
169           Attibele      1 BHK       450.0   1.0    11.0    1   
...                ...        ...         ...   ...     ...  ...   
13081            other  6 Bedroom      8000.0   6.0  2800.0    6   
13094            other  4 Bedroom      1200.0   5.0   325.0    4   
13127            other  4 Bedroom      1200.0   5.0   325.0    4   
13185         Hulimavu      1 BHK       500.0   1.0   220.0    1   
13186            other  4 Bedroom      1200.0   5.0   325.0    4   

       price_per_sqft  
9               36274  
20               2500  
45              33333  
130              1875  
169              2444  
...               ...  
13081          

##### Removal Methods:

In [66]:
# Trimming:
df_percentile_trimming = df[(df["price_per_sqft"] >= low_percentile) & (df["price_per_sqft"] <= high_percentile)]

print("Before trimming percentile outliers: ", len(df))
print("After trimming percentile outliers: ", len(df_percentile_trimming))
print("Total outliers found: ", len(df) - len(df_percentile_trimming))

Before trimming percentile outliers:  13200
After trimming percentile outliers:  12941
Total outliers found:  259


In [68]:
# Capping:
print("Dataset before percentile capping:")
print("____________________________________________________\n")
print(df.describe())
df["price_per_sqft"] = np.clip(df["price_per_sqft"], low_percentile, high_percentile)
print("\nDataset after percentile capping:")
print("____________________________________________________\n")
print(df.describe())

Dataset before percentile capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.000000  13200.000000    13200.000000
mean    1555.302783      2.691136    112.276178      2.800833     6835.446522
std     1237.323445      1.338915    149.175995      1.292843     7930.115195
min        1.000000      1.000000      8.000000      1.000000      267.000000
25%     1100.000000      2.000000     50.000000      2.000000     4267.000000
50%     1275.000000      2.000000     71.850000      3.000000     5438.000000
75%     1672.000000      3.000000    120.000000      3.000000     7317.000000
max    52272.000000     40.000000   3600.000000     43.000000   328101.817727

Dataset after percentile capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.0

### c) IQR (Interquartile Range) Method

##### Outlier Detection:

In [83]:
df = pd.read_csv("house_price.csv")
Q1 = df["price_per_sqft"].quantile(0.25)
Q3 = df["price_per_sqft"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df["price_per_sqft"] < lower_bound) | (df["price_per_sqft"] > upper_bound)]
print(outliers)

               location       size  total_sqft  bath  price  bhk  \
7          Rajaji Nagar      4 BHK      3300.0   4.0  600.0    4   
9                 other  6 Bedroom      1020.0   6.0  370.0    6   
22          Thanisandra  4 Bedroom      2800.0   5.0  380.0    4   
45           HSR Layout  8 Bedroom       600.0   9.0  200.0    8   
48             KR Puram  2 Bedroom       800.0   1.0  130.0    2   
...                 ...        ...         ...   ...    ...  ...   
13142             other      2 BHK      1140.0   1.0  185.0    2   
13157             other  7 Bedroom      1400.0   7.0  218.0    7   
13185          Hulimavu      1 BHK       500.0   1.0  220.0    1   
13186             other  4 Bedroom      1200.0   5.0  325.0    4   
13191  Ramamurthy Nagar  7 Bedroom      1500.0   9.0  250.0    7   

       price_per_sqft  
7               18181  
9               36274  
22              13571  
45              33333  
48              16250  
...               ...  
13142          

##### Removal Methods:

In [89]:
# Trimming:
df_iqr_trimming = df[(df["price_per_sqft"] >= lower_bound) & (df["price_per_sqft"] <= upper_bound)]

print("Before trimming IQR outliers: ", len(df))
print("After trimming IQR outliers: ", len(df_iqr_trimming))
print("Total outliers found: ", len(df) - len(df_iqr_trimming))

Before trimming IQR outliers:  13200
After trimming IQR outliers:  13200
Total outliers found:  0


In [91]:
# Capping:
print("Dataset before IQR capping:")
print("____________________________________________________\n")
print(df.describe())
df["price_per_sqft"] = np.clip(df["price_per_sqft"], lower_bound, upper_bound)
print("\nDataset after IQR capping:")
print("____________________________________________________\n")
print(df.describe())

Dataset before IQR capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.000000  13200.000000    13200.000000
mean    1555.302783      2.691136    112.276178      2.800833     6181.215985
std     1237.323445      1.338915    149.175995      1.292843     2631.064261
min        1.000000      1.000000      8.000000      1.000000      267.000000
25%     1100.000000      2.000000     50.000000      2.000000     4267.000000
50%     1275.000000      2.000000     71.850000      3.000000     5438.000000
75%     1672.000000      3.000000    120.000000      3.000000     7317.000000
max    52272.000000     40.000000   3600.000000     43.000000    11892.000000

Dataset after IQR capping:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft
count  13200.000000  13200.000000  13200.000000  13200.0

### d) Z-Score Method

##### Outlier Detection:

In [95]:
df = pd.read_csv("house_price.csv")
df["z_score"] = (df["price_per_sqft"] - df["price_per_sqft"].mean()) / df["price_per_sqft"].std()
outliers = df[(df["z_score"] < -3) | (df["z_score"] > 3)]
print(outliers)

             location       size  total_sqft  bath  price  bhk  \
345             other  3 Bedroom        11.0   3.0   74.0    3   
1106            other  5 Bedroom        24.0   2.0  150.0    5   
4044   Sarjapur  Road  4 Bedroom         1.0   4.0  120.0    4   
4924            other      7 BHK         5.0   7.0  115.0    7   
11447      Whitefield  4 Bedroom        60.0   4.0  218.0    4   

       price_per_sqft     z_score  
345            672727    6.229030  
1106           625000    5.781843  
4044         12000000  112.362023  
4924          2300000   21.476067  
11447          363333    3.330105  


##### Removal Methods:

In [98]:
# Trimming:
df_zscore_trimming = df[df["z_score"].abs() <= 3]

print("Before trimming IQR outliers: ", len(df))
print("After trimming IQR outliers: ", len(df_zscore_trimming))
print("Total outliers found: ", len(df) - len(df_zscore_trimming))

Before trimming IQR outliers:  13200
After trimming IQR outliers:  13195
Total outliers found:  5


In [104]:
# Imputation with mean:
print("Dataset before Z-Score Imputation Mean:")
print("____________________________________________________\n")
print(df.describe())
df.loc[df["z_score"].abs() > 3, "price_per_sqft"] = df["price_per_sqft"].mean()
print("\nDataset after Z-Score Imputation Mean:")
print("____________________________________________________\n")
print(df.describe())

Dataset before Z-Score Imputation Mean:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft  \
count  13200.000000  13200.000000  13200.000000  13200.000000    13200.000000   
mean    1555.302783      2.691136    112.276178      2.800833     6713.225379   
std     1237.323445      1.338915    149.175995      1.292843     4875.867134   
min        1.000000      1.000000      8.000000      1.000000      267.000000   
25%     1100.000000      2.000000     50.000000      2.000000     4267.000000   
50%     1275.000000      2.000000     71.850000      3.000000     5438.000000   
75%     1672.000000      3.000000    120.000000      3.000000     7313.000000   
max    52272.000000     40.000000   3600.000000     43.000000   200000.000000   

            z_score  
count  1.320000e+04  
mean  -1.076580e-18  
std    1.000000e+00  
min   -7.170936e-02  
25%   -3.423062e-02  
50%   -2.325872e-02  
75%   -5.653076e-03  


In [102]:
# Imputation with median:
print("Dataset before Z-Score Imputation Median:")
print("____________________________________________________\n")
print(df.describe())
df.loc[df["z_score"].abs() > 3, "price_per_sqft"] = df["price_per_sqft"].median()
print("\nDataset after Z-Score Imputation Median:")
print("____________________________________________________\n")
print(df.describe())

Dataset before Z-Score Imputation:
____________________________________________________

         total_sqft          bath         price           bhk  price_per_sqft  \
count  13200.000000  13200.000000  13200.000000  13200.000000    13200.000000   
mean    1555.302783      2.691136    112.276178      2.800833     6714.165658   
std     1237.323445      1.338915    149.175995      1.292843     4875.860475   
min        1.000000      1.000000      8.000000      1.000000      267.000000   
25%     1100.000000      2.000000     50.000000      2.000000     4267.000000   
50%     1275.000000      2.000000     71.850000      3.000000     5438.000000   
75%     1672.000000      3.000000    120.000000      3.000000     7317.000000   
max    52272.000000     40.000000   3600.000000     43.000000   200000.000000   

            z_score  
count  1.320000e+04  
mean  -1.076580e-18  
std    1.000000e+00  
min   -7.170936e-02  
25%   -3.423062e-02  
50%   -2.325872e-02  
75%   -5.653076e-03  
max  