In [1]:
import numpy as np
import pandas as pd 
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("C:\\Pandas\\bhp.csv")
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   location        13200 non-null  object 
 1   size            13200 non-null  object 
 2   total_sqft      13200 non-null  float64
 3   bath            13200 non-null  float64
 4   price           13200 non-null  float64
 5   bhk             13200 non-null  int64  
 6   price_per_sqft  13200 non-null  int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 722.0+ KB


In [4]:
# Remove outliers using percentile technique first. Use [0.001, 0.999] for lower and upper bound percentiles
percentile_1 = df['price_per_sqft'].quantile(0.001)
percentile_1

1366.184

In [5]:
percentile_99 = df['price_per_sqft'].quantile(0.999)
percentile_99

50959.36200000098

In [6]:
df['price_per_sqft'].quantile(0.001,interpolation = 'linear')

1366.184

In [7]:
df['price_per_sqft'].quantile(0.001,interpolation = 'higher')

1379

In [8]:
df_no_outlier = df[df.price_per_sqft < percentile_99]
df_no_outlier

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [9]:
df_no_outlier.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13186.0,13186.0,13186.0,13186.0,13186.0
mean,1555.784221,2.690429,111.595825,2.800167,6657.4661
std,1237.058719,1.337659,145.34008,1.292094,4143.188587
min,250.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4266.0
50%,1275.0,2.0,71.61,3.0,5432.5
75%,1671.5,3.0,120.0,3.0,7307.75
max,52272.0,40.0,3600.0,43.0,50349.0


In [10]:
filter_df = df[(df['price_per_sqft'] > percentile_1 ) & (df['price_per_sqft'] < percentile_99)]
filter_df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [11]:
# After removing outliers in step 1, you get a new dataframe.

In [12]:
mean_value = df['price_per_sqft'].mean()
mean_value

7920.336742424242

In [13]:
standard_value = df['price_per_sqft'].std()
standard_value

106727.16032810845

In [14]:
# On step(2) dataframe, use 4 standard deviation to remove outliers
lower_bound = mean_value - 4*standard_value
upper_bound = mean_value + 4*standard_value

In [15]:
lower_bound

-418988.3045700096

In [16]:
upper_bound

434828.97805485805

In [17]:
df_filtered = df[(df['price_per_sqft'] >= lower_bound) & (df['price_per_sqft'] <= upper_bound)]
df_filtered

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [None]:
# Plot histogram for new dataframe that is generated after step (3). Also plot bell curve on same histogram
sns.histplot(df_filtered['price_per_sqft'],binwidth=0.8,kde=True)
plt.show()

  with pd.option_context('mode.use_inf_as_na', True):


In [19]:
# On step(2) dataframe, use zscore of 4 to remove outliers. This is quite similar to step (3) and you will get exact same result