In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/Bengaluru_House_Data (1).csv')

# Display the first 5 rows
print("First 5 rows of the DataFrame:")
print(df.head())

First 5 rows of the DataFrame:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0   Coomee       1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [2]:
missing_society_count = df['society'].isnull().sum()
total_rows = len(df)
missing_society_percentage = (missing_society_count / total_rows) * 100

print(f"Number of missing values in 'society' column: {missing_society_count}")
print(f"Percentage of missing values in 'society' column: {missing_society_percentage:.2f}%")

Number of missing values in 'society' column: 5502
Percentage of missing values in 'society' column: 41.31%


In [3]:
df['society'].fillna('No Society', inplace=True)

print("Missing values in 'society' column after imputation:")
print(df['society'].isnull().sum())

Missing values in 'society' column after imputation:
0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['society'].fillna('No Society', inplace=True)


In [4]:
df['society'] = df['society'].fillna('No Society')

print("Missing values in 'society' column after imputation:")
print(df['society'].isnull().sum())

Missing values in 'society' column after imputation:
0


In [5]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]

print(f"First Quartile (Q1): {Q1}")
print(f"Third Quartile (Q3): {Q3}")
print(f"Interquartile Range (IQR): {IQR}")
print(f"Lower Bound for Outliers: {lower_bound}")
print(f"Upper Bound for Outliers: {upper_bound}")
print(f"Total number of outliers found in 'price' column: {len(outliers)}")

First Quartile (Q1): 50.0
Third Quartile (Q3): 120.0
Interquartile Range (IQR): 70.0
Lower Bound for Outliers: -55.0
Upper Bound for Outliers: 225.0
Total number of outliers found in 'price' column: 1276


In [6]:
print("\nDescriptive statistics for 'price' in the original dataset:")
print(df['price'].describe())

print("\nDescriptive statistics for 'price' in the outliers dataset:")
print(outliers['price'].describe())


Descriptive statistics for 'price' in the original dataset:
count    13320.000000
mean       112.565627
std        148.971674
min          8.000000
25%         50.000000
50%         72.000000
75%        120.000000
max       3600.000000
Name: price, dtype: float64

Descriptive statistics for 'price' in the outliers dataset:
count    1276.000000
mean      425.746865
std       324.368791
min       226.000000
25%       260.000000
50%       325.000000
75%       450.000000
max      3600.000000
Name: price, dtype: float64


In [7]:
df_cleaned = df[~((df['price'] < lower_bound) | (df['price'] > upper_bound))]

print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape after removing price outliers: {df_cleaned.shape}")

Original DataFrame shape: (13320, 9)
Cleaned DataFrame shape after removing price outliers: (12044, 9)
