# 4. Data Cleaning and Preparation
Problem Statement: Given a dataset with missing values and inconsistencies, clean and
prepare the data for analysis.
Requirements:
Use Python or R to perform data cleaning.
Document the steps taken to handle missing values and inconsistencies.
Evaluation Criteria: Correctness of data cleaning steps, clarity of documentation.

# Step:1 Import libraries and dataset

In [19]:
import pandas as pd
df = pd.read_csv("R:\data set\Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Step2: Summary of the dataset

In [20]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB
None


In [21]:
print(df.describe())

               bath       balcony         price
count  13247.000000  12711.000000  13320.000000
mean       2.692610      1.584376    112.565627
std        1.341458      0.817263    148.971674
min        1.000000      0.000000      8.000000
25%        2.000000      1.000000     50.000000
50%        2.000000      2.000000     72.000000
75%        3.000000      2.000000    120.000000
max       40.000000      3.000000   3600.000000


In [22]:
print(df.shape)

(13320, 9)


# Step3: Handling missing values

In [23]:
#check missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

location       1
size          16
society     5502
bath          73
balcony      609
dtype: int64


In [24]:
# drop a column
df.drop(columns=['society'], inplace=True)
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [25]:
# Fill missing value with a placeholder
df['location'] = df['location'].fillna('Unknown')
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [26]:
# Convert to numeric, if not already
df['bath'] = pd.to_numeric(df['bath'], errors='coerce')  

# Fill missing values with median
df['bath'] = df['bath'].fillna(df['bath'].median())  

# Convert to integer type
df['bath'] = df['bath'].astype(int)

df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2,1.0,51.0


In [27]:
# Convert to numeric, if not already
df['balcony'] = pd.to_numeric(df['balcony'], errors='coerce')  

# Fill missing values with median
df['balcony'] = df['balcony'].fillna(df['balcony'].median())  

# Convert to integer type
df['balcony'] = df['balcony'].astype(int)

df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2,1,51.0


In [29]:
# Fill missing values with mode
df['size'] = df['size'].fillna(df['size'].mode())  
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2,1,51.0


In [30]:
missing_values = df.isnull().sum()
print(missing_values)

area_type        0
availability     0
location         0
size            16
total_sqft       0
bath             0
balcony          0
price            0
dtype: int64


# Step4: Handling Duplicate values

In [31]:
print(df.duplicated().sum())

569


In [12]:
# Remove duplicate rows
df = df.drop_duplicates()
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2,1,51.0
