# About Dataset  
This dataset was scraped from a Bangladeshi real estate website. It is raw and quite messy, containing 20 columns and 811 rows.  
Some of the necessary tasks include EDA, cleaning and extracting data, and changing data types etc. 

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

## Load the data from csv to Data frame

In [2]:
original_data = pd.read_csv("real_estate_raw_data.csv")

In [3]:
# Copy the dataset before making any changes
df = original_data.copy()

## Exploratory Data Analysis (EDA)

In [None]:
# Show a random sample of rows from the data
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

df.sample(3)

In [None]:
df.info()

In [None]:
df.describe()

### Check Duplicate Values

In [None]:
# To check for duplicate data
df.duplicated() == 1

In [None]:
# Check the url column for duplicate values
df[df['url'].duplicated() == 1]

### Check Null Values

In [None]:
# Check null values in any column
df.isnull().any()

In [None]:
# counts the number of non-Null entries in each column for rows where the 'property size' column is null.
df[df['property size'].isna()].count()

In [None]:
# Count the number of null values in each column
null_counts = df.isna().sum()

print(null_counts)

### Delete where major columns values are null

In [12]:
# Delete a row only when price, property size, parking and year built are all null. If any one of these columns is not null, keep the row.
df.dropna(subset=['price', 'property size', 'parking', 'year built'], how='all', inplace=True)

## Extracting & Cleaning

### 1. Property Type

In [None]:
# Get the unique values of a column
df['property type'].unique()

In [14]:
# Collapse the raw property type into one of two categories
def definite_property_type(type):
    """Map a raw property-type value to a normalised category.

    Missing values yield NaN. Text containing 'Residential' yields
    "Residential Apartment"; any other text yields "Commercial Space".
    """
    # NOTE: the parameter name shadows the builtin `type`; it is kept so that
    # existing callers are unaffected.
    if pd.isna(type):
        return np.nan
    return "Residential Apartment" if 'Residential' in type else "Commercial Space"

# Apply function to the 'property type' column
df['property type'] = df['property type'].apply(definite_property_type)


In [None]:
# Display unique cleaned property types
df['property type'].unique()
df.sample(2)

### 2. Property Size

In [None]:
df['property size'].unique()

In [17]:
# Function to extract Property Size
def clean_size(size):
    """Extract the first whole number from a raw property-size value.

    A number written with thousands separators is read as one value
    ("1,250 Sq Ft" -> 1250); a bare ``\\d+`` would stop at the comma and
    return 1. Any fractional part is ignored.

    Args:
        size: Raw cell value, e.g. "1250 Sq Ft", a number, or a missing value.

    Returns:
        The first number found, as an int, or NaN when the value is missing
        or contains no digits.
    """
    if pd.isna(size):  # To handle null values
        return np.nan
    # str() lets an already-numeric cell follow the same path as text instead
    # of raising TypeError inside re.search.
    # First alternative: properly grouped thousands ("1,250", "12,345,678").
    # Second alternative: a plain digit run, so "1200,1500" still yields 1200.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', str(size))
    if match:
        return int(match.group(0).replace(',', ''))
    return np.nan       # NaN values are of type float

# Apply function to the 'property size' column
df['property size'] = df['property size'].apply(clean_size)

In [None]:
# Rename the column
df = df.rename(columns = {"property size": "property_size(Sq Ft)"})
df['property_size(Sq Ft)']

### 3. Parking

In [None]:
df['parking'].unique()

In [20]:
# Function to extract the number of parking spaces
def num_of_parking(parking):
    """Strip the parking wording from a raw value, leaving the count text.

    Non-string values (e.g. NaN) are returned unchanged. The result for
    string input is still a string.
    """
    if not isinstance(parking, str):
        return parking

    # Order matters: the longer "(Per floor)" phrase must be removed before
    # its shorter prefix, otherwise " (Per floor)" would be left behind.
    for fragment in (' car parking (Per floor)', ' car parking', ' Car Parking', '\xa0'):
        parking = parking.replace(fragment, '')
    return parking.strip()
        
# Apply function to the 'parking' column
df['parking'] = df['parking'].apply(num_of_parking)

In [None]:
df['parking']

###  4. Lift

In [None]:
df['lift'].unique()

In [23]:
# Pull the lift count out of the raw 'lift' text
def lift_number(lift_num):
    """Return the first run of digits in ``lift_num`` as an int.

    Missing values, and text without any digit, produce NaN.
    """
    found = None if pd.isna(lift_num) else re.search(r'(\d+)', lift_num)
    return int(found.group(1)) if found else np.nan

# Apply function to the 'lift' column
df['lift'] = df['lift'].apply(lift_number)

In [None]:
df['lift'].unique()
df['lift']

Function to clean lift
def lift_num(lift):
    """Keep only the text that precedes an " Available"/" available" marker.

    Values that are not strings, or that contain neither marker, pass through
    untouched.
    """
    if not isinstance(lift, str):
        return lift
    for marker in (' Available', ' available'):
        if marker in lift:
            return lift.partition(marker)[0]
    return lift

Apply function to the 'lift' column
df['lift'] = df['lift'].apply(lift_num)


### 5. Price

In [None]:
df['price'].unique()

In [26]:
# Cleaning price: keep the text before any '(' and before any '/', then drop
# the currency marker, the thousands separators and surrounding whitespace.
df['cleaned_price'] = (
    df['price']
    .str.split('(').str[0]
    .str.split('/').str[0]
    # NOTE: strip('BDT ') removes any of the characters 'B', 'D', 'T' and
    # space from both ends; it is not a literal "BDT " prefix removal.
    .str.strip('BDT ')
    .str.replace(',','')
    .str.strip()
)

In [None]:
df['cleaned_price'].unique()

In [28]:
# Function to unify measure unit
def unify_price(price):
    """Convert a cleaned price value to a plain amount in BDT.

    The first number in the text is read (thousands separators ignored) and
    scaled by the unit that accompanies it:

    * "Lac" / "Lakh" (any letter case)  -> x 100,000
    * "Cr" / "Crore" (any letter case)  -> x 10,000,000
    * no unit                           -> taken as is

    Differences from the earlier implementation, all of them fixes:

    * a single-digit lakh figure such as "5 Lac" no longer raises
      AttributeError (the old pattern required at least two digits and the
      match was not checked);
    * a unit-less price is returned as a float instead of a string, so every
      result has the same type;
    * a unit-less price keeps its decimal point ("8500000.00" is 8500000.0,
      not 850000000).

    Args:
        price: Price text such as "75 Lac", "1.25 Cr" or "8500000", a number,
            or a missing value.

    Returns:
        The amount as a float, or NaN when the value is missing or contains
        no number.
    """
    if pd.isna(price):
        return np.nan

    text = str(price).replace(',', '')
    number = re.search(r'\d+(?:\.\d+)?', text)
    if number is None:
        return np.nan
    amount = float(number.group(0))

    # The unit must start a word (or follow a digit) so that letters inside
    # an unrelated word are not mistaken for a unit.
    unit_text = text.lower()
    if re.search(r'(?<![a-z])(?:lac|lakh)', unit_text):
        return amount * 1e5           # 1 lakh = 100,000 = 10^5
    if re.search(r'(?<![a-z])cr', unit_text):
        return amount * 1e7           # 1 crore = 10 million = 10^7
    return amount
    
# Apply function to price column
df['price(BDT)'] = df['cleaned_price'].apply(unify_price)


In [None]:
# Set display format to show two decimal places
#pd.set_option('display.float_format', '{:.2f}'.format)

df[['price','price(BDT)']]
#df['price(BDT)']

### 6. Service Charge

In [None]:
df['service_charge'].unique()

In [31]:
# Pull the service-charge amount out of the raw text
def extract_value(text):
    """Return the digits of a "BDT <amount>/-" figure found in ``text``.

    Commas are removed from the amount, which is returned as a string.
    Non-string input, or text without such a figure, gives None.
    """
    if not isinstance(text, str):
        return None
    found = re.search(r'BDT\s*([\d,]+)/-', text)
    return found.group(1).replace(',', '') if found else None

# Apply function to 'service_charge' column
df['service_charge(BDT)'] = df['service_charge'].apply(extract_value)

In [None]:
pd.set_option('display.max_rows', None)
df['service_charge(BDT)']

### 7. Garage Size

In [None]:
df['garage size'].unique()

In [34]:
# Function to extract garage size
def extract_garage(garage):
    """Return the first run of digits in ``garage`` as a string.

    NaN is returned for missing values and for text with no digits.
    """
    if not pd.isna(garage):
        hit = re.search(r'(\d+)', garage)
        if hit is not None:
            return hit.group(1)
    return np.nan

# Apply function to garage column
df['garage(Sq Ft)'] = df['garage size'].apply(extract_garage)


In [None]:
# View the extracted values as text without overwriting the column: assigning
# the astype(str) result back would replace missing values with the literal
# string 'nan', which would then be written to the output CSV.
df['garage(Sq Ft)'].astype(str).unique()

### 8. Bedrooms

In [None]:
df['bedrooms'].unique()

In [37]:
# Get the bedroom count from the raw 'bedrooms' text
def extract_num_of_bedroom(bedroom):
    """Return the first digit sequence in ``bedroom`` as a string.

    Missing values and text without digits give NaN.
    """
    digits = None if pd.isna(bedroom) else re.search(r'(\d+)', bedroom)
    if digits is None:
        return np.nan
    return digits.group(1)

# Apply function to extract number of bedrooms
df['bedrooms'] = df['bedrooms'].apply(extract_num_of_bedroom)


In [None]:
df['bedrooms'].unique()

### 9. Bathroom

In [None]:
df['bathrooms'].unique()

In [40]:
# Get the bathroom count from the raw 'bathrooms' text
def extract_num_of_bath(bath):
    """Return the first digit sequence in ``bath`` as a string.

    Missing values and text without digits give NaN.
    """
    if pd.isna(bath):
        result = np.nan
    else:
        found = re.search(r'(\d+)', bath)
        result = found.group(1) if found else np.nan
    return result

# Apply fx
df['bathrooms'] = df['bathrooms'].apply(extract_num_of_bath)

In [None]:
df['bathrooms']

### 10.Front Road Size

In [None]:
df['front road size'].unique()

In [43]:
# Read the leading number from the raw 'front road size' text
def extract_front_road(road_size):
    """Return the first digit sequence in ``road_size`` as a string.

    Missing values and text without digits give NaN.
    """
    found = None if pd.isna(road_size) else re.search(r'(\d+)', road_size)
    return np.nan if found is None else found.group(1)

# Apply function to front road size column
df['front_road(Sq Ft)'] = df['front road size'].apply(extract_front_road)

In [None]:
df['front_road(Sq Ft)']

### 11. Common Area	

In [None]:
df['common area'].unique()

In [46]:
# Read the leading number from the raw 'common area' text
def extract_common_area(common_area):
    """Return the first digit sequence in ``common_area`` as a string.

    Missing values and text without digits give NaN.
    """
    if not pd.isna(common_area):
        found = re.search(r'(\d+)', common_area)
        if found:
            return found.group(1)
    return np.nan

# Apply function
df['common_area(Sq Ft)'] = df['common area'].apply(extract_common_area) 

In [None]:
df['common_area(Sq Ft)']

### 12. City Area

In [None]:
df['location']

In [None]:
# Extract city area from location column 
df['city_area'] = df['location'].str.split(',').str[0]
df['city_area']

### 13. District

In [None]:
# Function to create district from location
def create_district(district):
    """Derive the district name from a raw location value.

    Args:
        district: Raw location text (despite the parameter name, this is the
            whole location string), or a missing value.

    Returns:
        'Dhaka' when the text mentions Dhaka in any letter case; None
        otherwise, including for missing or non-string values.
    """
    # A missing cell is NaN (a float) or None; the `in` test would raise
    # TypeError on it, so every non-string is treated as "no district".
    if not isinstance(district, str):
        return None
    return 'Dhaka' if 'dhaka' in district.lower() else None
    
# Apply fx to create district
df['district'] = df['location'].apply(create_district)

df['district']


### 14. Zip Code

In [51]:
# Function to extract zip code
def extract_zip_code(zip):
    """Extract the postal code from a raw location string.

    Only a stand-alone four-digit number is accepted, because Bangladeshi
    postal codes are four digits long. Taking the first digit run instead
    would return a road, house or sector number for an address such as
    "Road 12, Dhanmondi, Dhaka 1209".

    Args:
        zip: Raw location text, or a missing value.

    Returns:
        The postal code as a string, or None when the value is not a string
        or contains no four-digit number.
    """
    # NOTE: the parameter name shadows the builtin `zip`; it is kept so that
    # existing callers are unaffected.
    if not isinstance(zip, str):
        return None
    codes = re.findall(r'(?<!\d)\d{4}(?!\d)', zip)
    # Presumably the postal code closes the address, so the last candidate
    # wins - TODO confirm against the scraped data.
    return codes[-1] if codes else None

# Apply function to extract zip code from location column
df['zip_code'] = df['location'].apply(extract_zip_code)

In [None]:
df['zip_code']

### 15. Interior

In [None]:
df['interior'].unique()

In [54]:
# Function to clean interior
def clean_interior(interior):
    """Normalise a raw interior description to one of three labels.

    Fixes over the earlier implementation: the fully-furnished label is
    spelled correctly ('Fully-Furnished', previously 'Fully-Furnised'),
    blank text is treated as missing instead of fully furnished, and the
    "un"/"non" prefixes are recognised in any letter case, as "semi" already
    was.

    Args:
        interior: Raw interior text, or a missing value.

    Returns:
        'Un-Furnished', 'Semi-Furnished' or 'Fully-Furnished', or None when
        the value is not a string or is blank.
    """
    if not isinstance(interior, str) or not interior.strip():
        return None

    label = interior.lower()
    # "un"/"non" must start a word so that letters inside another word are
    # not read as a negation.
    if re.search(r'\b(?:un|non)', label):
        return 'Un-Furnished'
    if 'semi' in label:
        return 'Semi-Furnished'
    return 'Fully-Furnished'
    
# Apply fx to interior column
df['interior_style'] = df['interior'].apply(clean_interior)

In [None]:
# Check after cleaning
df[['interior', 'interior_style']]
df['interior_style'].unique()

## Output After Cleaning

In [None]:
df.shape

In [None]:
df.sample(3)

In [58]:
# Delete the raw and intermediate columns that have been replaced by cleaned versions: 'price', 'cleaned_price', 'garage size', 'front road size', 'common area', 'location', 'interior', 'service_charge'
df.drop(['price','cleaned_price', 'garage size', 'front road size','common area', 'location', 'interior', 'service_charge'], axis = 1, inplace= True)

In [None]:
# Data Frame rows number x columns number
df.shape

In [None]:
# Dataframe columns
df.columns

In [None]:
df.info()

In [62]:
# Rearrange columns of DataFrame: selecting with a list returns exactly the
# listed columns, in this order.
new_order = [
    # listing identity
    'url',
    'title',
    # core attributes
    'property type',
    'property_size(Sq Ft)',
    'price(BDT)',
    'service_charge(BDT)',
    # layout and amenities
    'bedrooms',
    'bathrooms',
    'parking',
    'lift',
    'floor',
    'interior_style',
    # building details
    'year built',
    'building registration type',
    'preferred tennant',  # presumably mirrors the source column spelling - verify
    # extracted areas
    'garage(Sq Ft)',
    'front_road(Sq Ft)',
    'common_area(Sq Ft)',
    # address parts
    'city_area',
    'district',
    'zip_code',
    'country',
]

df_reordered = df[new_order]


In [None]:
df_reordered.head(3)

In [64]:
# Output is in csv file
df_reordered.to_csv('output_final.csv', index=False)