##  Import Data and Required Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

: 

In [None]:
# Load the house-price listings; the path is relative to the notebook's working directory
df = pd.read_csv('data/house_price_bd.csv')

## Show Top 5 Records

In [None]:
# Preview the first five rows of the freshly loaded data
df.head()

## Shape of the dataset

In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

## Check Missing values

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature
null_mask = df.isna()
plt.figure(figsize=(8, 5))
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.title("Visualize Null Value")
plt.show()

## Drop null values

In [None]:
# Rows missing any of these columns are dropped rather than imputed
required_columns = ['Occupancy_status', 'Floor_area', 'Location']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Impute missing room counts with typical values
# (assumption: most homes have at least 3 bedrooms and 2 bathrooms).
# The filled column is assigned back to `df` instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated and leaves `df` unchanged under pandas copy-on-write.
df['Bedrooms'] = df['Bedrooms'].fillna(3.)
df['Bathrooms'] = df['Bathrooms'].fillna(2.)

In [None]:
# Inspect the distinct raw values of Floor_no before the numeric conversion below
df['Floor_no'].unique()

In [None]:
# How often each Floor_no value occurs
df['Floor_no'].value_counts()

In [None]:
# Parse Floor_no as numbers; any value that cannot be parsed is replaced with NaN
floor_numeric = pd.to_numeric(df['Floor_no'], errors='coerce')
df['Floor_no'] = floor_numeric

In [None]:
# Impute missing floors (including values coerced to NaN by the numeric
# conversion) with the most frequent floor. mode() returns a Series because
# ties are possible; [0] takes its first entry.
# The result is assigned back to `df` rather than filled in place on the
# selected column, which is deprecated and leaves `df` unchanged under pandas
# copy-on-write.
floor_mode = df['Floor_no'].mode()[0]
df['Floor_no'] = df['Floor_no'].fillna(floor_mode)

In [None]:
# Re-count the nulls per column after the row drops and imputations above
df.isna().sum()

In [None]:
# Shape after dropping the rows with missing Occupancy_status, Floor_area or Location
df.shape

In [None]:
# Preview the rows after the null handling above
df.head()

## Check duplicates

In [None]:
# Number of rows that are exact repeats of an earlier row (all columns compared)
df.duplicated().sum()

## Drop duplicates

In [None]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

In [None]:
# Shape after removing the duplicate rows
df.shape

## Check data type 

In [None]:
# Data type of each column before any conversion
df.dtypes

## Convert data types

In [None]:
# Cast 'Bedrooms', 'Bathrooms', 'Floor_no' and 'Floor_area' to 64-bit integers
for int_col in ('Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area'):
    df[int_col] = df[int_col].astype('int64')

# 'Price_in_taka' is text: strip the '৳' symbol and the commas,
# then parse what remains as a number
price_text = (
    df['Price_in_taka']
    .str.replace('৳', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Price_in_taka'] = pd.to_numeric(price_text)

In [None]:
# Confirm the column dtypes after the conversions above
df.dtypes

## Drop unnecessary column

In [None]:
# Remove the 'Title' column from the dataset
df = df.drop(columns=['Title'])

In [None]:
# Distinct values of Occupancy_status
df['Occupancy_status'].unique()

In [None]:
# How often each Occupancy_status value occurs
df['Occupancy_status'].value_counts()

Since the column is highly imbalanced (almost all values are vacant), it likely has little predictive value.

In [None]:
# Remove the 'Occupancy_status' column from the dataset
df = df.drop(labels='Occupancy_status', axis='columns')

In [None]:
# Preview the frame after dropping 'Title' and 'Occupancy_status'
df.head()

## Checking the number of unique values of each column

In [None]:
# Number of distinct values in each column
df.nunique()

## Check statistics of data set

In [None]:
# Summary statistics of the numeric columns before outlier removal
df.describe()

## Drop outliers

In [None]:
# One box plot per numeric column, used to spot outliers
numeric_cols = ['Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area', 'Price_in_taka']
for column_name in numeric_cols:
    ax = sns.boxplot(x=df[column_name])
    ax.set_title(f'Boxplot of {column_name}')
    plt.show()

In [None]:
# Remove outliers column by column with the 1.5 * IQR rule.
# The filter is sequential: `df` is reassigned inside the loop, so each
# column's quartiles are computed on the rows that survived the filters of the
# columns processed before it.
for col in ['Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area', 'Price_in_taka']:
    Q1 = df[col].quantile(0.25)  # 25th percentile
    Q3 = df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile range

    # Bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Keep rows inside the bounds (inclusive). .copy() makes the result an
    # independent frame, so later column assignments do not write to a slice
    # of the pre-filter data.
    in_range = (df[col] >= lower_bound) & (df[col] <= upper_bound)
    df = df[in_range].copy()


In [None]:
# Summary statistics of the numeric columns after the IQR filtering above
df.describe()

In [None]:
# Distribution (histogram + KDE) of each numeric feature.
# sns.histplot replaces the deprecated sns.distplot; stat='density' keeps the
# density-scaled y-axis that distplot drew when a KDE was overlaid.
for col in ['Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area', 'Price_in_taka']:
    sns.histplot(x=df[col], kde=True, stat='density')
    plt.title(f'Distribution of {col}')
    plt.show()


In [None]:
# Box plots of the columns after removing outliers.
# The frame is passed as `data=` explicitly: in seaborn releases before 0.12
# the first positional argument of boxplot is `x`, not `data`.
# NOTE(review): all columns share one axis, so large-valued columns such as
# Price_in_taka will likely dwarf the others; consider one plot per column.
sns.boxplot(data=df)
plt.title('Boxplots after removing outliers')
plt.show()

## Exploring Data

In [None]:
# Preview the cleaned frame before exploring it
df.head()

## Modify 'Location' column

In [None]:
# Keep only the text after the last comma of each Location, trimmed of surrounding whitespace
df['Location'] = df['Location'].str.split(',').str[-1].str.strip()

In [None]:
# Preview the frame with the shortened Location values
df.head()

In [None]:
# Partition the columns by dtype: object-dtype columns are treated as
# categorical, everything else as numerical
numeric_features = []
categorical_features = []
for feature in df.columns:
    if df[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

# Report both groups
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

## Histogram & KDE

In [None]:
# Left panel: overall price distribution. Right panel: the same distribution
# split by city. Each plot is drawn on the Axes returned by plt.subplots
# instead of re-selecting panels through the pyplot state machine.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
sns.histplot(data=df, x='Price_in_taka', bins=30, kde=True, color='g', ax=axes[0])
axes[0].set_title('Price distribution')
sns.histplot(data=df, x='Price_in_taka', hue='City', kde=True, ax=axes[1])
axes[1].set_title('Price distribution by city')
plt.show()

1. **Price Distribution by City**:
   - **Narayanganj-city (red)**: Houses are mostly concentrated at the lower price range (~2–5 million taka). This indicates a less expensive housing market.
   - **Dhaka (blue)**: A wider spread of prices, with more properties in the higher range (~6–14 million taka), suggesting a diverse and expensive housing market.
   - **Chattogram (orange)**: The price distribution shows a moderate range, with peaks slightly higher than Narayanganj but lower than Dhaka.
   - **Gazipur (green)** and **Cumilla (gray)**: These cities have limited distributions and align more closely with Narayanganj’s lower price trends.

2. **Affordability**:
   - Cities like Narayanganj and Gazipur might be more affordable compared to Dhaka and Chattogram.
   - Dhaka likely represents premium properties or highly sought-after locations, driving up prices.

3. **Overlap Between Cities**:
   - Some overlap exists between cities like Dhaka and Chattogram, but Narayanganj clearly stands out as the least expensive market.
