In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
print(os.getcwd())

D:\ACADEMIC\SEMESTER-5\SPL-2\SPL2-reantalapp\server\RentPrediction\Notebook


In [6]:
data_path = os.path.join('..', 'Data', 'houserentdhaka.csv')
print("Dataset path:", data_path)

Dataset path: ..\Data\houserentdhaka.csv


In [8]:
df=pd.read_csv(data_path)

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Location,Area,Bed,Bath,Price
0,0,"Block H, Bashundhara R-A, Dhaka","1,600 sqft",3,3,20 Thousand
1,1,"Farmgate, Tejgaon, Dhaka",900 sqft,2,2,20 Thousand
2,2,"Block B, Nobodoy Housing Society, Mohammadpur,...","1,250 sqft",3,3,18 Thousand
3,3,"Gulshan 1, Gulshan, Dhaka","2,200 sqft",3,4,75 Thousand
4,4,"Baridhara, Dhaka","2,200 sqft",3,3,75 Thousand


In [10]:
df.shape

(28800, 6)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  28800 non-null  int64 
 1   Location    28800 non-null  object
 2   Area        28800 non-null  object
 3   Bed         28800 non-null  int64 
 4   Bath        28800 non-null  int64 
 5   Price       28800 non-null  object
dtypes: int64(3), object(3)
memory usage: 1.3+ MB


In [12]:
df.columns

Index(['Unnamed: 0', 'Location', 'Area', 'Bed', 'Bath', 'Price'], dtype='object')

In [13]:
df.rename({'Price':'Rent'},axis=1, inplace=True)

In [14]:
df.drop(columns=df.columns[0], axis=1,inplace=True)

In [15]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,"Block H, Bashundhara R-A, Dhaka","1,600 sqft",3,3,20 Thousand
1,"Farmgate, Tejgaon, Dhaka",900 sqft,2,2,20 Thousand
2,"Block B, Nobodoy Housing Society, Mohammadpur,...","1,250 sqft",3,3,18 Thousand
3,"Gulshan 1, Gulshan, Dhaka","2,200 sqft",3,4,75 Thousand
4,"Baridhara, Dhaka","2,200 sqft",3,3,75 Thousand


In [16]:
df.dtypes

Location    object
Area        object
Bed          int64
Bath         int64
Rent        object
dtype: object

In [17]:
df['Location'].head()

0                      Block H, Bashundhara R-A, Dhaka
1                             Farmgate, Tejgaon, Dhaka
2    Block B, Nobodoy Housing Society, Mohammadpur,...
3                            Gulshan 1, Gulshan, Dhaka
4                                     Baridhara, Dhaka
Name: Location, dtype: object

In [18]:
df['Location']=df['Location'].str.split(',').str[:-1]

df['Location'] = df['Location'].str[-1]

df['Location'] = df['Location'].str.strip()

df['Location'].head()

0    Bashundhara R-A
1            Tejgaon
2        Mohammadpur
3            Gulshan
4          Baridhara
Name: Location, dtype: object

In [19]:
df['Area']=df['Area'].str.split(' ').str[:-1]

In [20]:
df['Area'].head()

0    [1,600]
1      [900]
2    [1,250]
3    [2,200]
4    [2,200]
Name: Area, dtype: object

In [21]:
df['Area'] = df['Area'].apply(lambda x: ''.join(map(str, x)))

In [22]:
df['Area'] = df['Area'].str.replace(',','')

In [23]:
df['Area'] = df['Area'].astype(int)

In [24]:
df['Area'].head()

0    1600
1     900
2    1250
3    2200
4    2200
Name: Area, dtype: int32

In [25]:
print(df['Area'].dtype)

int32


In [26]:
df['Area'] = df['Area'].astype("int64")

In [27]:
print(df['Area'].dtypes)

int64


In [28]:
df['Area'].head()

0    1600
1     900
2    1250
3    2200
4    2200
Name: Area, dtype: int64

In [29]:
df['Rent']=df['Rent'].replace({"Thousand":"*1e3", "Lakh":"*1e5"}, regex=True).map(pd.eval).astype("int64")

df['Rent'].head()

0    20000
1    20000
2    18000
3    75000
4    75000
Name: Rent, dtype: int64

In [30]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,Bashundhara R-A,1600,3,3,20000
1,Tejgaon,900,2,2,20000
2,Mohammadpur,1250,3,3,18000
3,Gulshan,2200,3,4,75000
4,Baridhara,2200,3,3,75000


In [31]:
df.dtypes

Location    object
Area         int64
Bed          int64
Bath         int64
Rent         int64
dtype: object

In [32]:
df.isna().sum()

Location    13
Area         0
Bed          0
Bath         0
Rent         0
dtype: int64

In [33]:
df.dropna(inplace=True)

In [34]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,Bashundhara R-A,1600,3,3,20000
1,Tejgaon,900,2,2,20000
2,Mohammadpur,1250,3,3,18000
3,Gulshan,2200,3,4,75000
4,Baridhara,2200,3,3,75000


In [35]:
df1=df.copy()
df1['Price_Per_Sqft']=df1['Rent']/df1['Area']
df1.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent,Price_Per_Sqft
0,Bashundhara R-A,1600,3,3,20000,12.5
1,Tejgaon,900,2,2,20000,22.222222
2,Mohammadpur,1250,3,3,18000,14.4
3,Gulshan,2200,3,4,75000,34.090909
4,Baridhara,2200,3,3,75000,34.090909


In [36]:
len(df1['Location'].unique())

68

In [38]:
print(df1['Location'].unique())

['Bashundhara R-A' 'Tejgaon' 'Mohammadpur' 'Gulshan' 'Baridhara'
 'Hazaribag' 'Mirpur' 'Nikunja' 'Uttara' 'Khilgaon' 'Ibrahimpur' 'Badda'
 'Adabor' 'Jatra Bari' 'Malibagh' 'Banani' 'Kakrail' 'Dhanmondi'
 'Maghbazar' 'Kalachandpur' 'Niketan' 'Eskaton' 'Banasree' 'Bashabo'
 'Baridhara DOHS' 'Aftab Nagar' 'Lalmatia' 'Dakshin Khan' 'Mohakhali DOHS'
 'Sutrapur' 'Hatirpool' 'Agargaon' 'Rampura' 'Cantonment' 'Shahbagh'
 'Khilkhet' 'Motijheel' 'Shantinagar' 'Shegunbagicha' 'Kathalbagan'
 'Shyamoli' 'Kalabagan' 'Demra' 'Kuril' 'Mohakhali' 'Lalbagh' 'New Market'
 'Kafrul' 'Kachukhet' 'Turag' 'Nadda' 'Shyampur' 'Maniknagar'
 'Banani DOHS' 'Shiddheswari' 'Bangshal' 'Paribagh' 'Joar Sahara'
 'Mugdapara' 'North Shahjahanpur' 'Kotwali' 'Shahjahanpur' 'Uttar Khan'
 'Taltola' 'Sadarghat' 'Banglamotors' 'Zafrabad' 'Keraniganj']


In [39]:
location_count = df1['Location'].value_counts(ascending=False)

In [40]:
location_count.head()

Location
Mirpur             8451
Mohammadpur        3612
Uttara             2070
Badda              1831
Bashundhara R-A    1397
Name: count, dtype: int64

In [41]:
len(location_count[location_count<=10])

9

In [42]:
location_count_less_than_ten=location_count[location_count<=10]

In [44]:
df1.Location = df1.Location.apply(lambda x: 'other' if x in location_count_less_than_ten else x)

In [45]:
len(df1['Location'].unique())

60

In [46]:
df1[df1.Area/df1.Bed<300].head(10) #Unrealistic ratio 

Unnamed: 0,Location,Area,Bed,Bath,Rent,Price_Per_Sqft
39,Jatra Bari,800,3,2,15000,18.75
86,Mirpur,745,3,2,15000,20.134228
135,Jatra Bari,800,3,2,15000,18.75
186,Maghbazar,550,2,2,13000,23.636364
191,Adabor,550,2,1,8500,15.454545
193,Khilgaon,550,2,1,11000,20.0
195,Mirpur,850,3,2,13000,15.294118
205,Jatra Bari,800,3,2,15000,18.75
252,Mohammadpur,750,3,2,16000,21.333333
260,Dakshin Khan,800,3,2,8500,10.625


In [47]:
df1.shape

(28787, 6)

In [49]:
df2=df1[~(df1.Area/df1.Bed<300)]
df2.shape

(27866, 6)