In [76]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso, Ridge
from xgboost import XGBRegressor
import os
import warnings
import pickle
import joblib
warnings.filterwarnings('ignore')

In [3]:
print(os.getcwd())

D:\ACADEMIC\SEMESTER-5\SPL-2\SPL2-reantalapp\server\RentPrediction\Notebook


In [4]:
data_path = os.path.join('..', 'Data', 'houserentdhaka.csv')
print("Dataset path:", data_path)

Dataset path: ..\Data\houserentdhaka.csv


In [5]:
df=pd.read_csv(data_path)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Location,Area,Bed,Bath,Price
0,0,"Block H, Bashundhara R-A, Dhaka","1,600 sqft",3,3,20 Thousand
1,1,"Farmgate, Tejgaon, Dhaka",900 sqft,2,2,20 Thousand
2,2,"Block B, Nobodoy Housing Society, Mohammadpur,...","1,250 sqft",3,3,18 Thousand
3,3,"Gulshan 1, Gulshan, Dhaka","2,200 sqft",3,4,75 Thousand
4,4,"Baridhara, Dhaka","2,200 sqft",3,3,75 Thousand


In [7]:
df.shape

(28800, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  28800 non-null  int64 
 1   Location    28800 non-null  object
 2   Area        28800 non-null  object
 3   Bed         28800 non-null  int64 
 4   Bath        28800 non-null  int64 
 5   Price       28800 non-null  object
dtypes: int64(3), object(3)
memory usage: 1.3+ MB


In [9]:
df.columns

Index(['Unnamed: 0', 'Location', 'Area', 'Bed', 'Bath', 'Price'], dtype='object')

In [10]:
df.rename({'Price':'Rent'},axis=1, inplace=True)

In [11]:
df.drop(columns=df.columns[0], axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,"Block H, Bashundhara R-A, Dhaka","1,600 sqft",3,3,20 Thousand
1,"Farmgate, Tejgaon, Dhaka",900 sqft,2,2,20 Thousand
2,"Block B, Nobodoy Housing Society, Mohammadpur,...","1,250 sqft",3,3,18 Thousand
3,"Gulshan 1, Gulshan, Dhaka","2,200 sqft",3,4,75 Thousand
4,"Baridhara, Dhaka","2,200 sqft",3,3,75 Thousand


In [13]:
df.dtypes

Location    object
Area        object
Bed          int64
Bath         int64
Rent        object
dtype: object

In [14]:
df['Location'].head()

0                      Block H, Bashundhara R-A, Dhaka
1                             Farmgate, Tejgaon, Dhaka
2    Block B, Nobodoy Housing Society, Mohammadpur,...
3                            Gulshan 1, Gulshan, Dhaka
4                                     Baridhara, Dhaka
Name: Location, dtype: object

In [15]:
df['Location']=df['Location'].str.split(',').str[:-1]

df['Location'] = df['Location'].str[-1]

df['Location'] = df['Location'].str.strip()

df['Location'].head()

0    Bashundhara R-A
1            Tejgaon
2        Mohammadpur
3            Gulshan
4          Baridhara
Name: Location, dtype: object

In [16]:
df['Area']=df['Area'].str.split(' ').str[:-1]

In [17]:
df['Area'].head()

0    [1,600]
1      [900]
2    [1,250]
3    [2,200]
4    [2,200]
Name: Area, dtype: object

In [18]:
df['Area'] = df['Area'].apply(lambda x: ''.join(map(str, x)))

In [19]:
df['Area'] = df['Area'].str.replace(',','')

In [20]:
df['Area'] = df['Area'].astype(int)

In [21]:
df['Area'].head()

0    1600
1     900
2    1250
3    2200
4    2200
Name: Area, dtype: int64

In [22]:
print(df['Area'].dtype)

int64


In [23]:
df['Area'] = df['Area'].astype("int64")

In [24]:
print(df['Area'].dtypes)

int64


In [25]:
df['Area'].head()

0    1600
1     900
2    1250
3    2200
4    2200
Name: Area, dtype: int64

In [26]:
df['Rent']=df['Rent'].replace({"Thousand":"*1e3", "Lakh":"*1e5"}, regex=True).map(pd.eval).astype("int64")

df['Rent'].head()

0    20000
1    20000
2    18000
3    75000
4    75000
Name: Rent, dtype: int64

In [27]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,Bashundhara R-A,1600,3,3,20000
1,Tejgaon,900,2,2,20000
2,Mohammadpur,1250,3,3,18000
3,Gulshan,2200,3,4,75000
4,Baridhara,2200,3,3,75000


In [28]:
df.dtypes

Location    object
Area         int64
Bed          int64
Bath         int64
Rent         int64
dtype: object

In [29]:
df.isna().sum()

Location    13
Area         0
Bed          0
Bath         0
Rent         0
dtype: int64

In [30]:
df.dropna(inplace=True)

In [31]:
df.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent
0,Bashundhara R-A,1600,3,3,20000
1,Tejgaon,900,2,2,20000
2,Mohammadpur,1250,3,3,18000
3,Gulshan,2200,3,4,75000
4,Baridhara,2200,3,3,75000


In [32]:
df1=df.copy()
df1['Price_Per_Sqft']=df1['Rent']/df1['Area']
df1.head()

Unnamed: 0,Location,Area,Bed,Bath,Rent,Price_Per_Sqft
0,Bashundhara R-A,1600,3,3,20000,12.5
1,Tejgaon,900,2,2,20000,22.222222
2,Mohammadpur,1250,3,3,18000,14.4
3,Gulshan,2200,3,4,75000,34.090909
4,Baridhara,2200,3,3,75000,34.090909


In [33]:
len(df1['Location'].unique())

68

In [34]:
print(df1['Location'].unique())

['Bashundhara R-A' 'Tejgaon' 'Mohammadpur' 'Gulshan' 'Baridhara'
 'Hazaribag' 'Mirpur' 'Nikunja' 'Uttara' 'Khilgaon' 'Ibrahimpur' 'Badda'
 'Adabor' 'Jatra Bari' 'Malibagh' 'Banani' 'Kakrail' 'Dhanmondi'
 'Maghbazar' 'Kalachandpur' 'Niketan' 'Eskaton' 'Banasree' 'Bashabo'
 'Baridhara DOHS' 'Aftab Nagar' 'Lalmatia' 'Dakshin Khan' 'Mohakhali DOHS'
 'Sutrapur' 'Hatirpool' 'Agargaon' 'Rampura' 'Cantonment' 'Shahbagh'
 'Khilkhet' 'Motijheel' 'Shantinagar' 'Shegunbagicha' 'Kathalbagan'
 'Shyamoli' 'Kalabagan' 'Demra' 'Kuril' 'Mohakhali' 'Lalbagh' 'New Market'
 'Kafrul' 'Kachukhet' 'Turag' 'Nadda' 'Shyampur' 'Maniknagar'
 'Banani DOHS' 'Shiddheswari' 'Bangshal' 'Paribagh' 'Joar Sahara'
 'Mugdapara' 'North Shahjahanpur' 'Kotwali' 'Shahjahanpur' 'Uttar Khan'
 'Taltola' 'Sadarghat' 'Banglamotors' 'Zafrabad' 'Keraniganj']


In [35]:
location_count = df1['Location'].value_counts(ascending=False)

In [36]:
location_count.head()

Location
Mirpur             8451
Mohammadpur        3612
Uttara             2070
Badda              1831
Bashundhara R-A    1397
Name: count, dtype: int64

In [37]:
len(location_count[location_count<=10])

9

In [38]:
location_count_less_than_ten=location_count[location_count<=10]

In [39]:
df1.Location = df1.Location.apply(lambda x: 'other' if x in location_count_less_than_ten else x)

In [40]:
len(df1['Location'].unique())

60

In [41]:
df1[df1.Area/df1.Bed<300].head(10) #Unrealistic ratio 

Unnamed: 0,Location,Area,Bed,Bath,Rent,Price_Per_Sqft
39,Jatra Bari,800,3,2,15000,18.75
86,Mirpur,745,3,2,15000,20.134228
135,Jatra Bari,800,3,2,15000,18.75
186,Maghbazar,550,2,2,13000,23.636364
191,Adabor,550,2,1,8500,15.454545
193,Khilgaon,550,2,1,11000,20.0
195,Mirpur,850,3,2,13000,15.294118
205,Jatra Bari,800,3,2,15000,18.75
252,Mohammadpur,750,3,2,16000,21.333333
260,Dakshin Khan,800,3,2,8500,10.625


In [42]:
df1.shape

(28787, 6)

In [43]:
df2=df1[~(df1.Area/df1.Bed<300)]
df2.shape

(27866, 6)

In [44]:
df2.Price_Per_Sqft.describe()

count    27866.000000
mean        18.947709
std          6.576776
min          6.500000
25%         15.384615
50%         18.000000
75%         20.952381
max        228.571429
Name: Price_Per_Sqft, dtype: float64

In [45]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('Location'):
        m=np.mean(subdf.Price_Per_Sqft)
        st=np.std(subdf.Price_Per_Sqft)
        reduced_df=subdf[(subdf.Price_Per_Sqft>(m-st)) & (subdf.Price_Per_Sqft<=(m+st))]
        df_out=pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

df3=remove_pps_outliers(df2)
df3.shape

(19889, 6)

In [46]:
def remove_bedroom_outliers(df):
    exclude_indices=np.array([])
    for location, location_df in df.groupby('Location'):
        bed_stats={}
        for bed,bed_df in location_df.groupby('Bed'):
            bed_stats[bed]={
                'mean':np.mean(bed_df.Price_Per_Sqft),
                'std':np.std(bed_df.Price_Per_Sqft),
                'count':bed_df.shape[0]
            }
        for bed,bed_df in location_df.groupby('Bed'):
            stats=bed_stats.get(bed-1)
            if stats and stats['count']>5:
                exclude_indices=np.append(exclude_indices, bed_df[bed_df.Price_Per_Sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

df4=remove_bedroom_outliers(df3)
df4.shape

(9750, 6)

In [47]:
df4.Bath.unique()

array([3, 2, 1, 4, 5, 6])

In [48]:
df5=df4[df4.Bath<df4.Bath+2]
df5.shape

(9750, 6)

In [49]:
df6=df5.drop(['Price_Per_Sqft'],axis='columns')

In [50]:
df6.shape

(9750, 5)

In [51]:
df6.head(30)

Unnamed: 0,Location,Area,Bed,Bath,Rent
10,Adabor,1550,3,3,30000
13,Adabor,800,2,2,16000
14,Adabor,850,2,2,15000
17,Adabor,1550,3,3,30000
19,Adabor,1550,3,3,30000
20,Adabor,800,2,2,15000
21,Adabor,800,2,2,15000
23,Adabor,800,2,2,14000
26,Adabor,850,2,2,15000
27,Adabor,800,2,2,15500


In [52]:
df6.Location.unique()

array(['Adabor', 'Aftab Nagar', 'Agargaon', 'Badda', 'Banani',
       'Banani DOHS', 'Banasree', 'Bangshal', 'Baridhara',
       'Baridhara DOHS', 'Bashabo', 'Bashundhara R-A', 'Cantonment',
       'Dakshin Khan', 'Dhanmondi', 'Eskaton', 'Gulshan', 'Hatirpool',
       'Hazaribag', 'Ibrahimpur', 'Jatra Bari', 'Joar Sahara',
       'Kachukhet', 'Kafrul', 'Kakrail', 'Kalabagan', 'Kalachandpur',
       'Kathalbagan', 'Khilgaon', 'Khilkhet', 'Kotwali', 'Kuril',
       'Lalbagh', 'Lalmatia', 'Maghbazar', 'Malibagh', 'Mirpur',
       'Mohakhali', 'Mohakhali DOHS', 'Mohammadpur', 'Motijheel',
       'Mugdapara', 'Nadda', 'New Market', 'Niketan', 'Nikunja',
       'North Shahjahanpur', 'Rampura', 'Shahjahanpur', 'Shantinagar',
       'Shegunbagicha', 'Shiddheswari', 'Shyamoli', 'Shyampur',
       'Sutrapur', 'Tejgaon', 'Turag', 'Uttar Khan', 'Uttara', 'other'],
      dtype=object)

In [53]:
df6 = df6.reset_index(drop=True)
dummies = pd.get_dummies(df6.Location)
dummies = dummies.astype(int)

In [54]:
dummies.head(3)

Unnamed: 0,Adabor,Aftab Nagar,Agargaon,Badda,Banani,Banani DOHS,Banasree,Bangshal,Baridhara,Baridhara DOHS,...,Shegunbagicha,Shiddheswari,Shyamoli,Shyampur,Sutrapur,Tejgaon,Turag,Uttar Khan,Uttara,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
df7=pd.concat([df6,dummies.drop('other',axis='columns')],axis='columns')
df7.head(3)

Unnamed: 0,Location,Area,Bed,Bath,Rent,Adabor,Aftab Nagar,Agargaon,Badda,Banani,...,Shantinagar,Shegunbagicha,Shiddheswari,Shyamoli,Shyampur,Sutrapur,Tejgaon,Turag,Uttar Khan,Uttara
0,Adabor,1550,3,3,30000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Adabor,800,2,2,16000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adabor,850,2,2,15000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
df8=df7.drop('Location',axis='columns')
df8.head(2)

Unnamed: 0,Area,Bed,Bath,Rent,Adabor,Aftab Nagar,Agargaon,Badda,Banani,Banani DOHS,...,Shantinagar,Shegunbagicha,Shiddheswari,Shyamoli,Shyampur,Sutrapur,Tejgaon,Turag,Uttar Khan,Uttara
0,1550,3,3,30000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,800,2,2,16000,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
X=df8.drop('Rent',axis='columns')
X.head()

Unnamed: 0,Area,Bed,Bath,Adabor,Aftab Nagar,Agargaon,Badda,Banani,Banani DOHS,Banasree,...,Shantinagar,Shegunbagicha,Shiddheswari,Shyamoli,Shyampur,Sutrapur,Tejgaon,Turag,Uttar Khan,Uttara
0,1550,3,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,800,2,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,850,2,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1550,3,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1550,3,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
y=df8.Rent
y.head()

0    30000
1    16000
2    15000
3    30000
4    30000
Name: Rent, dtype: int64

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [87]:
lr_clf=LinearRegression()
lr_clf.fit(X_train,y_train)
lr_score=lr_clf.score(X_test,y_test)
print("Linear Score: %.6f" % (lr_score * 100))

Linear Score: 91.875752


In [88]:
cv=ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(),X,y,cv=cv)

array([0.90816917, 0.92378826, 0.88279049, 0.91749134, 0.91756592])

In [89]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

scores = {}

def get_cv_score(model, X, y):
    cv_scores = []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
        y_train, y_test = train_test_split(y, test_size=0.2, random_state=42)
        
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        cv_scores.append(score)
    
    return np.mean(cv_scores)

lr_clf = LinearRegression()
scores["Linear Regression"] = get_cv_score(lr_clf, X, y)

las_clf = Lasso(alpha=0.1)
scores["Lasso"] = get_cv_score(las_clf, X, y)

ridge_clf = Ridge(alpha=0.1, solver='auto')
scores["Ridge Regression"] = get_cv_score(ridge_clf, X, y)

xgb_clf = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, 
                       learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
scores["XGBoost"] = get_cv_score(xgb_clf, X, y)

for model, score in scores.items():
    print(f"{model} Score: {score * 100:.2f}")


Linear Regression Score: 92.66
Lasso Score: 92.66
Ridge Regression Score: 92.66
XGBoost Score: 96.23


In [90]:
print(type(X_train))
print(type(y_train))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [91]:
def predict_price(Location,Area,Bed,Bath,X):    
    index = np.where(X.columns == Location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = Area
    x[1] = Bed
    x[2] = Bath
    if index >= 0:
        x[index] = 1

    return xgb_clf.predict([x])[0]

In [92]:
# Now predict using the best model
predicted_price = round(predict_price('Lalmatia', 1100, 3, 2, X))
print(f"Predicted Price: {predicted_price}")

Predicted Price: 27950


In [93]:
# Now predict using the best model
predicted_price = round(predict_price('Lalmatia', 1100, 2, 3, X))
print(f"Predicted Price: {predicted_price}")

Predicted Price: 26378


In [94]:
predicted_price = predict_price('Gulshan', 2000, 3, 3, X)
print(f"Predicted Price: {predicted_price}")

Predicted Price: 81163.6484375


In [95]:
predicted_price = predict_price('Gulshan', 2200, 3, 4, X)
print(f"Predicted Price: {predicted_price}")

Predicted Price: 85857.21875


In [96]:
predicted_price = predict_price('Mirpur', 1450, 3, 3, X)
print(f"Predicted Price: {predicted_price}")

Predicted Price: 25397.431640625


In [97]:
os.makedirs('../Model', exist_ok=True)

with open('../Model/dhaka_rent_model.pkl', 'wb') as f:
    pickle.dump(xgb_clf, f)

print("Model saved successfully!")

Model saved successfully!


In [108]:
x_column=X.columns.tolist()
with open('../Model/X_columns.pkl','wb') as f:
    joblib.dump(x_column,f)

In [109]:
X_columns = joblib.load("../../RentPrediction/Model/X_columns.pkl")
print(type(X_columns))  # Should print <class 'list'>
print(len(X_columns)) 
print(X_columns)

<class 'list'>
62
['Area', 'Bed', 'Bath', 'Adabor', 'Aftab Nagar', 'Agargaon', 'Badda', 'Banani', 'Banani DOHS', 'Banasree', 'Bangshal', 'Baridhara', 'Baridhara DOHS', 'Bashabo', 'Bashundhara R-A', 'Cantonment', 'Dakshin Khan', 'Dhanmondi', 'Eskaton', 'Gulshan', 'Hatirpool', 'Hazaribag', 'Ibrahimpur', 'Jatra Bari', 'Joar Sahara', 'Kachukhet', 'Kafrul', 'Kakrail', 'Kalabagan', 'Kalachandpur', 'Kathalbagan', 'Khilgaon', 'Khilkhet', 'Kotwali', 'Kuril', 'Lalbagh', 'Lalmatia', 'Maghbazar', 'Malibagh', 'Mirpur', 'Mohakhali', 'Mohakhali DOHS', 'Mohammadpur', 'Motijheel', 'Mugdapara', 'Nadda', 'New Market', 'Niketan', 'Nikunja', 'North Shahjahanpur', 'Rampura', 'Shahjahanpur', 'Shantinagar', 'Shegunbagicha', 'Shiddheswari', 'Shyamoli', 'Shyampur', 'Sutrapur', 'Tejgaon', 'Turag', 'Uttar Khan', 'Uttara']
