In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the raw listings from a CSV expected in the working directory,
# then preview the first rows.
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(13320, 9)

In [4]:
# Per-column dtype and non-null count, to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
# Summary statistics; by default only the numeric columns are included.
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
# Dropping unnecessary columns (done in place on df).
df.drop(
    columns=["area_type", "availability", "society", "balcony"],
    inplace=True,
)

In [7]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [8]:
# Count the missing values in each remaining column.
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [9]:
# The leading token of "size" ("2 BHK", "4 Bedroom" -> 2, 4) is the bedroom
# count. Convert it to a number immediately so the feature is not left as
# strings that the estimators would have to cast implicitly. Missing sizes
# stay NaN, and any non-numeric token is coerced to NaN as well.
df["bhk"] = pd.to_numeric(df["size"].str.split(" ").str[0], errors="coerce")

In [10]:
# Preview the frame with the new "bhk" column.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [11]:
# Fill missing values in these columns with each column's most frequent
# value (mode).
col = ["location", "bhk", "bath"]
for i in col:
    # Assign the result back to the frame. Calling fillna(..., inplace=True)
    # on a selected column is chained assignment: it emits a FutureWarning on
    # pandas 2.x and leaves df unchanged under Copy-on-Write.
    df[i] = df[i].fillna(df[i].mode().iloc[0])

In [12]:
# Re-count missing values to confirm the imputation took effect.
df.isnull().sum()

location       0
size          16
total_sqft     0
bath           0
price          0
bhk            0
dtype: int64

In [13]:
# Dropping the raw "size" column (done in place on df).
df.drop(columns="size", inplace=True)

In [14]:
# Inspect the raw total_sqft values to see which formats need parsing.
df["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [15]:
def rangeconv(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1056") or a "low - high" range
        ("1133 - 1384").

    Returns
    -------
    float or None
        The numeric value, or the midpoint of the range. ``None`` is
        returned for anything that cannot be parsed (for example
        "34.46Sq. Meter", a malformed range such as "1000 - abc", or a
        non-string, non-numeric input).
    """
    # Try the plain-number form first so that values containing a "-" that
    # are still valid numbers ("-5", "1e-3") are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can hold a "low - high" range.
    if not isinstance(x, str):
        return None

    ran = x.split("-")
    if len(ran) == 2:
        try:
            return (float(ran[0]) + float(ran[1])) / 2
        except ValueError:
            # One side of the range is not a number.
            return None
    return None

In [16]:
# Parse every raw total_sqft entry into a float with rangeconv.
df["total_sqft"] = df["total_sqft"].apply(rangeconv)

In [17]:
# Inspect the values again after parsing.
df["total_sqft"].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [18]:
# Number of total_sqft entries that are missing after parsing.
df["total_sqft"].isnull().sum()

46

In [19]:
# Fill the missing total_sqft entries with the column median.
# The result is assigned back: fillna(..., inplace=True) on a selected column
# is chained assignment, which emits a FutureWarning on pandas 2.x and leaves
# df unchanged under Copy-on-Write.
df["total_sqft"] = df["total_sqft"].fillna(df["total_sqft"].median())

In [20]:
# Confirm no missing total_sqft values remain.
df["total_sqft"].isnull().sum()

0

In [21]:
# Frequency of each bathroom count; helps spot implausibly large values.
df["bath"].value_counts()

2.0     6981
3.0     3286
4.0     1226
1.0      788
5.0      524
6.0      273
7.0      102
8.0       64
9.0       43
10.0      13
12.0       7
11.0       3
13.0       3
16.0       2
14.0       1
40.0       1
18.0       1
27.0       1
15.0       1
Name: bath, dtype: int64

In [22]:
# Inspect the distinct location names.
df["location"].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [23]:
# Number of listings per location, most frequent first.
df["location"].value_counts()

Whitefield                             541
Sarjapur  Road                         399
Electronic City                        302
Kanakpura Road                         273
Thanisandra                            234
                                      ... 
Adarsh Nagar                             1
Michael Palaya                           1
singapura paradise                       1
5 Bedroom Farm House in Lakshmipura      1
Reliable Woods Layout                    1
Name: location, Length: 1305, dtype: int64

In [24]:
# dum_locat = pd.get_dummies(df["location"], drop_first=True)

In [25]:
# data = pd.concat([df, dum_locat], axis=1)

In [26]:
# data.drop("location", axis=1 ,inplace=True)

In [27]:
# Replace each location name with an integer code.
# NOTE(review): these codes carry no meaningful order, yet the models below
# will treat them as numeric magnitudes -- consider one-hot encoding or
# grouping rare locations; confirm this is intended.
enc = LabelEncoder()
df["location"] = enc.fit_transform(df["location"])

In [29]:
# Preview the frame after encoding the location column.
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,419,1056.0,2.0,39.07,2
1,317,2600.0,5.0,120.0,4
2,1179,1440.0,2.0,62.0,3
3,757,1521.0,3.0,95.0,3
4,716,1200.0,2.0,51.0,2


In [32]:
# The target is the listing price; every remaining column is a feature.
y = df["price"]
x = df.drop(columns="price")

In [33]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [34]:
# Number of rows in the training split.
len(x_train)

9324

In [35]:
# Number of rows in the test split.
len(x_test)

3996

In [36]:
# Baseline model: linear regression with default settings.
lg = LinearRegression()

In [37]:
# Fit the linear model on the training split.
lg.fit(x_train,y_train)

LinearRegression()

In [38]:
# For scikit-learn regressors, score() is R^2, here on the held-out test split.
lg.score(x_test , y_test)

0.456611464253599

In [39]:
# Shallow regression tree. The split criterion is left at the library default
# (mean squared error): the literal 'mse' was renamed to 'squared_error' in
# scikit-learn 1.0 and is rejected from 1.2 onward, so omitting it builds the
# same model on both old and new releases.
dt = DecisionTreeRegressor(
    splitter='best',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=43,
)

In [40]:
# Fit the decision tree on the training split.
dt.fit(x_train , y_train)

DecisionTreeRegressor(max_depth=5, random_state=43)

In [41]:
# R^2 of the decision tree on the held-out test split.
dt.score(x_test, y_test)

0.4398213133538682

In [42]:
# Random forest of 100 shallow trees. The split criterion is left at the
# library default (mean squared error): the literal 'mse' was renamed to
# 'squared_error' in scikit-learn 1.0 and is rejected from 1.2 onward, so
# omitting it builds the same model on both old and new releases.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
    n_jobs=5,        # number of parallel workers used for fit/predict
    random_state=42,
    verbose=5,       # prints per-tree progress while fitting and scoring
)

In [43]:
# Fit the random forest on the training split; the estimator's verbose
# setting makes this cell print per-tree progress.
rf.fit(x_train , y_train)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    0.1s


building tree 1 of 100building tree 2 of 100building tree 3 of 100
building tree 4 of 100

building tree 5 of 100

building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100building tree 10 of 100

building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100building tree 17 of 100

building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100building tree 36 of 100

building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:    0.3s
[Parallel(n_jobs=5)]: Done 100 out of 100 | elapsed:    0.4s finished


RandomForestRegressor(max_depth=5, min_samples_leaf=5, min_samples_split=5,
                      n_jobs=5, random_state=42, verbose=5)

In [44]:
# R^2 of the random forest on the held-out test split.
rf.score(x_test , y_test)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:    0.0s
[Parallel(n_jobs=5)]: Done 100 out of 100 | elapsed:    0.0s finished


0.573244686081233