In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [61]:
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.preprocessing import OneHotEncoder

In [4]:
df=pd.read_csv('Bengaluru_House_Data.csv')
df.sample(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
1584,Super built-up Area,19-Dec,Hoodi,3 BHK,Caycend,1512,3.0,2.0,75.77
7914,Super built-up Area,Ready To Move,Somasundara Palya,3 BHK,,1275,2.0,2.0,52.0
1883,Super built-up Area,Ready To Move,Sector 2 HSR Layout,3 BHK,NCdgerj,1450,2.0,2.0,135.0
8520,Super built-up Area,Ready To Move,Parappana Agrahara,2 BHK,Peide L,1194,2.0,2.0,47.0
2368,Plot Area,Ready To Move,Bileshivale,5 Bedroom,LGDewke,6040,4.0,,170.0


In [5]:
for i in df.columns:
    print(df[i].value_counts())
    print("........................")

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
........................
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
........................
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
........................
si

In [6]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
df.shape

(13320, 9)

<h2>Handling Missing Values</h2>

In [7]:
df['size']=df['size'].fillna('2 BHK')

In [8]:
df['location']=df['location'].fillna('Whitefield')

In [9]:
df['bath']=df['bath'].fillna(df['bath'].median())

In [10]:
df['balcony']=df['balcony'].fillna(df['balcony'].median())

In [11]:
df=df.drop(columns=['society','availability'])

In [12]:
def convert(x):
    s=x.split('-')
    if len(s)==2:
        return (float(s[0])+float(s[1]))/2
    try:
        return float(x)
    except:
        return None

In [13]:
df['total_sqft']=df['total_sqft'].apply(convert)

In [14]:
df['bhk']=df['size'].str.split().str.get(0).astype(int)

In [15]:
df=df.drop(columns=['size'])

In [16]:
df['price_per_sq']=df['total_sqft']/df['price']

In [17]:
df.sample()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sq
5182,Super built-up Area,Hoodi,1925.0,3.0,3.0,110.0,3,17.5


In [18]:
df['total_sqft']=df['total_sqft'].fillna(df['total_sqft'].median())
df['price_per_sq']=df['price_per_sq'].fillna(df['price_per_sq'].median())

In [19]:
df.isnull().sum()

area_type       0
location        0
total_sqft      0
bath            0
balcony         0
price           0
bhk             0
price_per_sq    0
dtype: int64

In [20]:
df.sample()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sq
1063,Plot Area,2nd Stage Nagarbhavi,3000.0,8.0,3.0,451.0,6,6.651885


In [21]:
df['location']=df['location'].apply(lambda x:x.strip())

In [22]:
count=df['location'].value_counts()
count_new=count[count<=10]
count_new

location
Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
singapura paradise                 1
Abshot Layout                      1
Name: count, Length: 1053, dtype: int64

In [24]:
df['location']=df['location'].apply(lambda x:'other' if x in count_new else x)

In [25]:
df['location'].value_counts()

location
other                 2885
Whitefield             542
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

In [28]:
df.sample()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sq
12321,Super built-up Area,Brookefield,1382.0,2.0,2.0,84.5,2,16.35503


<h2>Outlier Detection And Removal</h2>

In [29]:
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sq
count,13320.0,13320.0,13320.0,13320.0,13320.0,13320.0
mean,1558.647202,2.688814,1.603378,112.565627,2.802778,18.919847
std,1236.376834,1.338754,0.803067,148.971674,1.294496,9.405701
min,1.0,1.0,0.0,8.0,1.0,0.008333
25%,1100.0,2.0,1.0,50.0,2.0,13.7
50%,1276.0,2.0,2.0,72.0,3.0,18.401613
75%,1678.0,3.0,2.0,120.0,3.0,23.409091
max,52272.0,40.0,3.0,3600.0,43.0,373.371429


In [30]:
(df['total_sqft']/df['bhk']).describe()

count    13320.000000
mean       575.122682
std        388.133558
min          0.250000
25%        473.000000
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [31]:
df=df[((df['total_sqft']/df['bhk'])>=300)]

In [35]:
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sq
count,12568.0,12568.0,12568.0,12568.0,12568.0,12568.0
mean,1593.601347,2.559039,1.603278,111.409518,2.650064,19.489026
std,1259.4844,1.077715,0.796664,151.916676,0.976813,9.278165
min,300.0,1.0,0.0,8.44,1.0,0.566667
25%,1117.75,2.0,1.0,49.0,2.0,14.482759
50%,1300.0,2.0,2.0,70.0,3.0,18.868486
75%,1700.0,3.0,2.0,115.0,3.0,23.735277
max,52272.0,16.0,3.0,3600.0,16.0,373.371429


In [37]:
df.price_per_sq.describe()

count    12568.000000
mean        19.489026
std          9.278165
min          0.566667
25%         14.482759
50%         18.868486
75%         23.735277
max        373.371429
Name: price_per_sq, dtype: float64

In [39]:
def price_per_outlier(df):
    out=pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m=np.mean(subdf.price_per_sq)
        s=np.std(subdf.price_per_sq)
        newdf=subdf[(subdf.price_per_sq>(m-s)) & (subdf.price_per_sq<=(m+s))]
        out=pd.concat([out,newdf],ignore_index=True)
    return out
df=price_per_outlier(df)
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sq
count,9533.0,9533.0,9533.0,9533.0,9533.0,9533.0
mean,1527.678104,2.495437,1.600545,96.976132,2.591105,19.052033
std,911.895078,0.990357,0.791191,96.591971,0.900303,5.901291
min,300.0,1.0,0.0,10.0,1.0,3.692308
25%,1115.0,2.0,1.0,51.0,2.0,15.180328
50%,1300.0,2.0,2.0,70.0,2.0,18.833333
75%,1675.0,3.0,2.0,106.0,3.0,22.593407
max,30400.0,16.0,3.0,2200.0,16.0,41.0


In [43]:
df.sample()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sq
1563,Super built-up Area,Devarachikkanahalli,991.0,2.0,2.0,40.0,2,24.775


In [45]:
df=df.drop(columns=['price_per_sq'])

<h2>Model</h2>

In [46]:
Y=df['price']
X=df.drop(columns=['price'])

In [76]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [77]:
X_train

Unnamed: 0,area_type,location,total_sqft,bath,balcony,bhk
5301,Plot Area,Ramagondanahalli,540.0,1.0,0.0,1
9190,Plot Area,other,1500.0,5.0,2.0,5
3151,Built-up Area,JP Nagar,1315.0,2.0,1.0,3
7775,Super built-up Area,other,700.0,1.0,1.0,1
7003,Super built-up Area,Yelahanka,1450.0,3.0,0.0,3
...,...,...,...,...,...,...
7891,Super built-up Area,other,1800.0,3.0,2.0,3
9225,Super built-up Area,other,3800.0,6.0,2.0,6
4859,Super built-up Area,Old Madras Road,1065.0,2.0,1.0,2
3264,Built-up Area,Jalahalli,1400.0,1.0,1.0,2


In [78]:
col_trans=ColumnTransformer([('ohe_area_loc',OneHotEncoder(sparse_output=False),[0,1])],remainder='passthrough')

In [79]:
scaler=StandardScaler()

In [80]:
lr=LinearRegression()

In [81]:
pipe=make_pipeline(col_trans,scaler,lr)

In [82]:
pipe.fit(X_train,Y_train)

In [83]:
pipe.predict(X_test)

array([ 49.25710347,  62.14577535,  74.80397847, ...,  18.17897847,
       514.23073629,  61.4231191 ])

In [84]:
pred

array([ 49.25710347,  62.14577535,  74.80397847, ...,  18.17897847,
       514.23073629,  61.4231191 ])

In [85]:
r2_score(Y_test,pred)

0.7748079281499975