In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
import pickle as pkl

In [3]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [4]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df.shape

(13320, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [7]:
#using value counts on each column to get the the total no of things in the rows
for column in df.columns:
    print(df[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [8]:
#checking missing values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [9]:
df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [10]:
#here im getting only bath and price as describe() method is used on numerical columns only not non-numeric columns
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [11]:
#after dropping the columns total columns left are
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [12]:
#Checking which value is most to fill the missing values with and it is categorical column so we use value_counts
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [13]:
#filling the missing location with sarjapur road we use value_counts here also as categorical
df['location'] = df['location'].fillna('Sarjapur Road')

In [14]:
#checking the missing values in size using the value_count
df['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [15]:
#filling the missing values 
df['size'] = df['size'].fillna('2 BHK')

In [16]:
#filling the missing values in the bath column using the median as it is numerical column.
df['bath'] = df['bath'].fillna(df['bath'].median())

In [17]:
#checking if the missing values are completely filled or not
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [18]:
#creating a new column called 'bhk' in the DataFrame data by extracting the number of bedrooms from the 'size' column and converting it to an integer.
df['bhk'] = df['size'].str.split().str.get(0).astype(int)

In [19]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [20]:
#checking if there are more than 20 bhk in data and these are basically outliers in the data
df[df.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [21]:
#checking for some unique values in total_sqft but here is some range problem like 1133 - 1384 which we have to fix
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
#using this code to convert the range values like 1133 - 1384 into simple values by adding these two and then dividing these two also i.e getting the mean of the both values
def convertRange(x):
    #splitting the values having - sign and storing it in temp as string
    temp = x.split('-')
    # thus temp values is two(2) which is 1133 - 1384 then
    if len(temp) == 2:
        # converting the 0 index into float and 1 index in float also and dividing by 2 to get mean of that.
        return (float(temp[0]) + float(temp[1]))/2
    try:
        #after splitting if we get only 1 temp value we will return the x value
        return float(x)
    except:
        # if there is any problem in the conversion then it will return none
        return None
    
    

In [23]:
#applying the function convertRange here and storing that in total-sqft back
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [24]:
# here size is converted into bhk and total_sqft range value is also replaced by the mean value
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


## Price Per Square Feet

In [25]:
# we will create another column price_per sqft which will help in removing the outliers. we will get it by doing price/total_sqft
# we will multiply it with 100000(1 lakh) so that price comes out from decimal points and divide by total_sqft
df['price_per_square_feet'] = df['price'] *100000 / df['total_sqft']

In [26]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_square_feet
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [27]:
df['price_per_square_feet']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_square_feet, Length: 13320, dtype: float64

In [28]:
# here we added bhk and price_per_sqft as we convert them into numerical columns
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_square_feet
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [29]:
# here we will check the value counts in which we have around 1306 different places which we can't pass on our model so we will replace it with others
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1306, dtype: int64

In [30]:
# here we are using lambda in the 'location' column to remove any leading or trailing whitespaces and using strip to remove any accidental whitespaces
# this will combine the values which has come more than once in the data. eg Whitefield , Whitefield = 2(Whitefield is twice)
df['location'] = df['location'].apply(lambda x: x.strip())
# thus here we are counting after adding all the similar values
location_count = df['location'].value_counts()

In [31]:
# here length is reduced to 1295 as we combined the locations which have come more than once
location_count

location
Whitefield                            541
Sarjapur  Road                        399
Electronic City                       304
Kanakpura Road                        273
Thanisandra                           237
                                     ... 
1Channasandra                           1
Hosahalli                               1
Vijayabank bank layout                  1
near Ramanashree California resort      1
Abshot Layout                           1
Name: count, Length: 1295, dtype: int64

In [32]:
# here we are ehecking those locations which only came either 10 times or less than that
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1054, dtype: int64

In [33]:
# thus all the locations which are 10 or less than 10 we will replace it with other so that the length of location decreases else keep it same as x is
df['location'] = df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [34]:
# thus length gets reduced as 2886 are other values and length is now only 242
df['location'].value_counts()

location
other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

## Outlier detection and removal

In [35]:
# we will remove outlier in total_sqft as it is no possible to be only 1 sqft house(MIN)
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_square_feet
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [36]:
# getting how many bhk should be in a sqft
(df['total_sqft']/df['bhk']).describe()
#after using describe we found that there is a flat with min = 0.25 sqft area in a flat

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [37]:
#here we say that total_sqft in a bhk should be greater than 300 and less than that is not feasible
df = df[((df['total_sqft']/df['bhk']) >= 300)]

In [38]:
df.describe()
# thus we are keeping only total_sqft greater than 300(MIN = 300)

Unnamed: 0,total_sqft,bath,price,bhk,price_per_square_feet
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [39]:
df.shape
#shape is now changed from 13300 to 12530

(12530, 7)

In [40]:
df['price_per_square_feet'].describe()
# here  the MAX value is too big which is not possible for a flat thus it is an outlier which needs to be removed

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_square_feet, dtype: float64

In [41]:
def remove_outliers(df):
    df_output = pd.DataFrame()
    # using groupby we get location key and subdf(subdataframe)
    # doing it for all locations using for loop
    for key,subdf in df.groupby('location'):
        # getting mean price_per_sqft of a single location
        m = np.mean(subdf.price_per_square_feet)
         # getting standard deviation price_per_sqft of a single location
        st = np.std(subdf.price_per_square_feet)
        
        # keeping that subdata-frame price_per_sqft and then keeping it under m-st and m+st and storing in gen_df
        #concatenating it with df_output
        gen_df = subdf[(subdf['price_per_square_feet'] > (m-st)) & (subdf['price_per_square_feet'] < (m+st))]
        df_output = pd.concat([df_output,gen_df], ignore_index = True)
    return df_output

df = remove_outliers(df)

# thus here max std min all are reduced and outliers are almost removed

In [42]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_square_feet
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


## bhk outlier removal

In [43]:
def bhk_outlier_remover(df):
    #creating an array which will store an array which we dont need
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        #creating dictionary which will store bhk stats 
        bhk_stats = {}
        #doing group by on bhk_df(dataframe)
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_df[bhk] = {
                # in this for each bhk we are storing its value i.e mean, std and count
                'mean': np.mean(bhk_df.price_per_square_feet),
                'std': np.std(bhk_df.price_per_square_feet),
                'count': bhk_df.shape[0]
            }
            
        for bhk, bhk_df in location_df.groupby('bhk'):
            #after getting the stats here what we do is
            stats = bhk_stats.get(bhk-1) # here eg if we have 3bhk flat we will check the stats of the 2 bhk flat
            if stats and stats['count']>5:
                # if the current price point of 3bhk is less than mean of 2bhk then we are keeping it otherwise not.
                exclude_indices = np.append(exclude_indices,bhk_df[bhk_df.price_per_square_feet<(stats['mean'])].index.values)
     #after storing in exclude_indices we drop all those exclude_indices           
    return df.drop(exclude_indices, axis='index')

In [44]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_square_feet
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [45]:
df=bhk_outlier_remover(df)

In [46]:
df.shape

(10301, 7)

In [47]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_square_feet
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.00,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.00,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.00,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.00,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.00,2,11983.805668
...,...,...,...,...,...,...,...
10296,other,2 BHK,1353.0,2.0,110.00,2,8130.081301
10297,other,1 Bedroom,812.0,1.0,26.00,1,3201.970443
10298,other,3 BHK,1440.0,2.0,63.93,3,4439.583333
10299,other,2 BHK,1075.0,2.0,48.00,2,4465.116279


In [48]:
#Dropping columns size and price_per_sqft(it was only used to remove outliers)
df.drop(columns=['size','price_per_square_feet'], axis= 1, inplace=True)

## Cleaned Data

In [49]:
df.head()
#location, total_sqft,bath, and bhk are our features and price is our target column

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [50]:
#saving the cleaned data using data.to_csv
df.to_csv("Cleaned_data.csv")

In [51]:
#creating X and Y where price is excluded from feature columns as it is target column
X = df.drop(columns=['price'])
y = df['price']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [53]:
print(X_train.shape)
print(X_test.shape)

(8240, 4)
(2061, 4)


## Applying Linear Regression

In [54]:
#using onehotencoding on location column(categorical) using column transfoemer
colum_trans = make_column_transformer((OneHotEncoder(sparse=False),['location']),
                                      remainder='passthrough')

In [55]:
scaler = StandardScaler()

In [56]:
lr = LinearRegression()

In [57]:
pipeline = make_pipeline(colum_trans,scaler, lr)

In [58]:
pipeline.fit(X_train, y_train)



In [59]:
y_pred = pipeline.predict(X_test)

In [60]:
score = r2_score(y_test,y_pred)

In [61]:
score

0.8284768898542761

## Applying Lasso 
 Lasso regression uses L1 regularization and can lead to feature selection by setting some coefficients exactly to zero

In [62]:
lasso = Lasso()

In [64]:
pipe = make_pipeline(colum_trans,scaler, lasso)

In [65]:
pipe.fit(X_train,y_train)



In [66]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.821316543187225

## Applying Ridge
 Ridge regression uses L2 regularization and shrinks coefficients towards zero

In [67]:
ridge = Ridge()

In [68]:
pipe = make_pipeline(colum_trans,scaler, ridge)

In [69]:
pipe.fit(X_train,y_train)



In [70]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.8284595107915464

In [71]:
print("No Regularization: ", r2_score(y_test, y_pred))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ", r2_score(y_test, y_pred_ridge))

No Regularization:  0.8284768898542761
Lasso:  0.821316543187225
Ridge:  0.8284595107915464


In [72]:
import pickle

In [73]:
pickle.dump(pipe, open('RidgeModel.pkl','wb'))