In [1]:
import pandas as pd
import numpy as np

## Data Import and Exploration

In [2]:
data=pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


The data.info() command provides a concise summary of the DataFrame data, including information about its columns such as their names, non-null counts, data types, and memory usage, aiding in understanding the structure and completeness of the dataset.

In [6]:
for column in data.columns:
  print(data[column].value_counts())
  print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

The code iterates over each column in a DataFrame containing property data and prints the count of unique values for each column, providing insights into the distribution of property attributes such as area type, availability, location, size, society, total square footage, number of bathrooms, balconies, and prices within the dataset.

## Data Cleaning

In [7]:
data.isna() .sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace=True)

In [9]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
data['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

The code snippet first provides a count of properties based on their locations, showing the number of properties in each location such as 'Whitefield', 'Sarjapur Road', 'Electronic City', etc., providing insights into the distribution of properties across different areas.

In [12]:
data['location'] = data['location'].fillna('Sarjapur Road')

it fills any missing values in the 'location' column with the string 'Sarjapur Road'

In [13]:
data['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

it displays the count of properties based on their sizes (number of bedrooms), ranging from 1 RK (Room + Kitchen) to 43 bedrooms, providing an overview of the distribution of property sizes within the dataset.

In [14]:
data['size'] = data['size'].fillna('2 BHK')

In [15]:
data['bath'] - data['bath'].fillna(data['bath'].median())

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
13315    0.0
13316    0.0
13317    0.0
13318    0.0
13319    0.0
Name: bath, Length: 13320, dtype: float64

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


The code provided generates information about the DataFrame data, showing that it contains 13320 entries with 5 columns: 'location', 'size', 'total_sqft', 'bath', and 'price'.

In [17]:
data['bhk']=data['size'].str.split().str.get(0).astype(int)

 the code creates a new column 'bhk' by extracting the number of bedrooms (BHK) from the 'size' column and converting it to integer format using the 'str.split()' and 'str.get()' methods.

In [18]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


it filters the DataFrame to show entries where the number of bedrooms ('bhk') is greater than 20, though no code is provided for this specific step.

In [19]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [20]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1])) / 2
    try:
        return float(x)
    except:
        return None

it defines a function called convertRange(x) which takes an input x, splits it if it contains a hyphen, calculates the average if it's a range, and converts it to a float value. If the input cannot be converted to a float

In [21]:
data['total_sqft']=data['total_sqft'].apply(convertRange)

it applies the convertRange() function to the 'total_sqft' column of the DataFrame data, converting any range values to their average float value.

In [22]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


Finally, it displays the first few rows of the modified DataFrame data, where the 'total_sqft' column now contains float values instead of ranges or single values.

## Feature Engineering

In [23]:
data['price_per_sqft'] = data['price'] *100000 / data['total_sqft']

This code calculates the price per square foot for each entry in the DataFrame data by dividing the 'price' column (in units of 100,000) by the 'total_sqft' column.

In [24]:
data['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

it displays the resulting 'price_per_sqft' values, showing the calculated price per square foot for each entry in the DataFrame.

In [25]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13247.0,13320.0,13320.0,13274.0
mean,1559.626694,2.69261,112.565627,2.802778,7907.501
std,1238.405258,1.341458,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [26]:
data['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1306, dtype: int64

In [27]:
data['location'] = data['location'].apply(lambda x: x.strip())
location_count= data['location']. value_counts()

In [28]:
location_count

location
Whitefield                            541
Sarjapur  Road                        399
Electronic City                       304
Kanakpura Road                        273
Thanisandra                           237
                                     ... 
1Channasandra                           1
Hosahalli                               1
Vijayabank bank layout                  1
near Ramanashree California resort      1
Abshot Layout                           1
Name: count, Length: 1295, dtype: int64

In [29]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1054, dtype: int64

In [30]:
data[ 'location' ]=data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

This line of code replaces locations with fewer than 10 occurrences with the label 'other' in the 'location' column of the DataFrame. It uses a lambda function to check if each location falls into the category of those with fewer than 10 occurrences, and if so, it replaces it with 'other'; otherwise, it keeps the original location. This helps to simplify and generalize the location data, grouping less common locations together.

In [31]:
data['location'].value_counts()

location
other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

## Outlier Removal

In [32]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13247.0,13320.0,13320.0,13274.0
mean,1559.626694,2.69261,112.565627,2.802778,7907.501
std,1238.405258,1.341458,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [33]:
(data[ 'total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [34]:
data = data[((data['total_sqft']/data['bhk']) >= 300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12457.0,12530.0,12530.0,12530.0
mean,1594.564544,2.562816,111.382401,2.650838,6303.979357
std,1261.271296,1.080239,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


The second line filters the DataFrame data to include only those entries where the calculated average area per bedroom is greater than or equal to 300 square feet. This step is likely aimed at removing outliers or entries with unusually small area per bedroom. After filtering, it generates a new statistical summary of the filtered DataFrame.

In [35]:
data.shape

(12530, 7)

In [36]:
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [37]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()  # Indentation corrected here
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)

        st = np.std(subdf.price_per_sqft)

        gen_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
        df_output = pd.concat([df_output, gen_df], ignore_index=True)
    return df_output

data = remove_outliers_sqft(data)
data.describe()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10245.0,10301.0,10301.0,10301.0
mean,1508.440608,2.47428,91.286372,2.574896,5659.062876
std,880.694214,0.9815,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


The remove_outliers_sqft function defined thereafter aims to remove outliers from the DataFrame based on the price_per_sqft column. It calculates the mean (m) and standard deviation (st) of price_per_sqft for each location, then filters entries to keep only those within one standard deviation from the mean. The function returns a new DataFrame with outliers removed.

In [38]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < stats['mean']].index.values)
    return df.drop(exclude_indices, axis='index')


In [39]:
data=bhk_outlier_remover(data)

In [40]:
data.shape

(7361, 7)

In [41]:
data

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10292,other,2 BHK,1200.0,2.0,70.0,2,5833.333333
10293,other,1 BHK,1800.0,1.0,200.0,1,11111.111111
10296,other,2 BHK,1353.0,2.0,110.0,2,8130.081301
10297,other,1 Bedroom,812.0,1.0,26.0,1,3201.970443


## Data Preprocessing

In [42]:
data.drop(columns=['size','price_per_sqft'],inplace=True)

## Cleaned Data

In [43]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [44]:
data.to_csv("Cleaned_data.csv")

In [45]:
X=data.drop(columns=['price'])
y=data['price']

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

In [47]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [48]:
print(X_train.shape)
print(X_test.shape)

(5888, 4)
(1473, 4)


## Applying Linear Regression

In [49]:
# Define the column transformer with imputation
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False), ['location']),
    (SimpleImputer(strategy='median'), ['bath']),  # Impute missing values with median
    remainder='passthrough'
)

In [50]:
scaler = StandardScaler()

In [51]:
lr = LinearRegression()

In [52]:
pipe = make_pipeline(column_trans,scaler,lr)

In [53]:
data.isnull().sum()


location       0
total_sqft     0
bath          35
price          0
bhk            0
dtype: int64

In [54]:
pipe.fit(X_train,y_train)



In [55]:
y_pred_lr = pipe.predict(X_test)

In [56]:
r2_score(y_test,y_pred_lr)

0.823357393387876

## Applying Lasso

In [57]:
lasso = Lasso()

In [58]:
pipe = make_pipeline(column_trans,scaler, lasso)

In [59]:
pipe.fit(X_train,y_train)



In [60]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)

0.8128308951745891

## Applying Ridge

In [61]:
ridge = Ridge()

In [62]:
pipe = make_pipeline(column_trans,scaler,ridge)

In [63]:
pipe.fit(X_train,y_train)



In [64]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

0.8234146633312645

In [65]:
print("No Regularization: ",r2_score(y_test, y_pred_lr))
print("Lasso: ",r2_score(y_test,y_pred_lasso))
print("Ridge: ",r2_score(y_test,y_pred_ridge))

No Regularization:  0.823357393387876
Lasso:  0.8128308951745891
Ridge:  0.8234146633312645


In [66]:
import pickle

In [67]:
pickle.dump(pipe,open('RidgeModel.pkl','wb'))