# Bangalore_house_price_prediction

## Importing the necessary libraries

In [3]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')


## Reading the dataset

In [4]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [5]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [6]:
df.shape

(13320, 9)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [8]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


## Checking for null values

In [9]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

## Dropping the columns that are not so useful for our prediction

In [10]:
df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [11]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

## Checking for duplicate values


In [12]:
df.duplicated().sum()

882

## Dropping the duplicate rows

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df.duplicated().sum()

0

In [15]:
df['location'].value_counts()

location
Whitefield                        507
Sarjapur  Road                    364
Electronic City                   273
Thanisandra                       224
Kanakpura Road                    223
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [16]:
df.location.isnull().sum()

1

## Filling the missing value in location column by Whitefield                        

In [17]:
df['location'] = df['location'].fillna("Whitefield")

In [18]:
df['size'].value_counts()

size
2 BHK         4768
3 BHK         3988
4 Bedroom      819
4 BHK          562
3 Bedroom      527
1 BHK          507
2 Bedroom      303
5 Bedroom      288
6 Bedroom      191
1 Bedroom      100
8 Bedroom       84
7 Bedroom       82
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            12
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

## Filling the missing values in the size column by 2BHK 

In [19]:
df['size'] = df['size'].fillna("2 BHK")

In [20]:
df['size'].isnull().sum()

0

## Filling the missing values in the bathroom column by it's median

In [21]:
df['bath'] = df['bath'].fillna(df['bath'].median())

In [22]:
df['bath'].isnull().sum()

0

In [23]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

### No more missing values present


## Since the size column contains discrepancy so we will filter the column and store it in another column

In [24]:
df['BHK'] = df['size'].str.split().str.get(0).astype('int')

In [25]:
df

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2


## Dropping the size column as it is no longer needed

In [26]:
df.drop(columns=['size'],inplace=True)

In [27]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

### since this column contains range, which is not desired sao we will take the mean value of the range

In [28]:
def convertrange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return ((float(temp[0])+float(temp[1]))/2)
    try:
        return float(x)
    except:
        return None

In [29]:
df['total_sqft'] = df['total_sqft'].apply(convertrange)

In [30]:
df

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.00,4
2,Uttarahalli,1440.0,2.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,95.00,3
4,Kothanur,1200.0,2.0,51.00,2
...,...,...,...,...,...
13314,Green Glen Layout,1715.0,3.0,112.00,3
13315,Whitefield,3453.0,4.0,231.00,5
13316,Richards Town,3600.0,5.0,400.00,4
13317,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12438 entries, 0 to 13318
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    12438 non-null  object 
 1   total_sqft  12392 non-null  float64
 2   bath        12438 non-null  float64
 3   price       12438 non-null  float64
 4   BHK         12438 non-null  int32  
dtypes: float64(3), int32(1), object(1)
memory usage: 534.4+ KB


## Creating a new column for price per square feet

In [32]:
df['price_per_sqft'] = (df['price']*100000)/ df ['total_sqft']

In [33]:
df['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13314     6530.612245
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
Name: price_per_sqft, Length: 12438, dtype: float64

In [34]:
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,12392.0,12438.0,12438.0,12438.0,12392.0
mean,1576.376848,2.715549,115.408159,2.826821,8099.928
std,1273.831772,1.367021,153.069125,1.323065,110147.9
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4306.941
50%,1288.5,2.0,74.0,3.0,5500.0
75%,1700.0,3.0,125.0,3.0,7446.809
max,52272.0,40.0,3600.0,43.0,12000000.0


In [35]:
df['location'].value_counts()

location
Whitefield                        508
Sarjapur  Road                    364
Electronic City                   273
Thanisandra                       224
Kanakpura Road                    223
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [36]:
df['location'] = df['location'].apply(lambda x: x.strip())


In [37]:
location_count = df['location'].value_counts()

In [38]:
location_count_less10 = location_count[location_count<=10]
location_count_less10

location
Sadashiva Nagar                          10
Gunjur Palya                             10
BEML Layout                              10
Ganga Nagar                              10
ITPL                                     10
                                         ..
Ring Road Nagarbhavi                      1
Mango Garden Layout                       1
Vijayabank bank layout                    1
Bidere Agarahara, Behind Safal market     1
Abshot Layout                             1
Name: count, Length: 1063, dtype: int64

In [39]:
df['location'] = df['location'].apply(lambda x : 'other' if x in location_count_less10 else x)

In [40]:
df['location'].value_counts()

location
other                   2927
Whitefield               509
Sarjapur  Road           364
Electronic City          275
Thanisandra              226
                        ... 
Banashankari Stage V      11
Doddakallasandra          11
Kodigehalli               11
NGR Layout                11
LB Shastri Nagar          11
Name: count, Length: 232, dtype: int64

In [41]:
df = df[((df['total_sqft']/df['BHK'])>=300)]

## Removing outliers

In [42]:
def remove_outlier_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft>(m-st))& (subdf.price_per_sqft<=(m+st))]
        df_output = pd.concat([df_output,gen_df],ignore_index=True)
    return df_output

df = remove_outlier_sqft(df)

In [43]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])

    # Group by location
    for location, location_df in df.groupby('location'):
        bhk_stats = {}

        # Group by BHK within each location
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                "mean": np.mean(bhk_df['price_per_sqft']),
                "std": np.std(bhk_df['price_per_sqft']),
                "count": bhk_df.shape[0]
            }

        # Identify outliers
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(
                    exclude_indices,
                    bhk_df[bhk_df['price_per_sqft'] < (stats['mean'])].index.values
                )

    # Drop outliers
    return df.drop(exclude_indices, axis='index')

In [44]:
df.shape

(9606, 6)

In [45]:
df

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft
0,1st Block Jayanagar,2850.0,4.0,428.00,4,15017.543860
1,1st Block Jayanagar,1630.0,3.0,194.00,3,11901.840491
2,1st Block Jayanagar,1875.0,2.0,235.00,3,12533.333333
3,1st Block Jayanagar,1200.0,2.0,130.00,3,10833.333333
4,1st Block Jayanagar,1235.0,2.0,148.00,2,11983.805668
...,...,...,...,...,...,...
9601,other,1800.0,1.0,200.00,1,11111.111111
9602,other,1095.0,2.0,57.00,2,5205.479452
9603,other,1440.0,2.0,63.93,3,4439.583333
9604,other,1075.0,2.0,48.00,2,4465.116279


## removing the unnecessary columns

In [46]:
df.drop(columns=['price_per_sqft'],inplace=True)

In [47]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


## Saving the cleaned data in .csv format

In [48]:
df.to_csv("cleaned_house_data.csv")

In [74]:
final=pd.read_csv('cleaned_house_data.csv')
final.shape

(9606, 6)

## X, Y split

In [49]:
x = df.drop(columns=['price'])
y = df['price']

## Importing necessary libraries for model training

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Train test split

In [51]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [52]:
x_train.shape,x_test.shape

((7684, 4), (1922, 4))

# Encoding the location column

In [53]:
colmn_trans = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), ['location']),  # OneHotEncode 'location' column
    remainder='passthrough'  # Keep remaining columns unchanged
)


## Applying Linear Regression

In [54]:
# Define the pipeline with StandardScaler(with_mean=False)
pipe = make_pipeline(colmn_trans, StandardScaler(with_mean=False), LinearRegression())


In [55]:
pipe.fit(x_train,y_train)

In [56]:
y_pred_lr = pipe.predict(x_test)

In [57]:
r2_score(y_test,y_pred_lr)

0.8177086585907432

## Applying Lasso Regression

In [69]:
lasso = Lasso()

In [70]:
pipe = make_pipeline(colmn_trans,StandardScaler(with_mean=False),lasso)

In [71]:
pipe.fit(x_train,y_train)

In [72]:
y_pred_lasso = pipe.predict(x_test)

In [73]:
r2_score(y_test,y_pred_lasso)

0.8030881807245223

## Applying Ridge Regression

In [63]:
ridge = Ridge()

In [64]:
pipe = make_pipeline(colmn_trans,StandardScaler(with_mean=False),ridge)

In [65]:
pipe.fit(x_train,y_train)

In [66]:
y_pred_ridge = pipe.predict(x_test)
r2_score(y_test,y_pred_ridge)

0.8177203619415808

In [67]:
print("No regularization:", r2_score(y_test,y_pred_lr))
print("Lasso:", r2_score(y_test,y_pred_lasso))
print("Ridge:", r2_score(y_test,y_pred_ridge))

No regularization: 0.8177086585907432
Lasso: 0.8030881807245223
Ridge: 0.8177203619415808


# Stroring Ridge Regression model using pickle

In [68]:
import pickle

pickle.dump(pipe,open('RidgeModel.pkl','wb'))