In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [2]:
# import csv file
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Checking missing values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [4]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:

df['size'] = df['size'].str.replace("BHK", "")
df['size'].unique()


array(['2 ', '4 Bedroom', '3 ', '4 ', '6 Bedroom', '3 Bedroom', '1 ',
       '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom', '7 Bedroom', '5 ',
       '7 ', '6 ', '5 Bedroom', '11 ', '9 ', nan, '9 Bedroom', '27 ',
       '10 Bedroom', '11 Bedroom', '10 ', '19 ', '16 ', '43 Bedroom',
       '14 ', '8 ', '12 Bedroom', '13 ', '18 Bedroom'], dtype=object)

In [7]:
df['size'] = df['size'].str.replace("Bedroom", "")
df['size'].unique()

array(['2 ', '4 ', '3 ', '6 ', '1 ', '1 RK', '8 ', '7 ', '5 ', '11 ',
       '9 ', nan, '27 ', '10 ', '19 ', '16 ', '43 ', '14 ', '12 ', '13 ',
       '18 '], dtype=object)

In [8]:
df['size'] = df['size'].str.replace("RK", "")
df['size'].unique()

array(['2 ', '4 ', '3 ', '6 ', '1 ', '8 ', '7 ', '5 ', '11 ', '9 ', nan,
       '27 ', '10 ', '19 ', '16 ', '43 ', '14 ', '12 ', '13 ', '18 '],
      dtype=object)

In [9]:
from sklearn.impute import SimpleImputer

# Create an imputer object
imputer = SimpleImputer(strategy='median')  # Use 'mean' or 'most_frequent' for different strategies

# Fit and transform the DataFrame
df[['size', 'bath', 'balcony']] = imputer.fit_transform(df[['size', 'bath', 'balcony']])


In [10]:


# Create an imputer object with the "most_frequent" strategy for categorical variables
imputer = SimpleImputer(strategy='most_frequent')

# Specify the columns to impute
columns_to_impute = ['location', 'society']

# Fit and transform the DataFrame
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])


In [11]:
df.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [12]:
# Checking duplicate values
df.duplicated().sum()

530

In [13]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12790 entries, 0 to 13318
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     12790 non-null  object 
 1   availability  12790 non-null  object 
 2   location      12790 non-null  object 
 3   size          12790 non-null  float64
 4   society       12790 non-null  object 
 5   total_sqft    12790 non-null  object 
 6   bath          12790 non-null  float64
 7   balcony       12790 non-null  float64
 8   price         12790 non-null  float64
dtypes: float64(4), object(5)
memory usage: 999.2+ KB


In [15]:
m = df[['area_type','total_sqft']]

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
l=LabelEncoder()
for i in m:
    df[i]=l.fit_transform(df[i])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12790 entries, 0 to 13318
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     12790 non-null  int32  
 1   availability  12790 non-null  object 
 2   location      12790 non-null  object 
 3   size          12790 non-null  float64
 4   society       12790 non-null  object 
 5   total_sqft    12790 non-null  int32  
 6   bath          12790 non-null  float64
 7   balcony       12790 non-null  float64
 8   price         12790 non-null  float64
dtypes: float64(4), int32(2), object(3)
memory usage: 899.3+ KB


In [19]:
df['availability'].value_counts()

availability
Ready To Move    10171
18-May             292
18-Dec             284
18-Apr             269
18-Aug             187
                 ...  
16-Oct               1
15-Dec               1
15-Jun               1
16-Jul               1
14-Jul               1
Name: count, Length: 81, dtype: int64

In [20]:
df['location'].value_counts()

location
Whitefield                        524
Sarjapur  Road                    379
Electronic City                   287
Kanakpura Road                    249
Thanisandra                       229
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [21]:
df.drop(['availability','location','society'],axis=1,inplace=True)

In [22]:
df.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price
0,3,2.0,70,2.0,1.0,39.07
1,2,4.0,1288,5.0,3.0,120.0
2,0,3.0,514,2.0,3.0,62.0
3,3,3.0,602,3.0,1.0,95.0
4,3,2.0,239,2.0,1.0,51.0


In [23]:
# Divide target and feature variable
X = df.drop(['price'],axis=1)
y = df.price

In [24]:
X

Unnamed: 0,area_type,size,total_sqft,bath,balcony
0,3,2.0,70,2.0,1.0
1,2,4.0,1288,5.0,3.0
2,0,3.0,514,2.0,3.0
3,3,3.0,602,3.0,1.0
4,3,2.0,239,2.0,1.0
...,...,...,...,...,...
13314,3,3.0,803,3.0,3.0
13315,0,5.0,1499,4.0,0.0
13316,3,4.0,1535,5.0,2.0
13317,0,2.0,172,2.0,1.0


In [25]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train)
scaled_test_x=scaler.transform(X_test)

In [None]:
import seaborn as sns
sns.pairplot(df,hue="price")

<seaborn.axisgrid.PairGrid at 0x261c8d78890>

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


In [None]:
ln=LinearRegression()
ln.fit(scaled_train,y_train)
pn=ln.predict(scaled_test_x)
svr=SVR()
svr.fit(scaled_train,y_train)
pvr=svr.predict(scaled_test_x)
rfr= RandomForestRegressor()
rfr.fit(scaled_train,y_train)
prfr=rfr.predict(scaled_test_x)
gbr=GradientBoostingRegressor()
gbr.fit(scaled_train,y_train)
pgbr=gbr.predict(scaled_test_x)
xgb=XGBRegressor()
xgb.fit(scaled_train,y_train)
pxgb=xgb.predict(scaled_test_x)

In [None]:
y_pred = model.predict(scaled_test_x)


In [None]:
from sklearn.metrics import r2_score

# Define the data for the DataFrame
new = pd.DataFrame({
    "models": ["LN", "SVR", "RFReg", "GBR", "XGBR"],
    "R2_Score": [
        r2_score(y_test, pn),
        r2_score(y_test, pvr),
        r2_score(y_test, prfr),
        r2_score(y_test, pgbr),
        r2_score(y_test, pxgb)
    ]
})


In [None]:
new

In [None]:
sns.catplot(data=new,x="models",y="R2_Score",kind="bar")