In [30]:
import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt
import plotly.express as px 
import plotly.graph_objects as go
%matplotlib inline


In [2]:
#importing the dataset
df =  pd.read_csv('delhi_house_price.csv' )

In [3]:
df.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


## Data Exploring

In [4]:
df.shape

(1259, 11)

In [5]:
# checking Null Values
df.isnull().sum()

Area             0
BHK              0
Bathroom         2
Furnishing       5
Locality         0
Parking         33
Price            0
Status           0
Transaction      0
Type             5
Per_Sqft       241
dtype: int64

In [6]:
#creating  price_per_sqft columns to understand the data
df['price_per_sqft'] = df['Price']/df['Area']
df['price_per_sqft'] = df['price_per_sqft'].astype(int)

In [7]:
df.head(3)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft,price_per_sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,,8125
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0,6666
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0,16315


In [8]:
# Droping null values (as we cant even out average values due to not fitting correctly)
df = df[df['Furnishing'].notna()]
df = df[df['Bathroom'].notna()]
df = df[df['Parking'].notna()]
df = df[df['Per_Sqft'].notna()]

In [9]:
df.isnull().sum()

Area              0
BHK               0
Bathroom          0
Furnishing        0
Locality          0
Parking           0
Price             0
Status            0
Transaction       0
Type              0
Per_Sqft          0
price_per_sqft    0
dtype: int64

In [10]:
# new dataframe without any null values
df1 = df

In [11]:
df1.shape

(1005, 12)

## Visulizing Data's Features


In [12]:
df1.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft,price_per_sqft
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0,6666
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0,16315
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0,7000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0,9538
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,15500000,Ready_to_move,New_Property,Builder_Floor,6667.0,11923


In [13]:
# Removing column: per_sqft
# Re-Shifting Price column
df2 = df1.drop(columns=['Per_Sqft','price_per_sqft'])
col9 = df2.pop('Price')
df2.insert(9,'Price', col9)
df2.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,Ready_to_move,New_Property,Apartment,5000000
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,Ready_to_move,Resale,Apartment,15500000
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,Resale,Builder_Floor,4200000
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,Ready_to_move,New_Property,Builder_Floor,6200000
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,Ready_to_move,New_Property,Builder_Floor,15500000


In [14]:
## removing outliers and making data frame distributed normally

def remove_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df2.groupby('Locality'):
        m = np.mean(subdf.Price)
        st = np.std(subdf.Price)
        red_df = subdf[(subdf.Price>(m-st)) &(subdf.Price<=(m+st))]
        df_out = pd.concat([df_out,red_df],ignore_index = True)
    return df_out
df3 = remove_outliers(df2)
df3.shape

(515, 10)

## Visulising Features of Dataset 

In [15]:
#Area and Price relation
fig = px.histogram(df3,x='Area',y='Price',
                    title='Area vs Price',
                    opacity=0.8,
                    color_discrete_sequence=['indianred'],
                    labels={'Area':'Area in Sqft', 'Price':'Price'})
fig.update_layout(barmode='group', bargap=0.30,bargroupgap=0.0,height=420,width=1080)
fig.show()


In [16]:
# House Types
fig = px.histogram(df3,x='Type',
                    title='Different House Types',
                    opacity=0.8,
                    color = 'Type',
                    #color_discrete_sequence=['indianred'],
                    labels={'Type':'Type of House'})
fig.update_layout(barmode='group', bargap=0.10,bargroupgap=0.0,height=520,width=600)
fig.show()

In [17]:
# House condition
fig = px.histogram(df3,x='Furnishing',
                    title='House condition',
                    opacity=0.8,
                    color = 'Furnishing')
                    #color_discrete_sequence=['indianred']
fig.update_layout(barmode='group', bargap=0.10,bargroupgap=0.2,height=520,width=600)
fig.show()

In [18]:
# Price Share of House types
fig = px.pie(df3, values='Price', names='Furnishing',
                title='House Condition vs Price Share')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(height=420,width=600)
fig.show()

In [19]:
# Different house types
fig = px.histogram(df3,x='Type',
                    title='House Type',
                    opacity=0.7,
                    color = 'Type')
                    #color_discrete_sequence=['indianred']
fig.update_layout(barmode='group', bargap=0.10,bargroupgap=0.2,height=520,width=600)
fig.show()

In [20]:
# Price Share of House Type
fig = px.pie(df3, values='Price', names='Type',
                title='House Type vs Price Share')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(height=420,width=600)
fig.show()

In [21]:
#House shifting Status
fig = px.histogram(df3,x='Status',
                    title='House Shifting Status',
                    opacity=1,
                    color="Status")
fig.update_layout(barmode='group',height=520,width=600)
fig.show()

In [22]:
fig = px.histogram(df3,x='Transaction',
                    title='House Ownership Vs House Type',
                    opacity=0.8,
                    color = 'Type',
                    labels={'Transaction':'Ownership'})
                    #color_discrete_sequence=['indianred']
fig.update_layout(barmode='group', bargap=0.20,bargroupgap=0.0,height=520,width=600)
fig.show()

In [23]:
#Parking slots counts
fig = px.histogram(df3,x='Parking',
                    title='Parking Count',
                    color_discrete_sequence=['mediumaquamarine'],
                    labels={'Parking':'Number of Parkings'})
fig.update_layout(bargap=0.20,height=420,width=1080)
fig.show()

In [28]:
#Parking numbers vs price
fig = px.scatter(df3,x='Parking',y='Price',
                title='Price distribution by Number of Parkings',
                labels={'Parking':'Number of Parkings'},
                color='Parking')
fig.update_layout(height=520,width=1080)
fig.show()

## Finding and Removing outliers by business understandings

In [32]:
#Checking for weather house price of 2BHK > 3BHK: Locality wise
def plot_scatter(df3,Locality):
    bhk2 = df3[(df3.Locality == Locality) &(df3.BHK ==2)]
    bhk3 = df3[(df3.Locality == Locality) &(df3.BHK ==3)]
    fig = go.Figure() 
    fig.add_trace(go.Scatter(x=bhk2.Area,y=bhk2.Price,
                    mode='markers',
                    name='2 BHK'))
    fig.add_trace(go.Scatter(x=bhk3.Area,y=bhk3.Price,
                    mode='markers',
                    name='3 BHK'))
    fig.update_layout(title='2BHK vs 3BHK Price', xaxis_title='Area in Sqft', yaxis_title='Price',height=420,width=680)
    fig.show()

plot_scatter(df3,'Alaknanda')

In [33]:
# Droping the data where Number of Bathrooms are greater than Number of BHK's (bhk+2)
df3.drop(df3[df3.BHK+2 < df3.Bathroom].index, inplace= True)

In [34]:
# As we know, generally no house can be made in an area < 100 sqft 
df3.drop(df3[df3['Area'] < 100].index,inplace=True)

In [35]:
# Area requried for 2bhk or more than that must be > 300sqft 
df3.drop(df3[(df3['Area'] < 300) & (df3['BHK'] >=2)].index, inplace=True)

In [36]:
df3.shape

(499, 10)

In [37]:
writer = pd.ExcelWriter('Cleaned data_Delhi_hp_{}.xlsx'.format(pd.datetime.today().strftime('%d%m%y- %H-%M'))) 
df3.to_excel(writer,'Sheet1')
writer.save()


The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime module instead.



# Model Building for price predictions

In [38]:
df_final = pd.read_excel('Cleaned data_Delhi_hp_270521- 23-38.xlsx', index_col=0)

In [39]:
df_final.shape

(499, 10)

In [40]:
df_final.head(5)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2,Semi-Furnished,"Abul Fazal Enclave Part-II, Okhla",1,Ready_to_move,Resale,Builder_Floor,3800000
2,1400.0,3,2,Unfurnished,Alaknanda,1,Ready_to_move,Resale,Apartment,19000000
3,1100.0,2,2,Unfurnished,Alaknanda,2,Ready_to_move,Resale,Apartment,15000000
4,1200.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,17500000
5,1400.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,14000000


In [41]:
# Due to more number of Localities i am making a general locality called OTHER (for localities occuring 1 times in data)
df_loc = df_final.Locality.value_counts()
loc_lessthan2 = df_loc[df_loc==1]
df_final.Locality = df_final.Locality.apply(lambda x: 'other' if x in loc_lessthan2 else x)


In [42]:
df_final.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price
1,750.0,2,2,Semi-Furnished,other,1,Ready_to_move,Resale,Builder_Floor,3800000
2,1400.0,3,2,Unfurnished,Alaknanda,1,Ready_to_move,Resale,Apartment,19000000
3,1100.0,2,2,Unfurnished,Alaknanda,2,Ready_to_move,Resale,Apartment,15000000
4,1200.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,17500000
5,1400.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,14000000


### Using One Hot Enchoding for handeling the 'text dtype' Columnes


In [43]:
#one hot enchoding for Locality column
dummies_df1 =  pd.get_dummies(df_final.Locality)
df_final1 = pd.concat([df_final,dummies_df1],axis='columns')


In [44]:
#one hot enchoding for Furnishing column
dummies_df2 =  pd.get_dummies(df_final.Furnishing)
df_final1 = pd.concat([df_final1,dummies_df2],axis='columns')

In [45]:
#one hot enchoding for Status column
dummies_df3 =  pd.get_dummies(df_final.Status)
df_final1 = pd.concat([df_final1,dummies_df3],axis='columns')

In [46]:
#one hot enchoding for Transaction column
dummies_df4 =  pd.get_dummies(df_final.Transaction)
df_final1 = pd.concat([df_final1,dummies_df4],axis='columns')

In [47]:
#one hot enchoding for Type column
dummies_df5 =  pd.get_dummies(df_final.Type)
df_final1 = pd.concat([df_final1,dummies_df5],axis='columns')

In [48]:
df_final1.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Status,Transaction,Type,Price,...,other,Furnished,Semi-Furnished,Unfurnished,Almost_ready,Ready_to_move,New_Property,Resale,Apartment,Builder_Floor
1,750.0,2,2,Semi-Furnished,other,1,Ready_to_move,Resale,Builder_Floor,3800000,...,1,0,1,0,0,1,0,1,0,1
2,1400.0,3,2,Unfurnished,Alaknanda,1,Ready_to_move,Resale,Apartment,19000000,...,0,0,0,1,0,1,0,1,1,0
3,1100.0,2,2,Unfurnished,Alaknanda,2,Ready_to_move,Resale,Apartment,15000000,...,0,0,0,1,0,1,0,1,1,0
4,1200.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,17500000,...,0,0,1,0,0,1,0,1,1,0
5,1400.0,2,2,Semi-Furnished,Alaknanda,2,Ready_to_move,Resale,Apartment,14000000,...,0,0,1,0,0,1,0,1,1,0


In [None]:
# Droping useless columns
df_final1.drop(columns=['Furnishing','Locality','Status','Transaction','Type'],inplace=True)

In [52]:
df_final1.head()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Alaknanda,"Andheria Mor, Mehrauli","Arjun Nagar, Safdarjung Enclave",Budh Vihar Phase 1,Chhattarpur,...,other,Furnished,Semi-Furnished,Unfurnished,Almost_ready,Ready_to_move,New_Property,Resale,Apartment,Builder_Floor
1,750.0,2,2,1,3800000,0,0,0,0,0,...,1,0,1,0,0,1,0,1,0,1
2,1400.0,3,2,1,19000000,1,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
3,1100.0,2,2,2,15000000,1,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
4,1200.0,2,2,2,17500000,1,0,0,0,0,...,0,0,1,0,0,1,0,1,1,0
5,1400.0,2,2,2,14000000,1,0,0,0,0,...,0,0,1,0,0,1,0,1,1,0


# Model Tranining

In [53]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

model = LinearRegression()

In [64]:
df3.corr()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price
Area,1.0,0.377792,0.451953,-0.022128,0.478119
BHK,0.377792,1.0,0.781982,-0.154927,0.606396
Bathroom,0.451953,0.781982,1.0,-0.092679,0.762738
Parking,-0.022128,-0.154927,-0.092679,1.0,-0.013329
Price,0.478119,0.606396,0.762738,-0.013329,1.0


In [65]:
# Taking the Features for our training 
x = df_final1.drop(columns=['Price','Parking'])
y= df_final1['Price']

In [73]:
# spliting the data set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.35, random_state=0)

In [74]:
# Fitting the Liner Regression model
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.7808945114534891

In [75]:
### Creating a prediction function for making predictions

In [76]:
x.columns

Index(['Area', 'BHK', 'Bathroom', 'Alaknanda', 'Andheria Mor, Mehrauli',
       'Arjun Nagar, Safdarjung Enclave', 'Budh Vihar Phase 1', 'Chhattarpur',
       'Chhattarpur Enclave Phase2', 'Chittaranjan Park',
       'Common Wealth Games Village, Commonwealth Games Village 2010',
       'Commonwealth Games Village 2010', 'DDA Flats Block A, Dilshad Garden',
       'DDA Lig Flats, Narela',
       'DLF Capital Greens, New Moti Nagar, Kirti Nagar',
       'Dev Nagar, Karol Bagh', 'Dilshad Colony, Dilshad Garden',
       'Dilshad Garden', 'Dwarka Mor',
       'Friends Colony East, New Friends Colony', 'Geeta Colony',
       'Geetanjali Enclave, Malviya Nagar', 'Godrej South Estate, Okhla',
       'Godrej South Estate, Okhla Phase 1', 'Greater Kailash 1', 'Hauz Khas',
       'Hauz Khas Enclave, Hauz Khas',
       'J R Designers Floors, Rohini Sector 24',
       'Kailash Colony, Greater Kailash', 'Karol Bagh', 'Lajpat Nagar 2',
       'Lajpat Nagar 3', 'Laxmi Nagar',
       'MTNL Employees H

In [78]:
# Prediction Fucntion ( we can do the prediction without worrying about Hot enchoding)
def prediction(Locality,Furnishing,Status,Transaction,Type,Area_total,BHK,Bathroom):
    Locality_index = np.where(x.columns== Locality)[0][0]
    Furnishing_index = np.where(x.columns== Furnishing)[0][0]
    Status_index = np.where(x.columns== Status)[0][0]
    Transaction_index = np.where(x.columns== Transaction)[0][0]
    Type_index = np.where(x.columns== Type)[0][0]

    n = np.zeros(len(x.columns))
    n[0]=Area_total
    n[1]=BHK
    n[2]=Bathroom
    if Locality_index >=0:
        n[Locality_index]=1
    if Furnishing_index>=0:
        n[Furnishing_index]=1
    if Status_index >=0:
        n[Status_index]=1
    if Transaction_index >=0:
        n[Transaction_index]=1
    if Type_index >=0:
        n[Type_index]=1
    return model.predict([n])[0]
    

## Predictions

In [83]:
df3.iloc[2,:]

Area                    1100
BHK                        2
Bathroom                   2
Furnishing       Unfurnished
Locality           Alaknanda
Parking                    2
Status         Ready_to_move
Transaction           Resale
Type               Apartment
Price               15000000
Name: 3, dtype: object

In [80]:
Price_prediction = prediction('Alaknanda','Unfurnished','Ready_to_move','Resale','Apartment',1100,2,2)
print("The predicted price is",Price_prediction ,"lakh")



The predicted price is 15270341.712971712 lakh


# Thank You

    credits: {Data Set: Kaggle , Project Made by: Kumar Shivam}