In [1]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
import sklearn

## Data Preprocessing!

In [2]:
home = pd.read_csv("house_price.csv")
home.head()

Unnamed: 0,area_type,location,society,size,total_sqft,sqft,bath,balcony,possession date,price
0,Super built-up Area,Electronic City Phase II,Coomee,2 BHK,1056,1056.0,2.0,1.0,44192,39.07
1,Plot Area,Chikka Tirupathi,Theanmp,4 Bedroom,2600,2600.0,5.0,3.0,Ready to move,120.0
2,Built-up Area,Uttarahalli,,3 BHK,1440,1440.0,2.0,3.0,Ready to move,62.0
3,Super built-up Area,Lingadheeranahalli,Soiewre,3 BHK,1521,1521.0,3.0,1.0,Ready to move,95.0
4,Super built-up Area,Kothanur,,2 BHK,1200,1200.0,2.0,1.0,Ready to move,51.0


In [3]:
## Dropping irrelevant columns
home.drop(columns=['possession date','area_type', 'total_sqft', 'society'],inplace = True)

In [4]:
## Replacing null balcony/bath values with 0
home['balcony'].fillna(0,inplace=True)
home['bath'].fillna(0,inplace=True)
home.shape

(13314, 6)

In [5]:
## Checking null values in %
round(100*(home.isnull().sum()/len(home.index)),2)

location    0.00
size        0.12
sqft        0.00
bath        0.00
balcony     0.00
price       0.00
dtype: float64

In [6]:
#removing NaN values from the dataset
home.dropna(inplace =True)
home.reset_index(drop= True, inplace =True)
home.shape

(13298, 6)

In [7]:
home['bhk'] = home['size'].str.split().str[0]
home['bhk'].dropna(inplace = True)
home['bhk'] = home['bhk'].astype('int')
home.drop(columns=['size'],inplace = True)
home.head()

Unnamed: 0,location,sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Kothanur,1200.0,2.0,1.0,51.0,2


### Cleaning and feature engineering

In [8]:
##Cleaning: removing invalid data entry
## e.g.: The total sqft divided by the number of bhk should always be more than 200

home = home[~(home.sqft/home.bhk<200)]
home.shape

(12962, 6)

In [9]:
home = home.drop(home[home['bath']>6].index)
home = home.drop(home[home['bhk']>7.0].index)
home.shape

(12819, 6)

In [10]:
## Feature Engineering step
home['price_per_sqft'] = home['price']*100000/home['sqft']
home.head()

Unnamed: 0,location,sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


In [11]:
home['price_per_sqft'].describe()

count     12819.000000
mean       6451.443980
std        4254.389717
min           2.257423
25%        4239.236182
50%        5350.207469
75%        7076.923077
max      176470.588235
Name: price_per_sqft, dtype: float64

In [12]:
## taking only the values with 1st Standard deviation values.
## as per Normal Distribution, 95% of our data lies within 1st Standard Deviation as per the location

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
home = remove_pps_outliers(home)
home.shape

(9623, 7)

In [13]:
## checking the dataset with highest location data provided
## Removing the locations with less than frequency 10

home.location = home.location.str.strip()
location_stats = home['location'].value_counts(ascending=False)
location_stats

location
Whitefield               486
Sarjapur  Road           314
Electronic City          285
Kanakpura Road           203
Yelahanka                185
                        ... 
Dena Bank Colony           1
Postal Colony              1
Daadys Gaarden Layout      1
Prashanth Nagar            1
Chokkanahalli              1
Name: count, Length: 779, dtype: int64

In [14]:
## cretaing a Series of all the location having less than 10 entries against its  
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
NRI Layout               10
Prithvi Layout           10
Dasanapura               10
Judicial Layout          10
Giri Nagar               10
                         ..
Dena Bank Colony          1
Postal Colony             1
Daadys Gaarden Layout     1
Prashanth Nagar           1
Chokkanahalli             1
Name: count, Length: 595, dtype: int64

In [15]:
## using lambda function to naming 'location_stats_less_than_10' as 'other' and then removing it

home.location = home.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
home = home[home.location != 'other']

In [16]:
## performing One hot encoding on the Categorical values
## 1st step. create dummies
dummies = pd.get_dummies(home.location)
dummies.head(3)


#import category_encoders as ce

# Assuming your DataFrame is called 'df'
#encoder = ce.TargetEncoder()
#home['location_encoded'] = encoder.fit_transform(home['location'], home['price'])
#home['society_encoded'] = encoder.fit_transform(home['society'], home['price'])
#home.shape

Unnamed: 0,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,Abbigere,Akshaya Nagar,Ambalipura,...,Varthur,Vasanthapura,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
## adding the dummies dataframe to our main DataFrame
#one-hot
home = pd.concat([home,dummies],axis='columns')

## removing 'location' as we have already created the dummies
home1 = home.drop(columns=['location','price_per_sqft'],axis = 1)
home1

Unnamed: 0,sqft,bath,balcony,price,bhk,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,...,Varthur,Vasanthapura,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur
3,1250.0,2.0,3.0,44.00,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1250.0,2.0,2.0,40.00,2,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1200.0,2.0,2.0,83.00,2,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,1170.0,2.0,2.0,40.00,2,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,1425.0,2.0,2.0,65.00,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9612,1676.0,3.0,0.0,92.13,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
9613,2503.0,3.0,3.0,138.00,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
9614,1855.0,3.0,3.0,135.00,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
9615,1876.0,3.0,3.0,160.00,3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [18]:
home1.reset_index(drop = True)
home1.shape

(7761, 189)

In [None]:
#home1.to_csv('house_price_cleaned-onehot_enc.csv', index=False)

In [None]:
## finding correlation values within the dataset

corr = home1.corr()
corr

### Machine Learning part

In [19]:
#X = home1[['bhk', 'sqft', 'bath', 'balcony', 'location_encoded', 'society_encoded']]
X = home1.drop('price',axis = 1)
X.head()

Unnamed: 0,sqft,bath,balcony,bhk,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Varthur,Vasanthapura,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur
3,1250.0,2.0,3.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1250.0,2.0,2.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1200.0,2.0,2.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,1170.0,2.0,2.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,1425.0,2.0,2.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
y = home1['price']
y.head()

3    44.0
4    40.0
5    83.0
6    40.0
7    65.0
Name: price, dtype: float64

In [21]:
home1.corr()

Unnamed: 0,sqft,bath,balcony,price,bhk,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,...,Varthur,Vasanthapura,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur
sqft,1.000000,0.072266,0.004574,0.093758,0.071596,0.001560,-0.001991,-0.001583,0.000095,-0.001647,...,-0.003078,-0.002223,-0.001983,-0.001184,-0.001711,0.004254,-0.001159,-0.002103,-0.004251,-0.000963
bath,0.072266,1.000000,0.210220,0.595047,0.784333,0.018810,0.078347,-0.012786,-0.000050,-0.002477,...,-0.017516,-0.013464,-0.001093,0.007571,-0.017525,0.041758,-0.018931,-0.007965,-0.044405,-0.004666
balcony,0.004574,0.210220,1.000000,0.095418,0.185534,-0.003823,0.027991,0.005997,0.014290,0.004646,...,0.016726,-0.007978,-0.016266,-0.026914,-0.032599,-0.000288,0.027247,0.002972,-0.008880,0.007787
price,0.093758,0.595047,0.095418,1.000000,0.554792,0.041237,0.078195,-0.016572,0.018097,0.012098,...,-0.031803,-0.022747,-0.014905,0.008939,-0.014421,0.040282,-0.003895,-0.024476,-0.030039,0.011437
bhk,0.071596,0.784333,0.185534,0.554792,1.000000,0.009979,0.106431,-0.008420,-0.003119,0.008746,...,-0.017238,-0.021247,-0.001573,0.000760,-0.030972,0.013224,-0.007314,-0.010337,-0.045988,-0.013915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Whitefield,0.004254,0.041758,-0.000288,0.040282,0.013224,-0.013463,-0.010987,-0.016368,-0.011374,-0.029827,...,-0.020390,-0.009737,-0.014985,-0.017145,-0.018130,1.000000,-0.010587,-0.040389,-0.017396,-0.025532
Yelachenahalli,-0.001159,-0.018931,0.027247,-0.003895,-0.007314,-0.002134,-0.001741,-0.002594,-0.001803,-0.004727,...,-0.003231,-0.001543,-0.002375,-0.002717,-0.002873,-0.010587,1.000000,-0.006401,-0.002757,-0.004046
Yelahanka,-0.002103,-0.007965,0.002972,-0.024476,-0.010337,-0.008140,-0.006643,-0.009896,-0.006877,-0.018033,...,-0.012327,-0.005887,-0.009060,-0.010366,-0.010961,-0.040389,-0.006401,1.000000,-0.010518,-0.015436
Yelahanka New Town,-0.004251,-0.044405,-0.008880,-0.030039,-0.045988,-0.003506,-0.002861,-0.004262,-0.002962,-0.007767,...,-0.005310,-0.002536,-0.003902,-0.004465,-0.004721,-0.017396,-0.002757,-0.010518,1.000000,-0.006649


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
from sklearn.linear_model import LinearRegression
m2 = LinearRegression().fit(X_train, y_train)
pred = m2.predict(X_test)
df1 = X_test.copy()
df1['price'] = y_test
df1['prediction'] = pred
df1.head()

Unnamed: 0,sqft,bath,balcony,bhk,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur,price,prediction
5655,812.0,2.0,1.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,38.25,57.028512
1412,1605.0,3.0,2.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,85.0,107.448446
6338,1350.0,3.0,0.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,60.0,122.652737
1489,1262.0,2.0,2.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,47.0,60.9412
5671,1432.0,2.0,2.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,61.11,75.930312


In [24]:
from sklearn.metrics import mean_squared_error,r2_score
acc = mean_squared_error(pred,y_test)
acc

1297.0188199207207

In [25]:
df1['error'] = df1['price'] - df1['prediction']
df1['sqerror'] = df1['error']** 2
df1.head()

Unnamed: 0,sqft,bath,balcony,bhk,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yeshwanthpur,price,prediction,error,sqerror
5655,812.0,2.0,1.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,38.25,57.028512,-18.778512,352.632519
1412,1605.0,3.0,2.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,85.0,107.448446,-22.448446,503.932737
6338,1350.0,3.0,0.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,60.0,122.652737,-62.652737,3925.365441
1489,1262.0,2.0,2.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,47.0,60.9412,-13.9412,194.357059
5671,1432.0,2.0,2.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,61.11,75.930312,-14.820312,219.641654


In [26]:
# Step 1: Calculate the mean of the squared errors
mean_sqerror = df1['sqerror'].mean()
print("mean_sqerror:", mean_sqerror)
# Step 2: Take the square root of the mean squared error to obtain RMSE
rmse = mean_sqerror ** 0.5

print("Root Mean Squared Error (RMSE):", rmse)

mean_sqerror: 1297.0188199207207
Root Mean Squared Error (RMSE): 36.01414749679243


### Standardize features by removing the mean and scaling to unit variance



In [None]:
home1

In [None]:
## Dividing our dataset to Independent and Dependent Variables

X = home1.drop('price',axis = 1).values ## Independent Variables
y = home1.price.values ## Dependent Variables
X

In [None]:
## adding a new axis
y = y[:,np.newaxis]
y

In [None]:
## preprocessing the data values to StandardScaler
sc = preprocessing.StandardScaler()
X1 = sc.fit_transform(X)


In [None]:
## Standardize a dataset along any axis

## Center to the mean and component wise scale to unit variance.

Std_x1 = preprocessing.scale(X)

## Machine Learning Model


In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.model_selection import cross_validate as CV

In [None]:
## using Cross Validation of 5 and scoring of Negative mean sqaured error

cross1 = cross_val_score(lr,Std_x1,y,cv=5,scoring='neg_mean_squared_error')
print(cross1.mean())

In [None]:
# from the model selection module import train_test_split for the ML training and testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1,y,test_size=0.3,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
acc = mean_squared_error(y_pred,y_test)
rscore = r2_score(y_pred,y_test)
rmse = acc ** 0.5
print("acc:", acc, "rmse:", rmse, "rscore:", rscore)
