In [1]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
import sklearn

## Data Preprocessing!

In [2]:
home = pd.read_csv("house_price.csv")
home.head()

Unnamed: 0,area_type,location,society,size,total_sqft,sqft,bath,balcony,possession date,price
0,Super built-up Area,Electronic City Phase II,Coomee,2 BHK,1056,1056.0,2.0,1.0,44192,39.07
1,Plot Area,Chikka Tirupathi,Theanmp,4 Bedroom,2600,2600.0,5.0,3.0,Ready to move,120.0
2,Built-up Area,Uttarahalli,,3 BHK,1440,1440.0,2.0,3.0,Ready to move,62.0
3,Super built-up Area,Lingadheeranahalli,Soiewre,3 BHK,1521,1521.0,3.0,1.0,Ready to move,95.0
4,Super built-up Area,Kothanur,,2 BHK,1200,1200.0,2.0,1.0,Ready to move,51.0


In [3]:
## Dropping irrelevant columns
home.drop(columns=['possession date','area_type', 'total_sqft'],inplace = True)

In [4]:
## Replacing null balcony/bath values with 0
home['balcony'].fillna(0,inplace=True)
home['bath'].fillna(0,inplace=True)
home.shape

(13314, 7)

In [5]:
## Checking null values in %
round(100*(home.isnull().sum()/len(home.index)),2)

location     0.00
society     41.29
size         0.12
sqft         0.00
bath         0.00
balcony      0.00
price        0.00
dtype: float64

In [6]:
#removing NaN values from the dataset
home.dropna(inplace =True)
home.reset_index(drop= True, inplace =True)
home.shape

(7804, 7)

In [7]:
home['bhk'] = home['size'].str.split().str[0]
home['bhk'].dropna(inplace = True)
home['bhk'] = home['bhk'].astype('int')
home.drop(columns=['size'],inplace = True)
home.head()

Unnamed: 0,location,society,sqft,bath,balcony,price,bhk
0,Electronic City Phase II,Coomee,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,120.0,4
2,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,95.0,3
3,Whitefield,DuenaTa,1170.0,2.0,1.0,38.0,2
4,Old Airport Road,Jaades,2732.0,4.0,0.0,204.0,4


### Cleaning and feature engineering

In [8]:
##Cleaning: removing invalid data entry
## e.g.: The total sqft divided by the number of bhk should always be more than 200

home = home[~(home.sqft/home.bhk<200)]
home.shape

(7799, 7)

In [9]:
home = home.drop(home[home['bath']>6].index)
home = home.drop(home[home['bhk']>7.0].index)
home.shape

(7784, 7)

In [10]:
## Feature Engineering step
home['price_per_sqft'] = home['price']*100000/home['sqft']
home.head()

Unnamed: 0,location,society,sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,Coomee,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,120.0,4,4615.384615
2,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,95.0,3,6245.890861
3,Whitefield,DuenaTa,1170.0,2.0,1.0,38.0,2,3247.863248
4,Old Airport Road,Jaades,2732.0,4.0,0.0,204.0,4,7467.057101


In [11]:
home['price_per_sqft'].describe()

count     7784.000000
mean      5906.663356
std       2751.433547
min         33.210897
25%       4299.092126
50%       5362.739697
75%       6750.000000
max      76530.612245
Name: price_per_sqft, dtype: float64

In [12]:
## taking only the values with 1st Standard deviation values.
## as per Normal Distribution, 95% of our data lies within 1st Standard Deviation as per the location

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
home = remove_pps_outliers(home)
home.shape

(5601, 8)

In [13]:
## checking the dataset with highest location data provided
## Removing the locations with less than frequency 10

home.location = home.location.str.strip()
location_stats = home['location'].value_counts(ascending=False)
location_stats

location
Whitefield                 376
Sarjapur  Road             252
Electronic City            225
Kanakpura Road             165
Yelahanka                  144
                          ... 
Sathya Layout                1
Muthurayya Swamy Layout      1
Mylasandra                   1
Dommasandra                  1
Chikka Banaswadi             1
Name: count, Length: 439, dtype: int64

In [14]:
## cretaing a Series of all the location having less than 10 entries against its  
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
Dodda Nekkundi             10
Neeladri Nagar             10
Nagarbhavi                 10
Seegehalli                 10
Domlur                     10
                           ..
Sathya Layout               1
Muthurayya Swamy Layout     1
Mylasandra                  1
Dommasandra                 1
Chikka Banaswadi            1
Name: count, Length: 325, dtype: int64

In [15]:
## using lambda function to naming 'location_stats_less_than_10' as 'other' and then removing it

home.location = home.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
home = home[home.location != 'other']

In [16]:
## performing One hot encoding on the Categorical values
## 1st step. create dummies
#dummies = pd.get_dummies(home.location)
#dummies.head(3)


import category_encoders as ce

# Assuming your DataFrame is called 'df'
encoder = ce.TargetEncoder()
home['location_encoded'] = encoder.fit_transform(home['location'], home['price'])
home['society_encoded'] = encoder.fit_transform(home['society'], home['price'])
home.shape

(4543, 10)

In [17]:
## adding the dummies dataframe to our main DataFrame
#one-hot
#home = pd.concat([home,dummies],axis='columns')

## removing 'location' as we have already created the dummies
home1 = home.drop(columns=['location','society', 'price_per_sqft'],axis = 1)
home1

Unnamed: 0,sqft,bath,balcony,price,bhk,location_encoded,society_encoded
8,550.0,1.0,1.0,27.00,1,54.733378,56.185750
9,440.0,1.0,0.0,28.00,1,77.687596,76.289823
10,510.0,1.0,0.0,25.25,1,84.409379,72.440030
11,510.0,1.0,1.0,25.25,1,84.409379,72.440030
47,1080.0,2.0,2.0,72.00,2,94.864274,83.716217
...,...,...,...,...,...,...,...
5596,1160.0,2.0,2.0,64.08,2,82.359664,80.316307
5597,1676.0,3.0,0.0,92.13,3,82.359664,80.316307
5598,2503.0,3.0,3.0,138.00,3,82.359664,80.316307
5599,1855.0,3.0,3.0,135.00,3,82.359664,89.930471


In [18]:
home1.reset_index(drop = True)
home1.shape

(4543, 7)

In [None]:
#home1.to_csv('house_price_cleaned-target_enc.csv', index=False)

In [19]:
## finding correlation values within the dataset

corr = home1.corr()
corr

Unnamed: 0,sqft,bath,balcony,price,bhk,location_encoded,society_encoded
sqft,1.0,0.05907,0.000216,0.090821,0.061984,0.0379,0.055803
bath,0.05907,1.0,0.255776,0.665457,0.744912,0.365573,0.483787
balcony,0.000216,0.255776,1.0,0.122088,0.229081,0.082075,0.1232
price,0.090821,0.665457,0.122088,1.0,0.644258,0.670449,0.764194
bhk,0.061984,0.744912,0.229081,0.644258,1.0,0.282122,0.391238
location_encoded,0.0379,0.365573,0.082075,0.670449,0.282122,1.0,0.776591
society_encoded,0.055803,0.483787,0.1232,0.764194,0.391238,0.776591,1.0


### Machine Learning part

In [20]:
X = home1[['bhk', 'sqft', 'bath', 'balcony', 'location_encoded', 'society_encoded']]
X.head()

Unnamed: 0,bhk,sqft,bath,balcony,location_encoded,society_encoded
8,1,550.0,1.0,1.0,54.733378,56.18575
9,1,440.0,1.0,0.0,77.687596,76.289823
10,1,510.0,1.0,0.0,84.409379,72.44003
11,1,510.0,1.0,1.0,84.409379,72.44003
47,2,1080.0,2.0,2.0,94.864274,83.716217


In [21]:
y = home1['price']
y.head()

8     27.00
9     28.00
10    25.25
11    25.25
47    72.00
Name: price, dtype: float64

In [22]:
home1.corr()

Unnamed: 0,sqft,bath,balcony,price,bhk,location_encoded,society_encoded
sqft,1.0,0.05907,0.000216,0.090821,0.061984,0.0379,0.055803
bath,0.05907,1.0,0.255776,0.665457,0.744912,0.365573,0.483787
balcony,0.000216,0.255776,1.0,0.122088,0.229081,0.082075,0.1232
price,0.090821,0.665457,0.122088,1.0,0.644258,0.670449,0.764194
bhk,0.061984,0.744912,0.229081,0.644258,1.0,0.282122,0.391238
location_encoded,0.0379,0.365573,0.082075,0.670449,0.282122,1.0,0.776591
society_encoded,0.055803,0.483787,0.1232,0.764194,0.391238,0.776591,1.0


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
from sklearn.linear_model import LinearRegression
m2 = LinearRegression().fit(X_train, y_train)
pred = m2.predict(X_test)
df1 = X_test.copy()
df1['price'] = y_test
df1['prediction'] = pred
df1.head()

Unnamed: 0,bhk,sqft,bath,balcony,location_encoded,society_encoded,price,prediction
2952,1,670.0,1.0,1.0,61.48483,76.357867,35.0,14.77856
3998,2,1253.0,2.0,1.0,83.509424,65.724454,52.66,50.216677
2531,3,1520.0,3.0,2.0,109.658833,90.529034,125.0,120.555025
3473,3,1937.0,3.0,2.0,111.255191,95.221474,140.0,126.352196
2590,2,1400.0,1.0,1.0,91.239983,88.327161,80.0,64.551928


In [25]:
from sklearn.metrics import mean_squared_error,r2_score
acc = mean_squared_error(pred,y_test)
acc

926.5140196243069

In [26]:
df1['error'] = df1['price'] - df1['prediction']
df1['sqerror'] = df1['error']** 2
df1.head()

Unnamed: 0,bhk,sqft,bath,balcony,location_encoded,society_encoded,price,prediction,error,sqerror
2952,1,670.0,1.0,1.0,61.48483,76.357867,35.0,14.77856,20.22144,408.906648
3998,2,1253.0,2.0,1.0,83.509424,65.724454,52.66,50.216677,2.443323,5.969829
2531,3,1520.0,3.0,2.0,109.658833,90.529034,125.0,120.555025,4.444975,19.757803
3473,3,1937.0,3.0,2.0,111.255191,95.221474,140.0,126.352196,13.647804,186.262542
2590,2,1400.0,1.0,1.0,91.239983,88.327161,80.0,64.551928,15.448072,238.642934


In [27]:
df1

Unnamed: 0,bhk,sqft,bath,balcony,location_encoded,society_encoded,price,prediction,error,sqerror
2952,1,670.0,1.0,1.0,61.484830,76.357867,35.00,14.778560,20.221440,408.906648
3998,2,1253.0,2.0,1.0,83.509424,65.724454,52.66,50.216677,2.443323,5.969829
2531,3,1520.0,3.0,2.0,109.658833,90.529034,125.00,120.555025,4.444975,19.757803
3473,3,1937.0,3.0,2.0,111.255191,95.221474,140.00,126.352196,13.647804,186.262542
2590,2,1400.0,1.0,1.0,91.239983,88.327161,80.00,64.551928,15.448072,238.642934
...,...,...,...,...,...,...,...,...,...,...
570,3,1270.0,2.0,1.0,74.507001,79.792521,73.00,85.443267,-12.443267,154.834892
3807,2,1113.0,2.0,1.0,77.687596,81.421440,55.00,64.545124,-9.545124,91.109397
758,3,1949.0,3.0,2.0,97.930789,91.609869,129.00,116.741252,12.258748,150.276896
1613,3,1651.0,3.0,1.0,40.684246,79.734773,49.53,84.180528,-34.650528,1200.659086


In [28]:
# Step 1: Calculate the mean of the squared errors
mean_sqerror = df1['sqerror'].mean()
print("mean_sqerror:", mean_sqerror)
# Step 2: Take the square root of the mean squared error to obtain RMSE
rmse = mean_sqerror ** 0.5

print("Root Mean Squared Error (RMSE):", rmse)

mean_sqerror: 926.5140196243069
Root Mean Squared Error (RMSE): 30.438692804131833


### Standardize features by removing the mean and scaling to unit variance



In [None]:
home1

In [None]:
## Dividing our dataset to Independent and Dependent Variables

X = home1.drop('price',axis = 1).values ## Independent Variables
y = home1.price.values ## Dependent Variables
X

In [None]:
## adding a new axis
y = y[:,np.newaxis]
y

In [None]:
## preprocessing the data values to StandardScaler
sc = preprocessing.StandardScaler()
X1 = sc.fit_transform(X)


In [None]:
## Standardize a dataset along any axis

## Center to the mean and component wise scale to unit variance.

Std_x1 = preprocessing.scale(X)

## Machine Learning Model


In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.model_selection import cross_validate as CV

In [None]:
## using Cross Validation of 5 and scoring of Negative mean sqaured error

cross1 = cross_val_score(lr,Std_x1,y,cv=5,scoring='neg_mean_squared_error')
print(cross1.mean())

In [None]:
# from the model selection module import train_test_split for the ML training and testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1,y,test_size=0.3,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
acc = mean_squared_error(y_pred,y_test)
rscore = r2_score(y_pred,y_test)
rmse = acc ** 0.5
print("acc:", acc, "rmse:", rmse, "rscore:", rscore)
