# **Business Problem: House Price Prediction**

A real estate company is looking to enhance its pricing strategy through the implementation of a machine learning model. The company has collected data on various houses, each described by features such as area, number of bedrooms, bathrooms, stories, proximity to the main road, presence of a guest room, basement, hot water heating, air conditioning, parking availability, preference area, and furnishing status.

The goal is to build a reliable linear regression model that accurately predicts house prices based on these features. This predictive tool will assist the real estate company in setting optimal and competitive prices for houses in the market. The model should take into account the diverse characteristics of the houses and provide a transparent and data-driven approach to pricing.


In [57]:
import numpy as np 
import pandas as pd 


In [71]:
# Load the housing dataset from the CSV file that sits next to the notebook
# and preview its first five rows.
df = pd.read_csv("House.csv")
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420.0,4.0,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960.0,4.0,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960.0,3.0,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500.0,4.0,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420.0,4.0,1,2,yes,yes,yes,no,yes,2,no,furnished


In [72]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes from LabelEncoder.
cols = [
    'stories',
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

# One encoder instance is refit on every column in turn, so after this cell
# it only remembers the classes of the last column in `cols`.
label_encoder = LabelEncoder()
df = df.assign(**{name: label_encoder.fit_transform(df[name]) for name in cols})

df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420.0,4.0,2,2,1,0,0,0,1,2,1,0
1,12250000,8960.0,4.0,4,3,1,0,0,0,1,3,0,0
2,12250000,9960.0,3.0,2,1,1,0,1,0,0,2,1,1
3,12215000,7500.0,4.0,2,1,1,0,1,0,1,3,1,0
4,11410000,7420.0,4.0,1,1,1,1,1,0,1,2,0,0


In [73]:
# Number of missing values in each column.
df.isnull().sum()

price                0
area                12
bedrooms             1
bathrooms            0
stories              0
mainroad             0
guestroom            0
basement             0
hotwaterheating      0
airconditioning      0
parking              0
prefarea             0
furnishingstatus     0
dtype: int64

In [74]:
# Impute missing `area` values with the column mean, truncated to an integer.
# The mean is stored under its own name rather than `x`: `x` is bound to the
# feature matrix by a later cell, so re-running this cell out of order used to
# silently replace the features with a scalar.
# NOTE(review): the mean is computed on the full dataset, i.e. before the
# train/test split performed further down.
area_mean = int(df['area'].mean())
df['area'] = df['area'].fillna(area_mean)

# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

price               0
area                0
bedrooms            1
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [75]:
# Fill the missing `bedrooms` entries with the column median.
median_bedrooms = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(value=median_bedrooms)

# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [76]:
# Feature matrix: every column except the target `price`.
x = df.drop(columns=["price"])
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420.0,4.0,2,2,1,0,0,0,1,2,1,0
1,8960.0,4.0,4,3,1,0,0,0,1,3,0,0
2,9960.0,3.0,2,1,1,0,1,0,0,2,1,1
3,7500.0,4.0,2,1,1,0,1,0,1,3,1,0
4,7420.0,4.0,1,1,1,1,1,0,1,2,0,0


In [77]:
# Target vector: the sale price of each house.
y = df.loc[:, "price"]
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

### Baseline model (before one-hot encoding and feature scaling)

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion and predict the held-out
# prices.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out set; RMSE is in the same units as `price`.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
# LinearRegression.score reports the coefficient of determination (R^2).
print(model.score(X_test, y_test))


Mean Squared Error: 1816881466501.7864
Root Mean Squared Error: 1347917.4553739508
0.6405467966717265


In [79]:
# One-hot encode the integer-coded furnishing status; each code becomes its
# own `furnishing_<code>` indicator column and the original column is dropped.
# NOTE(review): all indicator columns are kept, so together they are
# redundant with the regression intercept — consider drop_first=True if the
# individual coefficients are going to be interpreted.
df = pd.get_dummies(data=df, prefix='furnishing', columns=['furnishingstatus'])


In [80]:
# Display the frame to confirm the new furnishing indicator columns.
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishing_0,furnishing_1,furnishing_2
0,13300000,7420.0,4.0,2,2,1,0,0,0,1,2,1,True,False,False
1,12250000,8960.0,4.0,4,3,1,0,0,0,1,3,0,True,False,False
2,12250000,9960.0,3.0,2,1,1,0,1,0,0,2,1,False,True,False
3,12215000,7500.0,4.0,2,1,1,0,1,0,1,3,1,True,False,False
4,11410000,7420.0,4.0,1,1,1,1,1,0,1,2,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000.0,2.0,1,0,1,0,1,0,0,2,0,False,False,True
541,1767150,2400.0,3.0,1,0,0,0,0,0,0,1,0,False,True,False
542,1750000,3620.0,2.0,1,0,1,0,0,0,0,0,0,False,False,True
543,1750000,2910.0,3.0,1,0,0,0,0,0,0,0,0,True,False,False


In [81]:
# Give the furnishing indicator columns readable names in place of the
# numeric codes used when they were created.
furnishing_names = {
    'furnishing_0': 'furnished',
    'furnishing_1': 'semi_furnished',
    'furnishing_2': 'unfurnished',
}
df.rename(columns=furnishing_names, inplace=True)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnished,semi_furnished,unfurnished
0,13300000,7420.0,4.0,2,2,1,0,0,0,1,2,1,True,False,False
1,12250000,8960.0,4.0,4,3,1,0,0,0,1,3,0,True,False,False
2,12250000,9960.0,3.0,2,1,1,0,1,0,0,2,1,False,True,False
3,12215000,7500.0,4.0,2,1,1,0,1,0,1,3,1,True,False,False
4,11410000,7420.0,4.0,1,1,1,1,1,0,1,2,0,True,False,False


In [83]:
from sklearn.preprocessing import StandardScaler

# Standardise `area` to zero mean and unit variance.
# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split below, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(df[['area']])
df['area'] = scaler.transform(df[['area']])[:, 0]


In [84]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnished,semi_furnished,unfurnished
0,13300000,1.053612,4.0,2,2,1,0,0,0,1,2,1,True,False,False
1,12250000,1.769333,4.0,4,3,1,0,0,0,1,3,0,True,False,False
2,12250000,2.234087,3.0,2,1,1,0,1,0,0,2,1,False,True,False
3,12215000,1.090792,4.0,2,1,1,0,1,0,1,3,1,True,False,False
4,11410000,1.053612,4.0,1,1,1,1,1,0,1,2,0,True,False,False


In [85]:
# Rebuild the feature matrix and target from the one-hot encoded, scaled frame.
y = df['price']
x = df.drop(columns=["price"])

In [86]:
# Same 80/20 split and seed as the baseline run, so the two sets of metrics
# are computed on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Refit the linear model on the preprocessed features.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out error metrics for comparison against the baseline run.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
# LinearRegression.score reports the coefficient of determination (R^2).
print(model.score(X_test, y_test))

Mean Squared Error: 1801594041213.8538
Root Mean Squared Error: 1342234.7191210089
0.6435712724516294
