# HOUSE PRICE PREDICTION

In [3]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [4]:
#loading data into python
data=pd.read_csv('HousePricePrediction.xlsx - Sheet1.csv')

In [5]:
#brief displayal of data
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2914,160,RM,1936,Inside,Twnhs,7,1970,1970,CemntBd,0.0,546.0,
2915,2915,160,RM,1894,Inside,TwnhsE,5,1970,1970,CemntBd,0.0,546.0,
2916,2916,20,RL,20000,Inside,1Fam,7,1960,1996,VinylSd,0.0,1224.0,
2917,2917,85,RL,10441,Inside,1Fam,5,1992,1992,HdBoard,0.0,912.0,


In [6]:
#training dependent variables and independent variables
x=data.drop('SalePrice',axis=1)
y=data['SalePrice']

In [7]:
#createing a categorical coloumns for data in type of strings and convert them into boolean
categorical_columns = data.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_columns] = cat_imputer.fit_transform(data[categorical_columns])

# DATA CLEANING

In [8]:
#checking null values
print(data.isnull().sum())

Id                 0
MSSubClass         0
MSZoning           0
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        0
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64


In [9]:
#dropping null values
data=data.dropna()
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1455,60,RL,7917,Inside,1Fam,5,1999,2000,VinylSd,0.0,953.0,175000.0
1456,1456,20,RL,13175,Inside,1Fam,6,1978,1988,Plywood,163.0,1542.0,210000.0
1457,1457,70,RL,9042,Inside,1Fam,9,1941,2006,CemntBd,0.0,1152.0,266500.0
1458,1458,20,RL,9717,Inside,1Fam,6,1950,1996,MetalSd,1029.0,1078.0,142125.0


In [10]:
# Encoding categorical variables
# Use one-hot encoding to create dummy variables
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(data[categorical_columns])
# Getting feature names of encoded columns using categories_ attribute
encoded_columns = [f'{col}_{cat}' for col, cats in zip(categorical_columns, encoder.categories_) for cat in cats]
encoded_data = pd.DataFrame(encoded_data, columns=encoded_columns)

In [11]:
# Drop the original categorical columns and concatenate the encoded columns
data = data.drop(categorical_columns, axis=1)
data = pd.concat([data, encoded_data], axis=1)

In [12]:
data

Unnamed: 0,Id,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice,MSZoning_C (all),...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,0,60,8450,5,2003,2003,0.0,856.0,208500.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,20,9600,8,1976,1976,0.0,1262.0,181500.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,60,11250,5,2001,2002,0.0,920.0,223500.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,70,9550,5,1915,1970,0.0,756.0,140000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,60,14260,5,2000,2000,0.0,1145.0,250000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1455,60,7917,5,1999,2000,0.0,953.0,175000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1456,1456,20,13175,6,1978,1988,163.0,1542.0,210000.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,1457,70,9042,9,1941,2006,0.0,1152.0,266500.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,1458,20,9717,6,1950,1996,1029.0,1078.0,142125.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Scaling numerical variables
numerical_columns = data.select_dtypes(include=['int64','float64']).columns
# Use standard scaling to transform the numerical variables to have zero mean and unit variance
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# TRAINING THE MODEL

In [15]:
# Splitting the data into training and testing sets
# Use 80% of the data for training and 20% for testing
x = data.drop('SalePrice', axis=1) # Features
y = data['SalePrice'] # Target variable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# MACHINE LEARNING MODEL

In [16]:
model=RandomForestRegressor()

In [17]:
model.fit(x_train,y_train)

In [18]:
#predicting data 
y_pred = model.predict(x_test)

#using imported libraries for mean square error and r2 score
mean_square=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [21]:
print(f'Mean squared error:{mean_square}')

Mean squared error:0.23054662250579322


In [22]:
print(f'r2 score:{r2_score}')

r2 score:<function r2_score at 0x0000025DDE8F8540>


# REPORT

->MSE: This is the average of the squared differences between the actual and predicted values of the dependent variable. A lower MSE means a better fit and less error. An MSE of 0.2305 means that the model’s predictions are, on average, about 0.23 units away from the actual values.

->R^2: This is the proportion of the variance in the dependent variable that is explained by the independent variables. It ranges from 0 to 1, where 0 means no correlation and 1 means perfect correlation. A higher R^2 means a better fit and more explanatory power. An R^2 score of 0.75 means that the model explains 75% of the variation in the dependent variable.