# House Price Prediction

Two methods of creating dummy variables: 

1. Pandas Dummy Method
2. SkLearn Pre-processing One Hot Encoder


In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from google.colab import files
uploaded = files.upload()

Saving homeprices.csv to homeprices.csv


In [3]:
# Reading the dataset
data=pd.read_csv('homeprices.csv',encoding='unicode_escape',on_bad_lines='skip')

In [4]:
data.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [5]:
data.describe()

Unnamed: 0,area,price
count,13.0,13.0
mean,3146.153846,629230.769231
std,453.900475,57621.109914
min,2600.0,550000.0
25%,2800.0,585000.0
50%,3100.0,615000.0
75%,3600.0,680000.0
max,4000.0,725000.0


In [7]:
# displaying the number of rows and columns
z=data.shape
print("Number of columns: ", z[0])
print("Number of rows: ", z[1])

Number of columns:  13
Number of rows:  3


In [8]:
# displaying column names
y=data.columns
print("Columns of the dataset:\n")
for i in y:
  print(i)

Columns of the dataset:

town
area
price


In [10]:
# displaying basic information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    13 non-null     object
 1   area    13 non-null     int64 
 2   price   13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 440.0+ bytes


In [11]:
data

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


# Creating dummy variable columns using Pandas

In [13]:
#pandas has this function get_dummies, parameter is the categorical column
dummies= pd.get_dummies(data.town) 
#Tthe extra variables that are created are also known as dummy variables.

In [14]:
#printing the dummies table
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


The next step is to concatenate this dummy’s data frame with the original data frame.

In [17]:
#concat function of pandas is used to join two data frames
#it takes an array as an input
#you also need to tell how you do you want to join
merged = pd.concat([data, dummies], axis= 'columns')

#displaying the merged data frame
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


After concatenating, you need to drop the original town column because you already have the dummy variable column.

We also need to drop one of the dummy variable columns because it creates the problem of dummy variable trap.

In [18]:
final = merged.drop(['town', 'west windsor'],axis='columns')

#displaying the final dataset
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


# Creating a linear regression model

In [20]:
from sklearn.linear_model import LinearRegression
model = LinearRegression() #created a linear regression model object

In [22]:
#x is all the columns except price because price is the dependent variable
X = final.drop(['price'], axis= 'columns')

#display X
X

#price is removed because it is a dependent variable

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [24]:
Y = final.price
#Y is a price column in your final data frame

#display Y
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [26]:
#training your model
model.fit(X,Y)

LinearRegression()

In [30]:
#prediction
model.predict([[2800,1,0]])

  "X does not have valid feature names, but"


array([565089.22812299])

In [34]:
#prediction
model.predict([[3400,0,0]])

  "X does not have valid feature names, but"


array([681241.66845839])

In [35]:
#find the accuracy of the model
model.score(X,Y)

0.9573929037221873

# One hot encoding using Sklearn pre-processing

In [36]:
#display the data
data

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


To do one hot encoding, we need to do label encoding on the town column.

In [39]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() #create a class object

In [62]:
dfle = data #create a new data frame

#the next step is use label encoder to fit and transform
le.fit_transform(dfle.town)

#fit and transform means it takes the label column as an input and it will return the label 

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

In [44]:
dfle.town = le.fit_transform(dfle.town)

#display dfle
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [49]:
# X is training data set
X = dfle[['town', 'area']].values #it is a 2-d dataframe
#if you call .values on a dataframe, you get a 2-d array

#display X
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [51]:
Y = dfle.price

#display Y
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

# Import One Hot Encoder

In [52]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])#create an object of the class
 

In [53]:
#ohe.fit_transform(X) : if we do this, then it's gonna assume all the columns of X are categorical
#ohe = OneHotEncoder(categorical_features=[0])
#the above line means, we are only looking at the categorical qualities of column 0

In [55]:
ohe.fit_transform(X).toarray()

#display X
X

#it created 2 dummy variable columns

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [56]:
model.fit(X,Y)

LinearRegression()

In [60]:
model.predict([[1, 2500]])

array([549327.1748618])

In [61]:
model.score(X, Y)

0.9552018104317441