# House Prices done with One Hot Encoding

## Dummy Variables and One-Hot Encoding

In [1]:
import pandas as pd

## First we do dummy encoding
In this approach we use pandas directly on the DataFrame to encode the text column.

In [2]:
# Read the raw listings from the CSV that sits next to the notebook.
csv_path = "homeprices.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


### Create dummy (indicator) columns from the town column

In [3]:
# One indicator column per distinct town name.
town_names = df["town"]
dummies = pd.get_dummies(town_names)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


### Concatenating the dummies dataframe with our main dataframe

In [4]:
# Place the indicator columns to the right of the original columns (column-wise concat).
frames = [df, dummies]
df_dummies = pd.concat(frames, axis=1)
df_dummies

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


### Dropping town and west windsor. We can drop any one of the town columns, but we must drop one in order to avoid the dummy variable trap

In [5]:
# Remove the original text column plus one indicator ('west windsor') so the
# remaining indicators are not perfectly collinear (dummy variable trap).
# Reassigning instead of using inplace=True avoids mutating, behind the reader's
# back, the frame that the previous cell displayed.
df_dummies = df_dummies.drop(columns=['town', 'west windsor'])
df_dummies

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [6]:
# Features: everything except the target column.
X = df_dummies.drop(columns=['price'])
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [7]:
# Target: the sale price as a Series.
Y = df_dummies['price']
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [8]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the dummy-encoded features.
# fit() returns the estimator itself, so the last line still displays it.
model = LinearRegression()
model.fit(X=X, y=Y)

LinearRegression()

#### To predict the price for west windsor (the dropped column), we set both monroe township and robinsville to 0

In [9]:
# Feature order follows X: [area, monroe township, robinsville].
# Both indicators are 0, which selects the dropped category (west windsor).
west_windsor_3400 = [[3400, 0, 0]]
model.predict(west_windsor_3400)

array([681241.66845839])

### To predict the price for robinsville, we set its column to 1 and the other town column to 0

In [10]:
# Feature order follows X: [area, monroe township, robinsville].
robinsville_2800 = [[2800, 0, 1]]
model.predict(robinsville_2800)

array([590775.63964739])

# Using scikit-learn one-hot encoding

In [11]:
# LabelEncoder maps each distinct text label to an integer code.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [12]:
# Work on a copy: a plain `dfle = df` only creates a second name for the same
# frame, so overwriting `town` below would also overwrite df.town and break the
# earlier get_dummies cells if they are re-run.
dfle = df.copy()
# fit_transform learns the distinct town names and replaces them with integer codes.
dfle['town'] = le.fit_transform(dfle['town'])

In [13]:
# Display the frame after label encoding: `town` now holds integer codes instead of names.
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [14]:
# Feature matrix as a plain 2-D NumPy array with columns [town code, area].
feature_cols = ['town', 'area']
X = dfle.loc[:, feature_cols].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [15]:
# Selecting with a list (['price']) keeps a DataFrame, so .values yields a
# 2-D column array of shape (n_rows, 1) rather than a flat vector.
target_cols = ['price']
Y = dfle.loc[:, target_cols].values
Y

array([[550000],
       [565000],
       [610000],
       [680000],
       [725000],
       [585000],
       [615000],
       [650000],
       [710000],
       [575000],
       [600000],
       [620000],
       [695000]], dtype=int64)

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode only column 0 (the label-encoded town).
# remainder='passthrough' keeps every other column (here: area) unchanged
# instead of dropping it.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [17]:
# Encode straight from dfle rather than from X itself: `X = ct.fit_transform(X)`
# is self-referential, so re-running the cell would one-hot encode an
# already-encoded matrix and silently change its shape.
X = ct.fit_transform(dfle[['town', 'area']].values)  # town -> one-hot columns, area passed through
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

#### Now we drop one of the one-hot columns from our data in order to avoid the dummy variable trap

In [18]:
# Drop the first one-hot column to avoid the dummy variable trap.
# The matrix is rebuilt from the already-fitted transformer instead of slicing
# X in place: `X = X[:, 1:]` would strip one more column on every re-run.
# [:, 1:] means "all rows, columns from index 1 onwards", i.e. column 0 is removed.
X = ct.transform(dfle[['town', 'area']].values)[:, 1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [19]:
# Re-fit the same LinearRegression instance on the one-hot encoded matrix;
# this replaces whatever it learned from the earlier fit.
model.fit(X=X, y=Y)

LinearRegression()

In [20]:
# Column order after dropping the first one-hot column: [robinsville, west windsor, area].
# 3400 sq ft home in west windsor
west_windsor_3400 = [[0, 1, 3400]]
model.predict(west_windsor_3400)

array([[681241.6684584]])

In [None]:
# Column order after dropping the first one-hot column: [robinsville, west windsor, area].
# 2800 sq ft home in robinsville
robinsville_2800 = [[1, 0, 2800]]
model.predict(robinsville_2800)