### Using Pandas for One Hot Encoding

In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/ML/5_one_hot_encoding/homeprices.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [5]:
dummies = pd.get_dummies(df.town)   
merged = pd.concat([df,dummies],axis='columns')
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [10]:
final = merged.drop(['town','west windsor'],axis='columns')     #dropping original and one dummy var to avoid dummy variable trap
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [11]:
# impleting regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
X = final.drop('price', axis='columns')
y = final.price
model.fit(X,y)

LinearRegression()

In [17]:
model.predict([[2800,0,1]])  #robinson



array([590775.63964739])

In [18]:
model.predict([[3400,0,0]])     #winsor



array([681241.66845839])

In [19]:
model.score(X,y)        #check the accuracy

0.9573929037221873

### Using Sklearn for One Hot Encoding

In [20]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [24]:
dfle = df
dfle.town = le.fit_transform(dfle.town)   # changing categorical data 'town' to label encoding
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [43]:
X = dfle[['town','area']].values
y = dfle.price
X



array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [58]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()             #in future the object with column index 0 is categorical feature

In [59]:
X = ohe.fit_transform(X).toarray()
X

array([[1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 