In [3]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

In [4]:
data=pd.read_csv('onehotencoding.csv')
data
#data.head()
#data.tail()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [5]:
dummies=pd.get_dummies(data)
dummies

Unnamed: 0,area,price,town_monroe township,town_robinsville,town_west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


In [6]:
merged=pd.concat([data,dummies],axis='columns')
merged

Unnamed: 0,town,area,price,area.1,price.1,town_monroe township,town_robinsville,town_west windsor
0,monroe township,2600,550000,2600,550000,1,0,0
1,monroe township,3000,565000,3000,565000,1,0,0
2,monroe township,3200,610000,3200,610000,1,0,0
3,monroe township,3600,680000,3600,680000,1,0,0
4,monroe township,4000,725000,4000,725000,1,0,0
5,west windsor,2600,585000,2600,585000,0,0,1
6,west windsor,2800,615000,2800,615000,0,0,1
7,west windsor,3300,650000,3300,650000,0,0,1
8,west windsor,3600,710000,3600,710000,0,0,1
9,robinsville,2600,575000,2600,575000,0,1,0


In [7]:
#we need to drop town column and one of the dummy variable column (it can be any column out of the given three) because of 
#the dummy variable trap caused by multicollinearity in data. 

#NOTE-we need not drop the dummy variable column when using linear regression linear model because it is aware of the the
#dummy variable trap. but we will do it anyways as it is a good practice from a coding/research perspective

In [9]:
dropped=merged.drop(['town','town_west windsor'],axis='columns')
dropped

Unnamed: 0,area,price,area.1,price.1,town_monroe township,town_robinsville
0,2600,550000,2600,550000,1,0
1,3000,565000,3000,565000,1,0
2,3200,610000,3200,610000,1,0
3,3600,680000,3600,680000,1,0
4,4000,725000,4000,725000,1,0
5,2600,585000,2600,585000,0,0
6,2800,615000,2800,615000,0,0
7,3300,650000,3300,650000,0,0
8,3600,710000,3600,710000,0,0
9,2600,575000,2600,575000,0,1


In [14]:
dropped=dropped.T.drop_duplicates().T
dropped

Unnamed: 0,area,price,town_monroe township,town_robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [15]:
#now we have clean data to work with
model=linear_model.LinearRegression()

In [19]:
X=dropped.drop(['price'],axis='columns')
X

Unnamed: 0,area,town_monroe township,town_robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [20]:
Y=dropped.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [22]:
model=model.fit(X,Y)

In [23]:
#predict price of 2800 sq ft land in robinsville
model.predict([[2800,0,1]])
#0 for monroe 1 for robinsville



array([590775.63964739])

In [24]:
model.predict([[3400,0,0]])




array([681241.66845839])

In [26]:
model.predict([[3000,1,0]])



array([590468.71640508])

In [28]:
model.score(X,Y)
#used to predict how acccurate the data prediction is

0.9573929037221873

In [30]:
#we will do one hot encoding now
data

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [34]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [91]:
dfle=data #copying data for 1he
dfle.town=le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [38]:
#now we can compare this data with the previous data to check
#codes for the new data
#0=town_monroe township
#2=town_west windsor
#1=town_robinsville

In [39]:
X=dfle[['town','area']].values #values is used to get 2d arrays as output and not dataframes
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [41]:
Y=dfle.price
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [82]:
ohe=OneHotEncoder(categories='auto')

In [83]:
X=ohe.fit_transform(X).toarray()
X

array([[0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [84]:
#dropping first column
X=X[:,1:]
X

array([[1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       ...,
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.]])

In [86]:
model.fit(X,Y)

In [89]:
model.predict([[1,0,2800]])

ValueError: X has 3 features, but LinearRegression is expecting 85 features as input.

In [None]:
#fuck this sghit bro im done