In [46]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [23]:
df = pd.read_csv('homeprices.csv')
df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    13 non-null     object
 1   area    13 non-null     int64 
 2   price   13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 444.0+ bytes


In [25]:
df.town.unique()

array(['monroe township', 'west windsor', 'robinsville'], dtype=object)

In [26]:
towns = pd.get_dummies(df.town,dtype=int)
towns

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [27]:
df1 = pd.concat([df,towns],axis=1)
df1

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [28]:
df1 = df1.drop(['town','west windsor'],axis='columns')
df1

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [29]:
y = df1.price
x = df1.drop('price',axis='columns')
y

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000
5,585000
6,615000
7,650000
8,710000
9,575000


In [30]:
x

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [31]:
model = LinearRegression()
model.fit(x,y)

In [32]:
model.predict([[2800,0,1]])



array([590775.63964739])

In [33]:
model.score(x,y)

0.9573929037221872

## Now using label encoder

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [35]:
df2 = df
df2.town = le.fit_transform(df2.town)
df2

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [36]:
# Here .values will return array instead of data frame
x = df2[['town','area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [38]:
y = df.price
y

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000
5,585000
6,615000
7,650000
8,710000
9,575000


## Now using OneHotEncoder

In [43]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

In [44]:
# Reshape the 'town' column to be 2D as OneHotEncoder expects
town_column_reshaped = x[:, 0].reshape(-1, 1)

# Fit and transform the 'town' column
x_onehot = ohe.fit_transform(town_column_reshaped)
print(x_onehot)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


## dropping first column like we did for get_dummies method

In [45]:
x_onehot = x_onehot[:,1:]
x_onehot

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

## Now combining area column

In [47]:
x_combined = np.column_stack((x_onehot,x[:,1]))
x_combined

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [48]:
model.fit(x_combined,y)

In [49]:
model.predict([[1,0,2800]])

array([590775.63964739])

In [50]:
model.score(x_combined,y)

0.9573929037221873

### **Exercise**

In [52]:
df0 = pd.read_csv('car_prices.csv')
df0

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [55]:
models = pd.get_dummies(df0['Car Model'],dtype=int,drop_first=True)
models

Unnamed: 0,BMW X5,Mercedez Benz C class
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,1


In [56]:
df_updated = pd.concat([df0,models],axis='columns')
df_updated

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,1,0
1,BMW X5,35000,34000,3,1,0
2,BMW X5,57000,26100,5,1,0
3,BMW X5,22500,40000,2,1,0
4,BMW X5,46000,31500,4,1,0
5,Audi A5,59000,29400,5,0,0
6,Audi A5,52000,32000,5,0,0
7,Audi A5,72000,19300,6,0,0
8,Audi A5,91000,12000,8,0,0
9,Mercedez Benz C class,67000,22000,6,0,1


In [57]:
df_updated = df_updated.drop('Car Model',axis='columns')
df_updated

Unnamed: 0,Mileage,Sell Price($),Age(yrs),BMW X5,Mercedez Benz C class
0,69000,18000,6,1,0
1,35000,34000,3,1,0
2,57000,26100,5,1,0
3,22500,40000,2,1,0
4,46000,31500,4,1,0
5,59000,29400,5,0,0
6,52000,32000,5,0,0
7,72000,19300,6,0,0
8,91000,12000,8,0,0
9,67000,22000,6,0,1


In [58]:
y = df_updated['Sell Price($)']
y

Unnamed: 0,Sell Price($)
0,18000
1,34000
2,26100
3,40000
4,31500
5,29400
6,32000
7,19300
8,12000
9,22000


In [59]:
x = df_updated.drop('Sell Price($)',axis='columns')
x

Unnamed: 0,Mileage,Age(yrs),BMW X5,Mercedez Benz C class
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0
5,59000,5,0,0
6,52000,5,0,0
7,72000,6,0,0
8,91000,8,0,0
9,67000,6,0,1


In [60]:
model1 = LinearRegression()
model1.fit(x,y)

In [61]:
model1.predict([[45000,4,0,1]])



array([36991.31721061])

In [62]:
model1.predict([[86000,7,1,0]])



array([11080.74313219])

In [63]:
model1.score(x,y)

0.9417050937281083

## Again using OneHotEncoder

In [67]:
y = df0['Sell Price($)']
y

Unnamed: 0,Sell Price($)
0,18000
1,34000
2,26100
3,40000
4,31500
5,29400
6,32000
7,19300
8,12000
9,22000


In [68]:
x = df0.drop('Sell Price($)',axis='columns')
x

Unnamed: 0,Car Model,Mileage,Age(yrs)
0,BMW X5,69000,6
1,BMW X5,35000,3
2,BMW X5,57000,5
3,BMW X5,22500,2
4,BMW X5,46000,4
5,Audi A5,59000,5
6,Audi A5,52000,5
7,Audi A5,72000,6
8,Audi A5,91000,8
9,Mercedez Benz C class,67000,6


In [69]:
x_carModel = le.fit_transform(x['Car Model'])
x_carModel

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [70]:
x_reshaped = x_carModel.reshape(-1,1)
x_reshaped

array([[1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]])

In [71]:
x_onehot = ohe.fit_transform(x_reshaped)
x_onehot

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [72]:
x_onehot = x_onehot[:,1:]
x_onehot

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [74]:
x_combined = np.column_stack((x_onehot,x.values[:,1:]))
x_combined

array([[1.0, 0.0, 69000, 6],
       [1.0, 0.0, 35000, 3],
       [1.0, 0.0, 57000, 5],
       [1.0, 0.0, 22500, 2],
       [1.0, 0.0, 46000, 4],
       [0.0, 0.0, 59000, 5],
       [0.0, 0.0, 52000, 5],
       [0.0, 0.0, 72000, 6],
       [0.0, 0.0, 91000, 8],
       [0.0, 1.0, 67000, 6],
       [0.0, 1.0, 83000, 7],
       [0.0, 1.0, 79000, 7],
       [0.0, 1.0, 59000, 5]], dtype=object)

In [76]:
model2 = LinearRegression()
model2.fit(x_combined,y)

In [81]:
model2.predict([[0,1,45000,4]])

array([36991.31721062])

In [82]:
model2.predict([[1,0,86000,7]])

array([11080.74313219])

In [83]:
model2.score(x_combined,y)

0.9417050937281082