In [1]:
# Raw GitHub URL of the home-price CSV loaded below.
url = (
    "https://raw.githubusercontent.com/codebasics/py/refs/heads/master"
    "/ML/5_one_hot_encoding/homeprices.csv"
)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load the home-price CSV referenced by `url` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


For categorical data, we create dummy variables: one column per category, holding either 1 or 0.

In [4]:
# The categorical column in this dataset is `town`; list its distinct values.
df["town"].unique()

array(['monroe township', 'west windsor', 'robinsville'], dtype=object)

In [12]:
# One-hot encode `town` with pd.get_dummies: one column per town,
# holding 1 on the rows that belong to that town and 0 everywhere else.
dummies = pd.get_dummies(df["town"], dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [13]:
# Attach the dummy columns to the original frame side by side;
# concat lines the rows up by index.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [14]:
# `town` is redundant now that its information lives in the dummy columns.
# One dummy column must go as well (the "dummy variable trap"): it is fully
# determined by the other two -- a row that is neither monroe township nor
# robinsville must be west windsor.
# Reassign instead of using inplace=True so the cell is an explicit
# transformation of `merged` rather than a hidden mutation.
merged = merged.drop(columns=['town', 'west windsor'])
merged

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [15]:
# Linear regression estimator from scikit-learn.
from sklearn.linear_model import LinearRegression

# Unfitted estimator with default settings; it is trained in a later cell.
model = LinearRegression()

In [16]:
# Features are every column except `price`; `price` itself is the target.
model.fit(X=merged.drop(columns='price'), y=merged['price'])

In [17]:
# Fitted coefficients first, then the intercept, one per line.
print(model.coef_, model.intercept_, sep="\n")

[   126.89744141 -40013.97548914 -14327.56396474]
249790.36766292527


In [36]:
# 2800 sq ft in robinsville.
# The model was fitted on a DataFrame, so predict with a DataFrame carrying
# the same column names in the same order. A bare list triggers
# scikit-learn's "X does not have valid feature names" warning and silently
# relies on positional column order.
model.predict(
    pd.DataFrame(
        [[2800, 0, 1]],
        columns=['area', 'monroe township', 'robinsville'],
    )
)



array([590775.63964739])

In [20]:
# 3400 sq ft in west windsor (both remaining dummy columns are 0).
# The model was fitted on a DataFrame, so predict with a DataFrame carrying
# the same column names in the same order. A bare list triggers
# scikit-learn's "X does not have valid feature names" warning and silently
# relies on positional column order.
model.predict(
    pd.DataFrame(
        [[3400, 0, 0]],
        columns=['area', 'monroe township', 'robinsville'],
    )
)



array([681241.66845839])

In [21]:
# Score the model on the same features and target it was fitted on.
model.score(X=merged.drop(columns='price'), y=merged['price'])

0.9573929037221872

The same thing can be done using scikit-learn's OneHotEncoder.

In [39]:
# LabelEncoder is used below to turn the town names into integer codes.
from sklearn.preprocessing import LabelEncoder
# Unfitted encoder; it is fitted on the town column in a later cell.
le = LabelEncoder()

In [40]:
# Show the home-price frame as it looks before the town column is label-encoded.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [41]:
# Work on a copy. A plain `dfle = df` only creates a second name for the
# same object, so encoding `town` would silently overwrite the town strings
# in `df` as well.
dfle = df.copy()
# Replace each town name with its integer label. In the output this gives
# 0 = monroe township, 1 = robinsville, 2 = west windsor.
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [42]:
# Feature matrix as a NumPy array: column 0 is the town label, column 1 the area.
feature_columns = ['town', 'area']
X = dfle.loc[:, feature_columns].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [43]:
# Target vector: the prices as a NumPy array.
Y = dfle['price'].values
Y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000])

In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the town label) and pass every other column
# through unchanged.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [45]:
# Fit the transformer and encode the features.
# Read the raw features from `dfle` rather than overwriting X with a function
# of itself: `X = ct.fit_transform(X)` would one-hot encode already-encoded
# data if the cell were run a second time.
# Output columns: [monroe township, robinsville, west windsor, area].
X = ct.fit_transform(dfle[['town', 'area']].values)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [46]:
# Drop the first one-hot column (monroe township) to avoid the dummy variable
# trap; the remaining columns are [robinsville, west windsor, area].
# Recompute from `dfle` with the already-fitted transformer instead of slicing
# X in place: `X = X[:, 1:]` would strip one more column every time the cell
# is re-run.
X = ct.transform(dfle[['town', 'area']].values)[:, 1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [47]:
# Second regression, trained on the OneHotEncoder-based feature matrix.
model2 = LinearRegression()
model2.fit(X=X, y=Y)

In [49]:
# Town label codes:
# 0 - monroe township
# 1 - robinsville
# 2 - west windsor
# The first one-hot column (monroe township) was dropped, so the feature
# order is [robinsville, west windsor, area].
model2.predict([[1, 0, 2800]])
# 2800 sq ft in robinsville

array([590775.63964739])

In [51]:
# Feature order is [robinsville, west windsor, area].
model2.predict([[0, 1, 3400]])
# 3400 sq ft in west windsor

array([681241.6684584])

# Assignment - Car Pricing

In [52]:
# Raw GitHub URL of the car-price exercise CSV; rebinds `url` for the assignment section.
url = (
    "https://raw.githubusercontent.com/codebasics/py/refs/heads/master"
    "/ML/5_one_hot_encoding/Exercise/carprices.csv"
)

In [53]:
# Load the car-price data; from here on `df` refers to this frame
# instead of the home-price one.
df = pd.read_csv(filepath_or_buffer=url)
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [55]:
# Give the columns short, identifier-friendly names.
# Reassign the result instead of using inplace=True so the cell is an
# explicit transformation of `df` rather than a hidden mutation.
df = df.rename(
    columns={
        'Car Model': 'model',
        'Mileage': 'mileage',
        'Sell Price($)': 'price',
        'Age(yrs)': 'age',
    }
)

In [56]:
# Check the renamed columns on the first five rows.
df.head(n=5)

Unnamed: 0,model,mileage,price,age
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4


In [58]:
# One 0/1 indicator column per car model.
dummies = pd.get_dummies(df['model'], dtype=int)
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [59]:
# Put the indicator columns next to the original car data (rows matched by index).
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,model,mileage,price,age,Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [60]:
# Remove the text column and one indicator column; a row with both remaining
# indicators at 0 is therefore a Mercedez Benz C class.
final = merged.drop(columns=['model', 'Mercedez Benz C class'])
final

Unnamed: 0,mileage,price,age,Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [65]:
# Feature matrix: every column of `final` except the target `price`.
X = final.drop(columns='price').values
X

array([[69000,     6,     0,     1],
       [35000,     3,     0,     1],
       [57000,     5,     0,     1],
       [22500,     2,     0,     1],
       [46000,     4,     0,     1],
       [59000,     5,     1,     0],
       [52000,     5,     1,     0],
       [72000,     6,     1,     0],
       [91000,     8,     1,     0],
       [67000,     6,     0,     0],
       [83000,     7,     0,     0],
       [79000,     7,     0,     0],
       [59000,     5,     0,     0]])

In [66]:
# Target vector: the sell prices as a NumPy array.
Y = final['price'].values
Y

array([18000, 34000, 26100, 40000, 31500, 29400, 32000, 19300, 12000,
       22000, 20000, 21000, 33000])

In [67]:
# LinearRegression is also imported in an earlier cell; this repeats that import.
from sklearn.linear_model import LinearRegression
# Fresh, unfitted estimator for the car data. This rebinds `model`, replacing
# the home-price model that used the same name earlier in the notebook.
model = LinearRegression()

In [68]:
# Train the car-price regression on the feature matrix and target vector.
model.fit(X=X, y=Y)

In [69]:
# Fitted coefficients first, then the intercept, one per line.
print(model.coef_, model.intercept_, sep="\n")

[-3.70122094e-01 -1.33245363e+03 -2.45354074e+03 -6.73820733e+03]
58976.62596853724


In [71]:
# Feature order: mileage, age, Audi A5, BMW X5.
# Both indicator columns are 0, which selects the dropped category
# (Mercedez Benz C class).

model.predict([[45000, 4, 0, 0]])
# Mercedez Benz C class with 45000 mileage, 4 years old

array([36991.31721061])

In [72]:
# BMW X5, 7 years old, 86000 mileage
# (feature order: mileage, age, Audi A5, BMW X5)
model.predict([[86000, 7, 0, 1]])

array([11080.74313219])

In [73]:
# Score the model on the same X and Y it was fitted on.
model.score(X=X, y=Y)

0.9417050937281082

In [74]:
# use train_test_split to split the data into train and test