In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import joblib

In [2]:
# Load the home-price data (columns: town, area, price) from the working directory.
df = pd.read_csv("homeprices3.csv")

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


Using One-Hot-Encoding

In [4]:
# Preview the one-hot encoding of `town`: one indicator column per distinct town.
# dtype=int pins the indicators to 0/1; without it, newer pandas releases
# return booleans (True/False) instead.
pd.get_dummies(df['town'], dtype=int)

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [5]:
# One indicator column per distinct town.
# dtype=int keeps the indicators numeric (0/1) regardless of pandas version, so
# the feature matrix built from them later is a plain numeric array.
dummies = pd.get_dummies(df['town'], dtype=int)


In [6]:
# Append the indicator columns to the right of the original columns
# (rows are aligned on the shared index).
merged_df = pd.concat([df, dummies], axis=1)

In [7]:
# Display the combined frame: original columns followed by the town indicators.
merged_df

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [8]:
# The raw `town` column is redundant once its one-hot columns are present.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
merged_df = merged_df.drop(columns=['town'], errors='ignore')

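Drop one of the dummy-variable columns to avoid the dummy variable trap: with all three town columns present they always sum to 1, which makes them perfectly collinear with the model's intercept.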

https://www.geeksforgeeks.org/ml-dummy-variable-trap-in-regression-models/

In [9]:


# Drop one indicator column ('west windsor') so the remaining indicators are not
# perfectly collinear with the intercept; that town becomes the baseline
# (mt = 0 and rv = 0).
# errors='ignore' plus reassignment keeps the cell safe to re-run.
merged_df = merged_df.drop(columns=['west windsor'], errors='ignore')

In [10]:
# Shorten the indicator column names: 'monroe township' -> 'mt', 'robinsville' -> 'rv'.
# rename() ignores keys that are not present, so this cell is safe to re-run;
# the result is reassigned rather than mutated in place.
merged_df = merged_df.rename(
    columns={'monroe township': 'mt', 'robinsville': 'rv'}
)

In [11]:
# Check the final training frame: area, price and the two town indicators (mt, rv).
merged_df.head()

Unnamed: 0,area,price,mt,rv
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0


Create Linear Regression Model


In [12]:
# Unfitted linear regression estimator with default settings.
reg = LinearRegression()

In [13]:
# Fit price against the features, in the column order [area, mt, rv].
# Plain arrays (.values) are passed, so the model stores no feature names and
# predict() can later be called with bare nested lists.
reg.fit(
    X=merged_df[['area', 'mt', 'rv']].values,
    y=merged_df['price'].values,
)

In [14]:
# Feature order is [area, mt, rv]: area 2800 with mt=1, i.e. monroe township.
reg.predict([[2800 , 1,0]])

array([565089.22812299])

In [15]:
# Feature order is [area, mt, rv]: area 2800 with rv=1, i.e. robinsville.
reg.predict([[2800 , 0,1]])


array([590775.63964739])

In [16]:
# Feature order is [area, mt, rv]: area 2800 with rv=1, i.e. robinsville.
# NOTE(review): this input is identical to the previous cell's; presumably
# [[2800, 0, 0]] (the dropped 'west windsor' baseline) was intended - confirm.
reg.predict([[2800 , 0,1]])


array([590775.63964739])

Using Label Encoder

In [47]:
# Start the second approach from a fresh copy of the raw data, since the
# one-hot section above derived other frames from the previous `df`.
df = pd.read_csv("homeprices3.csv")

# Encoder that maps each distinct town label to an integer code.
lb = LabelEncoder()

In [48]:
# Preview the integer codes the encoder assigns to each row's town.
lb.fit_transform(df['town'])

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

In [49]:

# Overwrite the town names with their integer codes.
# NOTE(review): this refits the encoder on whatever `town` currently holds, so
# re-running the cell would fit it on the integer codes rather than the names.
df['town'] = lb.fit_transform(df['town'])

In [50]:
# Display the frame with `town` now holding integer codes.
df

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [51]:
# Feature array with columns [town code, area].
X = df[['town', 'area']].values
# Target selected with double brackets, so `y` is a 2-D single-column array.
y = df[['price']].values

In [52]:
# Unfitted one-hot encoder with default settings.
ohe = OneHotEncoder()


In [61]:
# One-hot encode the integer town codes. `.toarray()` densifies the encoder's
# output into a regular NumPy array with one column per town code.
town_dummies = ohe.fit_transform(X=df[['town']].values).toarray()

In [64]:
# Dropping one column to avoid the Dummy Variable Trap.
# The matrix is re-derived from the already fitted encoder instead of slicing
# `town_dummies` itself, so re-running this cell never strips a second column
# and it still works after `town_dummies` has been wrapped in a DataFrame.
town_dummies = ohe.transform(df[['town']].values).toarray()[:, :-1]

In [86]:
# Wrap the two remaining indicator columns in a labelled integer frame
# ('mt' and 'rv', matching the names used in the one-hot section above).
town_dummies = pd.DataFrame(
    data=town_dummies,
    columns=['mt', 'rv'],
    dtype=int,
)

In [85]:
# Take the area column straight from the frame instead of slicing `X`:
# `X` is rebound to a DataFrame further down, after which `X[:, 1]` no longer
# works, so the original form broke whenever this cell was re-run.
# The values are unchanged; the series now also carries the name 'area'.
area = df['area']

In [91]:
# Final feature frame, column order [mt, rv, area].
# Note that this rebinds `X`, which previously held the raw NumPy feature array.
X = pd.concat([town_dummies, area], axis=1)


In [89]:
# Fresh, unfitted estimator; this replaces the model fitted in the one-hot section.
reg = LinearRegression()

In [92]:
# Fit on the [mt, rv, area] feature values against the 2-D price target.
reg.fit(X=X.values, y=y)

In [94]:
# Inspect the learned parameters; coefficients follow the feature order [mt, rv, area].
(reg.coef_, reg.intercept_)

(array([[-40013.97548914, -14327.56396474,    126.89744141]]),
 array([249790.36766287]))

In [30]:
# Persist the fitted model to the file "lr_model" in the working directory.
# NOTE(review): the same dump is repeated in a later cell, and this cell's
# execution count is out of sequence - it looks like a leftover; confirm whether
# it is still needed.
joblib.dump(reg , r"lr_model")

['lr_model']

In [95]:
# Feature order is now [mt, rv, area]: rv=1 with area 2800, i.e. robinsville.
reg.predict([[0,1,2800 ]])

array([[590775.63964739]])

In [96]:
# Feature order is [mt, rv, area]: both indicators are 0, which selects the
# town whose indicator column was dropped (the baseline), at area 3400.
reg.predict([[0,0,3400 ]])


array([[681241.6684584]])

In [98]:
# Save the fitted model to "lr_model" so it can be reloaded without retraining.
joblib.dump(value=reg, filename="lr_model")

['lr_model']

In [99]:
# Remove the in-memory model so that the reload below is what supplies `reg`.
del reg

In [100]:
# Restore the model from the file written above.
# joblib.load unpickles the file, so only load files from a trusted source.
reg = joblib.load(r"lr_model")

In [102]:
# Same input as before the save/reload round trip ([mt, rv, area] = [0, 0, 3400]);
# the prediction should match the earlier one if the model was restored intact.
reg.predict([[0,0,3400 ]])


array([[681241.6684584]])