In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [107]:
df=pd.read_csv("homeprices_3col.csv")
df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


***Preprocessing***

In [28]:
df.isnull().sum()

town     0
area     0
price    0
dtype: int64

# using pandas to create dummy

In [29]:
dummies=pd.get_dummies(df["town"]).astype(int) # this gives dummy result in 0,1 form
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [30]:
df_dummies=pd.concat([df,dummies],axis="columns") # adds df & dummies horizentally
df_dummies

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


***Now eliminating the "town" & dummy trap (if two cols can represent third col then eliminate any one col)***

In [31]:
df_dummies.drop(["town","robinsville"],axis="columns",inplace=True)
df_dummies

Unnamed: 0,area,price,monroe township,west windsor
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,0,0


# Spliting attributes & target col from the dataset

In [38]:
x=df_dummies.drop(["price"],axis="columns")
x

Unnamed: 0,area,monroe township,west windsor
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,1
6,2800,0,1
7,3300,0,1
8,3600,0,1
9,2600,0,0


In [39]:
y=df_dummies["price"]
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

# Model Training

In [35]:
from sklearn.linear_model import LinearRegression

In [36]:
model=LinearRegression()

In [40]:
model.fit(x,y)

# Predicting house rent with 3400sqrft & located in West Windsor

In [46]:
model.predict([[3400,0,1]]) # area,monroe township(not from this), west windsor(from this)



array([681241.66845839])

# Predicting house rent with 2800sqrft & located in Robbinsville

In [47]:
model.predict([[2800,0,0]]) #area(2800),monroe township(not from this), west windsor(not from this)



array([590775.6396474])

# Using sklearn for One Hot Encoding

In [76]:
# making df copy for learning purpose
dfle=df

In [108]:
dfle.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [109]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(dtype=int) 

In [110]:
dfle["town"].value_counts()

town
monroe township    5
west windsor       4
robinsville        4
Name: count, dtype: int64

In [117]:
x=pd.DataFrame(
   ohe.fit_transform(dfle[["town"]]).toarray(), # encodes town column into number
   columns=["monroe township","west windsor","robinsville"] # giving column name for each generated col
)

x.drop("west windsor",axis="columns",inplace=True)
x

Unnamed: 0,monroe township,robinsville
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,0,1
6,0,1
7,0,1
8,0,1
9,0,0


In [118]:
x=pd.concat([x,(dfle["area"])],axis="columns").values
x

array([[   1,    0, 2600],
       [   1,    0, 3000],
       [   1,    0, 3200],
       [   1,    0, 3600],
       [   1,    0, 4000],
       [   0,    1, 2600],
       [   0,    1, 2800],
       [   0,    1, 3300],
       [   0,    1, 3600],
       [   0,    0, 2600],
       [   0,    0, 2900],
       [   0,    0, 3100],
       [   0,    0, 3600]], dtype=int64)

In [119]:
y=dfle["price"].values
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [120]:
model.fit(x,y)

# Predicting house rent with 3400sqrft & located in West Windsor

In [123]:
model.predict([[0,0,3400]])

array([666914.10449366])

# Predicting house rent with 2800sqrft & located in Robbinsville

In [124]:
model.predict([[0,1,2800]])

array([605103.20361213])

# Checking model's score

In [126]:
model.score(x,y)

0.9573929037221873

# Exercise

***At the same level as this notebook on github, there is an Exercise folder that contains carprices.csv. This file has car sell prices for 3 different models. First plot data points on a scatter plot chart to see if linear regression model can be applied. If yes, then build a model that can answer following questions,***

**1) Predict price of a mercedez benz that is 4 yr old with mileage 45000**

**2) Predict price of a BMW X5 that is 7 yr old with mileage 86000**

***3) Tell me the score (accuracy) of your model. (Hint: use LinearRegression().score())***

In [19]:
df=pd.read_csv("exe3_carprices.csv")
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


# Preprocessing

In [5]:
df.isnull().sum()

Car Model        0
Mileage          0
Sell Price($)    0
Age(yrs)         0
dtype: int64

In [23]:
df.value_counts()

Car Model              Mileage  Sell Price($)  Age(yrs)
Audi A5                52000    32000          5           1
                       59000    29400          5           1
                       72000    19300          6           1
                       91000    12000          8           1
BMW X5                 22500    40000          2           1
                       35000    34000          3           1
                       46000    31500          4           1
                       57000    26100          5           1
                       69000    18000          6           1
Mercedez Benz C class  59000    33000          5           1
                       67000    22000          6           1
                       79000    21000          7           1
                       83000    20000          7           1
Name: count, dtype: int64

In [26]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(dtype=int)

# Encoding Car Model

In [33]:
dfle=pd.DataFrame(
    ohe.fit_transform(df[["Car Model"]]).toarray(),
    columns=["Audi A5","BMW X5","Mercedez Benz C class"]
)

dfle

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [34]:
dfle=pd.concat([df,dfle],axis="columns")
dfle.drop(["Mercedez Benz C class","Car Model"],axis="columns",inplace=True)

dfle

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [35]:
dfle.corr()

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
Mileage,1.0,-0.927116,0.993452,0.275426,-0.642673
Sell Price($),-0.927116,1.0,-0.921741,-0.246918,0.40064
Age(yrs),0.993452,-0.921741,1.0,0.290701,-0.651155
Audi A5,0.275426,-0.246918,0.290701,1.0,-0.527046
BMW X5,-0.642673,0.40064,-0.651155,-0.527046,1.0


In [50]:
x=dfle[["Audi A5","BMW X5","Age(yrs)","Mileage"]].values
y=dfle["Sell Price($)"].values

# Model Training

In [51]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()

In [52]:
model.fit(x,y)

# Price of mercedez benz that is 4 yr old with mileage 45000

***audi, bmw,age,mileage***

In [54]:
model.predict([[0,0,4,45000]])

array([36991.3172106])

# Price of BMW X5 that is 7 yr old with mileage 86000

In [55]:
model.predict([[0,1,7,86000]])

array([11080.74313221])

# The score (accuracy) of the model

In [56]:
model.score(x,y)

0.9417050937281082