In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [20]:
# Load the raw home-price table from CSV and display it.
df = pd.read_csv(filepath_or_buffer='Csv_home_prices_by_cities.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


# Aim :
- to create a model which can take town,area as data and predict 'price'
- but `LinearRegression()` works only with numeric data, so we have to represent the town names as numbers

### One way of representing town in numeric format :
- assign numeric value to each distinct town

example: 'monroe township' = 1, 'west windsor' = 2, 'robinsville' = 3

In [8]:
# Naive label mapping: swap each town name for an arbitrary integer code.
df2 = df.replace(
    to_replace=['monroe township', 'west windsor', 'robinsville'],
    value=[1, 2, 3],
)
df2

Unnamed: 0,town,area,price
0,1,2600,550000
1,1,3000,565000
2,1,3200,610000
3,1,3600,680000
4,1,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,3,2600,575000


- but the problem with this is that our machine learning model might draw some unnecessary conclusions from these numbers, such as:
        -  'monroe township' < 'west windsor' < 'robinsville'  OR
        -  'monroe township' + 'west windsor' = 'robinsville'

### Another way of representing town in numeric format :

#### using encoding like :
- 'monroe township' = 100
- 'west windsor' = 010
- 'robinsville' = 001

#### We usually drop one column to avoid the 'dummy variable trap' (the dropped column can always be derived from the others), making the encoding:

- 'monroe township' = 10
- 'west windsor' = 01
- 'robinsville' = 00


## Categorical Variables

 - Category-type variables like these in machine learning inputs are known as categorical variables
 - They can be of two types :
 
 1. Nominal Categorical Variables
 
 Variables whose values have no inherent order or numerical relation to one another,
 ex - 
 - city names (just like our case)
 - gender (male/female)
 - Colours ... etc
 
 2. Ordinal Categorical Variables
 
 Variables whose values have a natural order (ranking) relative to one another,
 ex -
 - degrees (graduate < masters < PhD)
 - low < medium < high
 - dissatisfied < neutral < satisfied
 - grades etc 

##### We are dealing with Nominal Categorical Variables here

- To prevent the model from making unnecessary assumptions, we can encode nominal categorical variables using:

1. Pandas get_dummies method
2. One Hot Encoding method

# 1. Pandas get_dummies method

In [10]:
# Display the frame again as the starting point for the get_dummies walkthrough.
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [13]:
# One-hot encode the town column: one indicator column per distinct town.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 otherwise
# returns True/False booleans, which would not reproduce the 0/1 table shown.
dummies = pd.get_dummies(df.town, dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [21]:
# Attach the indicator columns to the original frame, side by side (column-wise).

df = pd.concat(objs=[df, dummies], axis='columns')
df

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [23]:
# Drop one indicator column to avoid the dummy variable trap.
# 'west windsor' becomes the baseline: it is the row where both remaining indicators are 0.

df = df.drop(columns=['west windsor'])
df

Unnamed: 0,town,area,price,monroe township,robinsville
0,monroe township,2600,550000,1,0
1,monroe township,3000,565000,1,0
2,monroe township,3200,610000,1,0
3,monroe township,3600,680000,1,0
4,monroe township,4000,725000,1,0
5,west windsor,2600,585000,0,0
6,west windsor,2800,615000,0,0
7,west windsor,3300,650000,0,0
8,west windsor,3600,710000,0,0
9,robinsville,2600,575000,0,1


### Machine Learning Model = 

In [29]:
# Feature frame: the two remaining town indicators followed by the area.
x = df.loc[:, ['monroe township', 'robinsville', 'area']]
x

Unnamed: 0,monroe township,robinsville,area
0,1,0,2600
1,1,0,3000
2,1,0,3200
3,1,0,3600
4,1,0,4000
5,0,0,2600
6,0,0,2800
7,0,0,3300
8,0,0,3600
9,0,1,2600


In [30]:
# Target series: the price column.
y = df['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [31]:
# Fit a linear regression on the indicator + area features; fit() returns the
# estimator itself, so the fitted model is bound directly and then displayed.
model = LinearRegression().fit(x, y)
model

LinearRegression()

In [33]:
# Predict the price for 'monroe township' with area 2900.
# The model was fitted on a DataFrame, so the query row is passed as a DataFrame
# with the same column names and order; a bare nested list makes scikit-learn
# warn that "X does not have valid feature names".

model.predict(
    pd.DataFrame([[1, 0, 2900]], columns=['monroe township', 'robinsville', 'area'])
)

array([577778.97226403])

##### This is the final dataframe we can work on and build the model

# 2. One Hot Encoding method

In [48]:
# Reload the raw CSV so this section starts from the unmodified frame
# (df was reassigned with extra indicator columns in the get_dummies section).
df = pd.read_csv(filepath_or_buffer='Csv_home_prices_by_cities.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [42]:
# we need to first do LabelEncoding on town column

from sklearn.preprocessing import LabelEncoder

In [49]:
# Replace each town name in the 'town' column with an integer code.
encoder = LabelEncoder()
df['town'] = encoder.fit_transform(df['town'])
df

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [53]:
# Training features: town code and area.
# .values yields a plain 2D NumPy array rather than a DataFrame.

x = df.loc[:, ['town', 'area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [54]:
# Target series: the price column (preview the first five rows only).
y = df['price']
y.head(n=5)

0    550000
1    565000
2    610000
3    680000
4    725000
Name: price, dtype: int64

In [66]:
# Applying One Hot Encoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [67]:
# ColumnTransformer arguments:
#   "encode"      - an arbitrary label for this transformation step
#   [0]           - apply the OneHotEncoder to column 0 (the town codes)
#   'passthrough' - leave the remaining columns untouched

encoder = ColumnTransformer(
    transformers=[("encode", OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = encoder.fit_transform(x)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [68]:
# Avoid the dummy variable trap by discarding one one-hot column:
# keep every row, but only the columns from index 1 onward.

X = X[:, 1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [69]:
# Fit the linear regression on the one-hot encoded features.
# fit() returns the estimator itself, so it is bound directly and then displayed.

model = LinearRegression().fit(X, y)
model

LinearRegression()

In [70]:
# Both one-hot columns are 0, i.e. the town whose column was dropped above; area = 2900.
model.predict([[0, 0, 2900]])

array([577778.97226403])

# Scoring the model with R² (the coefficient of determination):

In [72]:
# score() returns R^2 (coefficient of determination), about 0.957 here.
# It is computed on the same X, y the model was fitted on, so it describes
# the fit to the training data rather than accuracy on unseen data.

model.score(X, y)

0.9573929037221874