In [2]:
import pandas as pd
import numpy as np

# reading dataset

In [3]:
df = pd.read_csv('./Datasets/csv_files/Csv_home_prices_by_cities.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


# Aim :
- to create a model which can take town,area as data and predict 'price'
- but we know, LinearRegression( ) works with numeric data, so we have to somehow represent town data as numbers

### One way of representing town in numeric format :
- assign numeric value to each distinct town

example , - 'monroe township' = 1 , 'west windsor' = 2	, 'robinsville' = 3

In [8]:
df2 = df.replace(['monroe township','west windsor','robinsville'],[1,2,3])
df2

Unnamed: 0,town,area,price
0,1,2600,550000
1,1,3000,565000
2,1,3200,610000
3,1,3600,680000
4,1,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,3,2600,575000


- but prolem with this is that our machinelearning model might draw some unnecessery conclusion fron this:
        -  'monroe township' < 'west windsor' < 'robinsville'  OR
        -  'monroe township' + 'west windsor' = 'robinsville'

### Another way of representing town in numeric format :

#### using encoding like :
- 'monroe township' = 100
- 'west windsor' = 010
- 'robinsville' = 001

#### we usually drop one column to prevent  'Dummy variable trap' ,   making the code like:

- 'monroe township' = 10
- 'west windsor' = 01
- 'robinsville' = 00


## Catagorical Variables

 - These type of catagory type variables in Machine Learning input are known as catagorical variables
 - They can be of two types :
 
 1. Nominal Catagorical Variables
 
 Variable having no numerical relation with one another,
 ex - 
 - city names (just like our case)
 - gender (male/female)
 - Colours ... etc
 
 2. Ordinal Catagorical Variables
 
 Variable having some sort of numerical relation with one another,
 ex -
 - degrees (graduate < masters < PHD)
 - high < medium < low
 - dissatisfied < nutral < satisfied
 - grades etc 

##### We are dealing with Nominal Catagorical Variables here

- To prevent unnecessery assumptions from machine we can encode Nominal catagorical variables using :

1. Pandas get_dummies method
2. One Hot Encoding method

# 1. pd.get_dummies( ) method

In [4]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [18]:
# pd.get_dummies( ) separates all numerical columns to the left and 
# converts each catagories in each nominal column as a separate column.

# if the row contained that catagory it places '1' as value in that catagory column, or else it places '0'

df2 = pd.get_dummies(df)
df2

Unnamed: 0,area,price,town_monroe township,town_robinsville,town_west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


In [19]:
# we need to drop one dummy column to avoid dummy trap problem

df2 = df2.drop(['town_west windsor'],axis = 1)
df2

Unnamed: 0,area,price,town_monroe township,town_robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


### Or we can use various parameters of pd.get_dummies( ) to drop the extra columns and manipulate other outputs simultaneously

### pd.get_dummies( ) parameters : 

- prefix : to provide custom prefixes to dummy column names
- prefix_sep : setting the separation between prefix and actual column name // default = '_'
- dummy_na : if set True, it will make a separate dummy column where it willl mark '1' if the row contained NAN value there
- sparse : if set true will return sparse matrix
- drop_first : if set true will drop the first catagory of a nominal column to be converted into a dummy column // used to address the 'Dummy variable Trap'
- dtype : to specify the datatypes of dummy values

In [28]:
df3 = pd.get_dummies(df, prefix='dummy', prefix_sep='123',dtype='float', drop_first= True)
df3

Unnamed: 0,area,price,dummy123robinsville,dummy123west windsor
0,2600,550000,0.0,0.0
1,3000,565000,0.0,0.0
2,3200,610000,0.0,0.0
3,3600,680000,0.0,0.0
4,4000,725000,0.0,0.0
5,2600,585000,0.0,1.0
6,2800,615000,0.0,1.0
7,3300,650000,0.0,1.0
8,3600,710000,0.0,1.0
9,2600,575000,1.0,0.0


### Machine Learning Model = 

In [29]:
x  = df2[['town_monroe township','town_robinsville','area']]
x

Unnamed: 0,town_monroe township,town_robinsville,area
0,1,0,2600
1,1,0,3000
2,1,0,3200
3,1,0,3600
4,1,0,4000
5,0,0,2600
6,0,0,2800
7,0,0,3300
8,0,0,3600
9,0,1,2600


In [30]:
y = df2.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [31]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x,y)

LinearRegression()

In [32]:
# for 'monroe township' town

model.predict([[1,0,2900]])

array([577778.97226403])

# 2. One Hot Encoding method

In [67]:
df4 = df
df4.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [68]:
# importing onehotencoder

from sklearn.preprocessing import OneHotEncoder

### OneHotEncoder parameters
- categories : we can define our own set of custom catagories 
    //default = 'auto', it autometically separates out catagories from columns
- drop : allows us to tackle dummy variable trap by dropping a not-required dummy column// Default = None. 
    Possible values - 
    - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely.
    - 'if_binary' : drop the first category in each feature with two categories. Features with 1 or more than 2 categories are
      left intact.
    - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
      should be dropped. 
- sparse : if set to false , it returns numpy nd array// Default = True
- dtype : to specify dtype of the values in dummy columns

### Note : instead of callunf fit & transform separately, we can call it simultaneously as fit_transform

In [69]:
# making an object of the onehotencoder

ohe = OneHotEncoder(drop='first', sparse=False, dtype='int64')

In [70]:
# fit & transform

df5 = ohe.fit_transform(df4[['town']])

In [71]:
df4['town'].unique()

array(['monroe township', 'west windsor', 'robinsville'], dtype=object)

In [72]:
# converting to dataframe

df5 = pd.DataFrame(df5, columns= ['west windsor', 'robinsville'])
df5

Unnamed: 0,west windsor,robinsville
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,1
7,0,1
8,0,1
9,1,0


In [76]:
df6 = df4.drop('town',axis = 1).join([df5])
df6

Unnamed: 0,area,price,west windsor,robinsville
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


In [77]:
# train dataset
# values to create 2D array instead of a dataframe

x = df6.drop('price', axis = 1)
x

Unnamed: 0,area,west windsor,robinsville
0,2600,0,0
1,3000,0,0
2,3200,0,0
3,3600,0,0
4,4000,0,0
5,2600,0,1
6,2800,0,1
7,3300,0,1
8,3600,0,1
9,2600,1,0


In [78]:
y = df.price
y.head()

0    550000
1    565000
2    610000
3    680000
4    725000
Name: price, dtype: int64

In [79]:
# Machine Learning Model

model = LinearRegression()
model.fit(x,y)

LinearRegression()

In [85]:
model.predict([[2900,0,0]])

array([577778.97226403])

# To score accuracy of a model :

In [86]:
# 95.73 % accurate

model.score(x,y)

0.9573929037221872