## Read data from File

In [1]:
import pandas as pd

# Read the home-price table (columns: town, area, price) from a CSV path
# given relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='homeprices-with-town.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


## Here we have Town, which is a categorical field
### There are 2 types of Categorical fields
### 1. Nominal Where there is no specific ordering in values
#### E.g. town names, gender
### 2. Ordinal where there is some sort of ordering among the values
#### E.g. Grades, (high, medium, low)

### In this data, town is a nominal categorical field, so we convert it to numeric values by using dummy columns

In [2]:
# Build one integer 0/1 indicator column per distinct town name.
df_dummies = pd.get_dummies(df['town'], dtype=int)
df_dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


#### Now we will join the 2 tables

In [3]:
# Place the indicator columns to the right of the original columns (row-aligned on the index).
df_inter = pd.concat([df, df_dummies], axis='columns')
df_inter

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


#### Now we will drop the town column which is categorical field. 
#### We also drop one of the dummy columns to avoid the dummy variable trap (perfect multicollinearity): the dropped town can be inferred from the other two dummy columns.
#### When both remaining dummy columns are 0, the row belongs to the dropped third town.

In [4]:
# Remove the text column and one indicator ('robinsville'); a row with both
# remaining indicators at 0 therefore stands for robinsville.
df_final = df_inter.drop(columns=['town', 'robinsville'])
df_final

Unnamed: 0,area,price,monroe township,west windsor
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,0,0


### We can also do the same with scikit-learn's OneHotEncoder
#### First we label the categorical field by converting it into Numeric

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Work on a new frame so `df` keeps the original town names; the town column
# is replaced by the integer codes that the encoder assigns to each name.
df_label_encoded = df.assign(town=le.fit_transform(df['town']))
df_label_encoded

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


#### Then we use One hot encoder and Column Transformer to transform the column

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the town codes (column position 0 of the frame passed in) and
# drop the first category so the remaining indicator columns are not collinear.
# Only the town column is handed to the transformer below, so
# remainder='passthrough' has no other columns to forward in this call.
#
# sparse_threshold=0 makes ColumnTransformer always return a dense NumPy array.
# With the default threshold (0.3) the result switches to a SciPy sparse matrix
# whenever the encoded output's density falls below 0.3, which depends on the
# data and is not what the pd.DataFrame(...) call below expects.
ct = ColumnTransformer(
    [('town', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
df_ohe_inter = pd.DataFrame(ct.fit_transform(df_label_encoded[['town']]))
df_ohe_inter

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,1.0,0.0


### And now we join the 2 DFs to get final DF

In [7]:
# Encoded town indicators first, followed by the remaining numeric columns
# (everything in df_label_encoded except the label-encoded town).
df_final_ohe = pd.concat(
    [df_ohe_inter, df_label_encoded.drop(columns=['town'])],
    axis=1,
)
df_final_ohe

Unnamed: 0,0,1,area,price
0,0.0,0.0,2600,550000
1,0.0,0.0,3000,565000
2,0.0,0.0,3200,610000
3,0.0,0.0,3600,680000
4,0.0,0.0,4000,725000
5,0.0,1.0,2600,585000
6,0.0,1.0,2800,615000
7,0.0,1.0,3300,650000
8,0.0,1.0,3600,710000
9,1.0,0.0,2600,575000


## And now we can run the Linear regression on this data

In [8]:
from sklearn.model_selection import train_test_split

# Features are every column except the target `price`; labels are `price`.
# `.values` passes a plain NumPy array to scikit-learn -- presumably because
# the frame mixes integer (0, 1) and string column labels; confirm before removing.
# test_size=0.2 holds out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and every number computed from
# it further down, is identical on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    df_final_ohe.drop(['price'], axis='columns').values,
    df_final_ohe.price,
    test_size=0.2,
    random_state=42,
)

## Now let's create the regression model

In [9]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(train_features, train_labels)
reg

## Let's test the model on the test set

In [10]:
# Predict prices for the held-out rows and print (predicted, actual) pairs.
prediction = reg.predict(test_features)
print([(predicted, actual) for predicted, actual in zip(prediction, test_labels)])

[(605178.7773933113, 600000), (662975.7785467093, 680000), (613333.3333333326, 610000)]


## And we can see the predictions are quite close

## Now let's store the model in a pickle file; later we will load it and run the test again

In [11]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs('joblib', exist_ok=True)

# Persist the fitted regression model. dump() returns the list of files it
# wrote, which is displayed as this cell's output.
joblib.dump(reg, 'joblib/MVLinearRegressionOneHotEncoder.pkl')

['joblib/MVLinearRegressionOneHotEncoder.pkl']

#### Now load and run Test again

In [12]:
# Restore the model from disk and repeat the evaluation on the same held-out rows.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
reg2 = joblib.load('joblib/MVLinearRegressionOneHotEncoder.pkl')
prediction = reg2.predict(test_features)
print([(predicted, actual) for predicted, actual in zip(prediction, test_labels)])

[(605178.7773933113, 600000), (662975.7785467093, 680000), (613333.3333333326, 610000)]
