# Multiple Linear Regression
* Read the data from CSV
* Read all independent columns to X
* Read dependent (predictable) column to y
* X has a categorical column called 'State'; **do one-hot encoding for categorical variables**
* Split X and y into training and test datasets

In [1]:
# import pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# Load the 50-startups dataset from CSV into a DataFrame and preview its first rows
df_startup = pd.read_csv(filepath_or_buffer='../data-csv/csv-files/50_Startups.csv')
df_startup.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Features: every column except the target column 'Profit'.
# The target is excluded by name rather than by position, so the selection
# stays correct even if the column order of the CSV changes (a missing
# 'Profit' column raises KeyError instead of silently dropping another column).
X = df_startup.drop(columns='Profit')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [4]:
# Target: the 'Profit' column.
# Selected by name instead of the hard-coded position 4, which would silently
# pick a different column if columns were added, removed or reordered.
y = df_startup['Profit']
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [5]:
# "State" is a categorical variable, so let's one-hot encode it.
# One-hot encoding converts a categorical variable into one 0/1 indicator
# column per category, a numeric form that ML algorithms can consume.
# drop_first=True drops one indicator column to avoid the "DUMMY VARIABLE TRAP"
# (the dropped category is implied when all remaining indicators are 0).
# dtype is pinned to 'uint8' so the indicators are 0/1 integers regardless of
# the pandas version (newer pandas releases default to bool, i.e. True/False).
states = pd.get_dummies(X['State'], drop_first=True, dtype='uint8')
states.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [6]:
# Remove the raw categorical 'State' column from the feature matrix
X = X.drop(columns=['State'])
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [7]:
# Concatenate the one-hot-encoded State columns onto X, column-wise
X = pd.concat(objs=[X, states], axis='columns')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Fit a multiple linear regression model on the training split.
# fit() returns the estimator itself, so the assignment keeps the fitted model
# and the trailing expression displays it as the cell output.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)