# Multiple Linear Regression

## Importing the libraries

In [0]:
#importing the three required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [43]:
# Load the startup records from the CSV file into a DataFrame.
dataset = pd.read_csv('50_Startups.csv')
# Every column except the last forms the feature matrix X; the last column
# (Profit) is the dependent variable Y that the model will predict.
X, Y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
print(X)

    R&D Spend  Administration  Marketing Spend       State
0   165349.20       136897.80        471784.10    New York
1   162597.70       151377.59        443898.53  California
2   153441.51       101145.55        407934.54     Florida
3   144372.41       118671.85        383199.62    New York
4   142107.34        91391.77        366168.42     Florida
5   131876.90        99814.71        362861.36    New York
6   134615.46       147198.87        127716.82  California
7   130298.13       145530.06        323876.68     Florida
8   120542.52       148718.95        311613.29    New York
9   123334.88       108679.17        304981.62  California
10  101913.08       110594.11        229160.95     Florida
11  100671.96        91790.61        249744.55  California
12   93863.75       127320.38        249839.44     Florida
13   91992.39       135495.07        252664.93  California
14  119943.24       156547.42        256512.92     Florida
15  114523.61       122616.84        261776.23    New Yo

## Encoding categorical data

In [44]:
# The categorical 'State' column (column index 3 of the feature matrix) is
# one-hot encoded into three separate columns, in the order California,
# Florida, New York. Each state is represented by a 3-bit pattern; for
# example, New York is 001.
# remainder='passthrough' keeps the remaining numeric columns, which end up
# after the encoded ones.
#
# The transformer is fitted on the features taken directly from `dataset`
# instead of on `X` itself. This keeps the cell idempotent: on a second run,
# `X` would already be the encoded array, and encoding its column 3 (then
# R&D Spend) would silently corrupt the feature matrix.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(dataset.iloc[:, :-1]))
print(X)

[[0.00e+00 0.00e+00 1.00e+00 1.65e+05 1.37e+05 4.72e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.63e+05 1.51e+05 4.44e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.53e+05 1.01e+05 4.08e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.44e+05 1.19e+05 3.83e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.42e+05 9.14e+04 3.66e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.32e+05 9.98e+04 3.63e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.35e+05 1.47e+05 1.28e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.30e+05 1.46e+05 3.24e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.21e+05 1.49e+05 3.12e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.23e+05 1.09e+05 3.05e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.02e+05 1.11e+05 2.29e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.01e+05 9.18e+04 2.50e+05]
 [0.00e+00 1.00e+00 0.00e+00 9.39e+04 1.27e+05 2.50e+05]
 [1.00e+00 0.00e+00 0.00e+00 9.20e+04 1.35e+05 2.53e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.20e+05 1.57e+05 2.57e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.15e+05 1.23e+05 2.62e+05]
 [1.00e+00 0.00e+00 0.00e+00 7.80e+04 1.22e+05 2.64e+05]
 [0.00e+00 0.00e+00 1.00e+00 9.

## Splitting the dataset into the Training set and Test set

In [45]:
# Split the data with scikit-learn's model-selection helper: 20% of the rows
# are held out as the test set, and random_state=0 makes the split repeatable.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Feature scaling is not needed for simple or multiple linear regression:
# each fitted coefficient absorbs the scale of its own feature.
# If anything in the code so far is unclear, please see the notes in the
# Data Preprocessing and Simple Linear Regression notebooks.
print(y_test)

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64


## Training the Multiple Linear Regression model on the Training set

In [0]:
#The LinearRegression class is used for both simple and multiple linear regression models.
#It fits an ordinary least squares model using every column of x_train: it does NOT
#perform feature selection and does not compute p-values. Selecting significant
#features (e.g. backward elimination on p-values) would have to be done separately.
#NOTE: x_train contains all three state dummy columns, which together with the
#intercept are perfectly collinear (the dummy variable trap). The least-squares fit
#and its predictions still work, but the individual dummy coefficients are not
#uniquely determined; drop one dummy column if the coefficients need to be interpreted.

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [62]:
# Predict the profit for every row of features in the test set (x_test).
y_predicted = regressor.predict(x_test)
# Show all printed numerical values with 2 decimal places.
np.set_printoptions(precision = 2)
# Compare predictions (left column) with the real profits (right column):
# the two 1-D vectors are stacked as the columns of a single 2-D array so
# they can be viewed side by side.
# NOTE: y_test is a pandas Series, so `.values` is used to obtain its
# underlying NumPy array before stacking.
comparison = np.column_stack((y_predicted, y_test.values))
print(comparison)

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]
