In [34]:
# import 'pandas' library as 'pd'
import pandas as pd

# Load the startups CSV into a DataFrame using "pd.read_csv(filename.csv)"
# (the relative path presumably resolves against the notebook's working directory)
dataset = pd.read_csv('50_Startups.csv')
# Print the first five rows to check the columns that were loaded
print(dataset.head())

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [20]:
# Why one-hot encode: 'State' is nominal (unordered, categorical) text, and multiple
# linear regression needs numeric inputs. "pd.get_dummies()" expands such a column
# into one 0/1 indicator column per category value.
#   dataset    - the DataFrame loaded from the csv file
#   drop_first - drops the first category's indicator column; that category becomes the
#                baseline (all remaining indicator columns are 0 for it)
#   dtype=int  - emits the indicators as 0 / 1 instead of False / True

dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
# NOTE: pd.get_dummies(dataset, drop_first=True).astype(int) is NOT an equivalent shortcut:
# .astype(int) casts every column, which would truncate the float spend/profit values.

In [21]:
# .head(n) returns the first n rows (default 5); used here to check that 'State'
# was replaced by the 'State_Florida' / 'State_New York' indicator columns
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [22]:
# .columns lists every column name in the dataset; these exact names are used
# when selecting the input and output columns
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [23]:
# Separate the model inputs (independent variables) from the output (dependent variable).
independent = dataset.loc[:, [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'State_Florida',
    'State_New York',
]]
# The list selector keeps 'Profit' as a one-column DataFrame rather than a Series.
dependent = dataset.loc[:, ['Profit']]

In [24]:
# import the 'train_test_split' method from 'sklearn.model_selection' library
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 30% of the rows for testing; the remaining 70% is used to train the model.
# random_state fixes the shuffle so the same split is produced on every run.
# train_test_split() returns four pieces, in this order:
# training inputs, testing inputs, training outputs, testing outputs.
X_train, X_test, Y_train, Y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [26]:
# Preview the training inputs; the row labels are the original dataset indices,
# which appear out of order because the split shuffles the rows
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
7,130298.13,145530.06,323876.68,1,0
14,119943.24,156547.42,256512.92,1,0
45,1000.23,124153.04,1903.93,0,1
48,542.05,51743.15,0.0,0,1
29,65605.48,153032.06,107138.38,0,1


In [28]:
# Import the 'LinearRegression' estimator class from 'sklearn.linear_model'
from sklearn.linear_model import LinearRegression

# create the (not yet trained) linear regression model and assign it to a variable
regressor = LinearRegression()

# train the model on the training inputs and training outputs
regressor.fit(X_train, Y_train)

In [29]:
# '.coef_' holds the learned weights: one per input column, so 5 values here.
# '.intercept_' holds the learned bias term.
weight, bias = regressor.coef_, regressor.intercept_

print(weight, ' | ', bias)

[[7.90840255e-01 3.01968165e-02 3.10148566e-02 4.63028992e+02
  3.04799573e+02]]  |  [42403.87087053]


In [30]:
# Use the trained model to predict the output for the held-out testing inputs
Y_pred = regressor.predict(X_test)

# display the predictions (one row per test sample)
Y_pred

array([[104282.76472172],
       [132536.88499212],
       [133910.85007766],
       [ 72584.77489417],
       [179920.9276189 ],
       [114549.31079234],
       [ 66444.43261346],
       [ 98404.96840122],
       [114499.82808602],
       [169367.50639895],
       [ 96522.6253998 ],
       [ 88040.6718287 ],
       [110949.99405525],
       [ 90419.1897851 ],
       [128020.46250064]])

In [31]:
# Once the predictions are made, we have to evaluate the model.
# For that we use the 'r2_score' function from the 'sklearn.metrics' module.
from sklearn.metrics import r2_score

# compute the R^2 score from the actual outputs (Y_test) and the predicted outputs (Y_pred)
r_score = r2_score(Y_test, Y_pred)

# print the r_score
print(r_score)

# A score near 1 means the model explains most of the variation in the test outputs (good model).
# A score near 0 means it does little better than always predicting the mean (poor model);
# the score can also go negative when the model is worse than that.

0.9358680970046243


In [32]:
# Once the model gets a good score, save it for future use.
# For that we need the 'pickle' library.
import pickle

# set a filename for the model to be saved under
filename = 'finalised_model_multiple_regression.sav'

# Save the trained model with 'pickle.dump()'.
# The 'with' block guarantees the file is flushed and closed once the dump finishes,
# even if pickling raises; a bare open() would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [33]:
# Reload the model from the saved file to check that it was stored correctly.
# NOTE: 'pickle.load()' can execute arbitrary code, so only load files you trust
# (this one was written by this notebook).
# The 'with' block closes the file handle as soon as loading is done.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The model was fitted on a DataFrame, so pass the new inputs as a DataFrame with the
# same column names in the same order. A bare nested list hides which value belongs to
# which input and makes scikit-learn warn that X does not have valid feature names.
new_inputs = pd.DataFrame(
    [[1234, 345, 4565, 1, 0]],
    columns=['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida', 'State_New York'],
)
result = loaded_model.predict(new_inputs)

# display the predicted output
result



array([[43994.79745873]])