In [13]:
# import 'pandas' library as 'pd'
import pandas as pd

# read the csv file using "pd.read_csv(filename.csv)"
dataset = pd.read_csv('50_Startups.csv')

dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [14]:
### why we do this 'get_dummies' step - Its Nominal data in Multi linear regression
### So single nominal data column will expand more than 2 columns or many based on its column values
# Get dummies using "pd.get_dummies()"
# dataset - our dataset from csv file
# drop_first - used to remove first column of dummies
# dtype - used to convert 'True & False' to '0 & 1'
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)

In [15]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [16]:
# .columns - used to get all the columns name in the dataset
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [17]:
# Split the inputs(independent) and outputs(dependent)
independent = dataset[['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida', 'State_New York']]
print(independent.head(2))

dependent = dataset[['Profit']]
print(dependent.head(2))

   R&D Spend  Administration  Marketing Spend  State_Florida  State_New York
0   165349.2       136897.80        471784.10              0               1
1   162597.7       151377.59        443898.53              0               0
      Profit
0  192261.83
1  191792.06


In [19]:
# import the 'train_test_split' method from 'sklearn.model_selection' library
from sklearn.model_selection import train_test_split

# Set input and out values, test_size and randowm_state to train_test_split()
# test_size - used to defile how much of data we will use for testing - Ex: here we use 30% data for testing remaining will use for training the model
# train_test_split() - will give four values and we can store it like below variable names
X_train, X_test, Y_train, Y_test = train_test_split(independent, dependent, test_size=0.30, random_state=0)

X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
7,130298.13,145530.06,323876.68,1,0
14,119943.24,156547.42,256512.92,1,0
45,1000.23,124153.04,1903.93,0,1
48,542.05,51743.15,0.0,0,1
29,65605.48,153032.06,107138.38,0,1


In [20]:
# Here we do 'preprocessing' the dataset due to low r2_score 
# We will do this step after find the 'r_score' and if there is chance to improve the model then we do this 'standardization'
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train

# After 'standardization' then give those data to our Model to process further

array([[ 1.17644103,  0.84515251,  0.94354978,  2.        , -0.76870611],
       [ 0.96420324,  1.27283565,  0.42738817,  2.        , -0.76870611],
       [-1.47369826,  0.0153175 , -1.52350329, -0.5       ,  1.30088727],
       [-1.48308929, -2.79556363, -1.53809178, -0.5       ,  1.30088727],
       [-0.14952431,  1.13637282, -0.71716495, -0.5       ,  1.30088727],
       [ 0.85312042, -0.04431628,  0.46771725, -0.5       ,  1.30088727],
       [-0.22353674, -0.3151007 , -0.83981652,  2.        , -0.76870611],
       [-0.19454707,  0.21199679, -1.18497259, -0.5       , -0.76870611],
       [ 0.10478723, -0.08388412,  0.48740807, -0.5       , -0.76870611],
       [-1.0096458 , -1.07019473, -0.4040623 , -0.5       , -0.76870611],
       [ 0.06872897, -0.38396487,  0.75036616, -0.5       , -0.76870611],
       [-1.17638797,  0.14067421, -1.26581817, -0.5       ,  1.30088727],
       [ 0.97648631,  0.9689421 ,  0.84958395, -0.5       ,  1.30088727],
       [ 0.39131191,  0.45560401,  0.3

In [21]:
# import 'SVR' method from 'sklearn.svm' library
from sklearn.svm import SVR

# set kernel => rbf | linear | poly | sigmoid | precomputed => default is 'rbf'
# 'C' is Regularization parameter - Allows given number of errors
regressor = SVR(kernel='rbf', C=1000)

# set training values to model for taining purpose
regressor.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [22]:
# get 'bias' value using '.intercept_'
print(regressor.intercept_)

# get total supports
print(regressor.n_support_)

# get all the supporters
print(regressor.support_, len(regressor.support_))

[108599.21691873]
[35]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34] 35


In [23]:
# Using the trained Model, get the Predictions based on testing values
Y_pred = regressor.predict(X_test)

In [12]:
# After predictions completed, WE have to Evaluate the Model
# So that we use method called 'r2_score' and get it from 'sklearn.metrics' library
from sklearn.metrics import r2_score

# assign the r2_score to new variable with actual_output_values(Y_test) and predicted_output_values(Y_pred)
r_score = r2_score(Y_test, Y_pred)

# print the r_score
print(r_score)

# If score is near 1, it is Good Model
# If scode is near 0. it is low model

0.0067683444800727965