Import the data set

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

Load the data set

In [2]:
data = pd.read_csv("50_Startups.csv")
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


Some information about the data set

In [3]:
data.shape

(50, 5)

In [4]:
data.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

Some statistical information about the dataset

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [7]:
data["State"].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

Load the label encoder function

In [8]:
label_encode = LabelEncoder()
print("Load the Label Encoder function")

Load the Label Encoder function


In [9]:
data["State"] = label_encode.fit_transform(data["State"])

In [10]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [11]:
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
count,50.0,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,1.0,112012.6392
std,45902.256482,28017.802755,122290.310726,0.832993,40306.180338
min,0.0,51283.14,0.0,0.0,14681.4
25%,39936.37,103730.875,129300.1325,0.0,90138.9025
50%,73051.08,122699.795,212716.24,1.0,107978.19
75%,101602.8,144842.18,299469.085,2.0,139765.9775
max,165349.2,182645.56,471784.1,2.0,192261.83


Spliting the data set into feature and target

In [12]:
x = data.drop(columns=["Profit"],axis=1)
y = data["Profit"]


In [13]:
print(x)

    R&D Spend  Administration  Marketing Spend  State
0   165349.20       136897.80        471784.10      2
1   162597.70       151377.59        443898.53      0
2   153441.51       101145.55        407934.54      1
3   144372.41       118671.85        383199.62      2
4   142107.34        91391.77        366168.42      1
5   131876.90        99814.71        362861.36      2
6   134615.46       147198.87        127716.82      0
7   130298.13       145530.06        323876.68      1
8   120542.52       148718.95        311613.29      2
9   123334.88       108679.17        304981.62      0
10  101913.08       110594.11        229160.95      1
11  100671.96        91790.61        249744.55      0
12   93863.75       127320.38        249839.44      1
13   91992.39       135495.07        252664.93      0
14  119943.24       156547.42        256512.92      1
15  114523.61       122616.84        261776.23      2
16   78013.11       121597.55        264346.06      0
17   94657.16       145077.5

In [14]:
print(y)

0     192261.83
1     191792.06
2     191050.39
3     182901.99
4     166187.94
5     156991.12
6     156122.51
7     155752.60
8     152211.77
9     149759.96
10    146121.95
11    144259.40
12    141585.52
13    134307.35
14    132602.65
15    129917.04
16    126992.93
17    125370.37
18    124266.90
19    122776.86
20    118474.03
21    111313.02
22    110352.25
23    108733.99
24    108552.04
25    107404.34
26    105733.54
27    105008.31
28    103282.38
29    101004.64
30     99937.59
31     97483.56
32     97427.84
33     96778.92
34     96712.80
35     96479.51
36     90708.19
37     89949.14
38     81229.06
39     81005.76
40     78239.91
41     77798.83
42     71498.49
43     69758.98
44     65200.33
45     64926.08
46     49490.75
47     42559.73
48     35673.41
49     14681.40
Name: Profit, dtype: float64


Spliting the data into train test split

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=3)

In [16]:
print(x.shape,x_train.shape,x_test.shape)

(50, 4) (40, 4) (10, 4)


In [17]:
scaler = StandardScaler()
print("Load The Model")

Load The Model


In [21]:
x_train = scaler.fit_transform(x_train)

In [22]:
print(x_train)

[[-1.08893857  0.25772204 -0.25548125 -0.06213698]
 [-0.21101013 -0.57623306  0.63627087 -0.06213698]
 [-0.74095914 -2.70828614 -0.29073595 -1.30487651]
 [ 0.3359706  -0.24646014  0.55152707 -0.06213698]
 [ 1.4734102  -1.13830192  1.16456395 -0.06213698]
 [-0.27578555  0.74689393 -0.79895646 -1.30487651]
 [ 0.0257146   0.04405206  0.28846754 -1.30487651]
 [ 0.85037682  0.08395043  0.2663563   1.18060256]
 [-0.336098   -0.18909602 -1.20190139 -0.06213698]
 [ 0.53750977 -1.12269     0.16283374 -1.30487651]
 [ 1.20667572  0.98084953  0.80067883 -0.06213698]
 [-1.08689286 -0.07539373 -0.49924714 -1.30487651]
 [-0.10767308  0.28936313  1.05284222  1.18060256]
 [-0.68774149  1.45698318 -0.17227165 -1.30487651]
 [-1.70665429 -0.18224868  0.57041227 -0.06213698]
 [ 1.24233537 -0.80860023  1.13610946  1.18060256]
 [-1.38613883  0.27048628 -1.68026777  1.18060256]
 [ 1.93622571  1.20974116  1.83336666 -1.30487651]
 [-0.69704846 -1.38663994 -0.2177019   1.18060256]
 [ 1.72941496 -0.75650676  1.52

In [23]:
print(x_train.std())

1.0


In [24]:
x_test = scaler.transform(x_test)
print(x_test)

[[ 0.38373282  0.26806252  0.16365019 -0.06213698]
 [-0.86544597 -1.46748456 -0.48028675 -1.30487651]
 [ 1.04939687 -0.46161599  0.63810261 -1.30487651]
 [-1.73636657  0.58537927 -1.98600964 -1.30487651]
 [-0.35547989  1.26157981 -1.22696541  1.18060256]
 [-0.24446169  2.43367276 -0.969443   -0.06213698]
 [ 0.34146443  0.58804688  0.18796117 -1.30487651]
 [-1.72412329 -2.69027983 -1.98600964  1.18060256]
 [-1.71377438  0.14408238 -1.96962791  1.18060256]
 [ 1.3041912   1.04617226 -0.88711302 -1.30487651]]


In [25]:
print(x_test.std())

1.1809177655865086


In [26]:
model = LinearRegression()
print("Load The Model")

Load The Model


In [28]:
model_train = model.fit(x_train,y_train)
print("Model is trainned")

Model is trainned


In [29]:
model_pred = model_train.predict(x_test)

In [30]:
print(model_pred)

[128702.69304691  85659.2266389  156171.59195638  45645.02919205
  95665.57123074  98241.86668068 126938.46967509  52353.72786651
  46648.50311576 158355.76233462]


In [31]:
print(y_test)

12    141585.52
39     81005.76
9     149759.96
47     42559.73
31     97483.56
28    103282.38
13    134307.35
48     35673.41
45     64926.08
6     156122.51
Name: Profit, dtype: float64


In [33]:
model_acc = r2_score(y_test,model_pred)

In [34]:
print(model_acc)

0.9465912289066153
