In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [3]:
# Load the startups dataset from a CSV file in the working directory.
df = pd.read_csv("50_startups.csv")

In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# Label-encode State: every distinct state name gets an integer code in order
# of first appearance.
# The mapped column is assigned back to `df` instead of calling
# replace(..., inplace=True) on a column selection: that form is chained
# in-place mutation, which is deprecated and leaves `df` unchanged when pandas
# Copy-on-Write is enabled.
# Re-running the cell is harmless: the codes then map onto themselves.
x = df["State"].unique()
y = np.arange(len(x))
df["State"] = df["State"].map(dict(zip(x, y)))

In [6]:
# Re-inspect the frame to check the State column after encoding.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [7]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     int32  
 4   Profit           50 non-null     float64
dtypes: float64(4), int32(1)
memory usage: 1.9 KB


In [8]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
count,50.0,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,0.98,112012.6392
std,45902.256482,28017.802755,122290.310726,0.820403,40306.180338
min,0.0,51283.14,0.0,0.0,14681.4
25%,39936.37,103730.875,129300.1325,0.0,90138.9025
50%,73051.08,122699.795,212716.24,1.0,107978.19
75%,101602.8,144842.18,299469.085,2.0,139765.9775
max,165349.2,182645.56,471784.1,2.0,192261.83


In [9]:
# Create an unfitted linear regression estimator with default settings.
model = LinearRegression()

In [10]:
# Features are every column except the last one; the target is the last column.
# Slicing with `-1:` (rather than indexing with `-1`) keeps the target as a
# one-column DataFrame instead of a Series.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1:]

In [11]:
# Preview the feature matrix.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,0
1,162597.7,151377.59,443898.53,1
2,153441.51,101145.55,407934.54,2
3,144372.41,118671.85,383199.62,0
4,142107.34,91391.77,366168.42,2


In [12]:
# Preview the target column.
Y.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [13]:
# Fit the regression on every row of X against Y (no hold-out set here).
model.fit(X,Y)

LinearRegression()

In [14]:
# Fitted coefficients, one per column of X (2-D because Y is a one-column frame).
model.coef_

array([[ 8.06048883e-01, -2.69870879e-02,  2.70265084e-02,
         1.18514656e+02]])

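Here, profit = m1 × (R&D Spend) + m2 × (Administration) + m3 × (Marketing Spend) + m4 × (State) + constant, where m1…m4 are the entries of `model.coef_` above and the constant is `model.intercept_`.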

In [15]:
# Fitted intercept, i.e. the constant term of the regression equation.
model.intercept_

array([50044.73578848])

In [16]:
# Show the dimensions of the feature matrix and of the target, space-separated.
print(f"{X.shape} {Y.shape}")

(50, 4) (50, 1)


In [17]:
# R^2 on the same X, Y the model was fitted on, so this is a training score,
# not a held-out estimate.
model.score(X,Y)

0.9507516438334586

Now let's look at saving the model.
Once we save the trained model to a binary file, we can reuse it in other files and share it with others.
We can save it using either pickle or joblib.

Let us try with pickle

In [18]:
import pickle

In [19]:
# Serialise the fitted model into a binary file named "model_pickle";
# the `with` block closes the file once the dump is done.
with open("model_pickle","wb") as file:
    pickle.dump(model,file)

In [20]:
# Read the model back from the "model_pickle" file into `mod`.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# unpickle files that come from a trusted source.
with open("model_pickle","rb") as file:
    mod = pickle.load(file)

In [21]:
# Coefficients of the model reloaded from the pickle file.
mod.coef_

array([[ 8.06048883e-01, -2.69870879e-02,  2.70265084e-02,
         1.18514656e+02]])

In [22]:
# Intercept of the model reloaded from the pickle file.
mod.intercept_

array([50044.73578848])

Let us now do it with Joblib

In [23]:
import joblib

In [24]:
# Persist the fitted model with joblib; the call's return value (shown as the
# cell output) is the list of file names that were written.
joblib.dump(model, filename="model_joblib")

['model_joblib']

In [25]:
# Load the model back from the "model_joblib" file into `mj`.
mj = joblib.load("model_joblib")

In [26]:
# Coefficients of the model reloaded with joblib.
mj.coef_

array([[ 8.06048883e-01, -2.69870879e-02,  2.70265084e-02,
         1.18514656e+02]])

Now, let's also look at `train_test_split` on the dataset. We will first make a few changes to the dataset, one-hot encoding the State column (here with `pd.get_dummies`).

In [27]:
# Reload the raw CSV into a separate frame so State holds the original
# string labels rather than the integer codes written into `df` earlier.
df_new = pd.read_csv("50_startups.csv")

In [28]:
# Preview the reloaded, unencoded data.
df_new.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [30]:
# One-hot encode State: one indicator column per distinct state name.
dummies = pd.get_dummies(df_new["State"])

In [32]:
# Attach the indicator columns to the frame they were derived from.
# Uses `df_new` (the freshly reloaded data) rather than `df`, whose State column
# was overwritten in place earlier; this section then no longer depends on
# that earlier mutation having been run.
merged = pd.concat([df_new, dummies], axis="columns")

In [33]:
# Remove the original State column and the Florida indicator, leaving
# California and New York as the state indicators (presumably Florida is
# dropped as the baseline category so the dummies are not perfectly collinear).
final = merged.drop(columns=["State", "Florida"])

In [34]:
# Preview the one-hot encoded frame.
final.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0
2,153441.51,101145.55,407934.54,191050.39,0,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,0,0


In [36]:
# Separate target and predictors: Y is the Profit column as a Series,
# X is every other column of `final`.
Y = final["Profit"]
X = final.drop(columns=["Profit"])

In [37]:
# Preview the one-hot encoded feature matrix.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,1,0
2,153441.51,101145.55,407934.54,0,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,0,0


In [38]:
# Preview the target Series.
Y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Fixing `random_state` makes the shuffle
# deterministic, so the split (and the score computed from it) is the same on
# every Restart & Run All instead of changing from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [41]:
# Start from a new, unfitted estimator for the train/test experiment
# (this rebinds `model`, replacing the estimator fitted earlier).
model = LinearRegression()

In [43]:
# Fit on the training split only; the test rows stay unseen.
model.fit(X_train,Y_train)

LinearRegression()

In [44]:
# R^2 on the held-out test split.
model.score(X_test,Y_test)

0.8967114747346867