In [1]:
import pandas as pd 
import sklearn 
from sklearn.linear_model import LinearRegression, Lasso, Ridge 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 

In [2]:
# Load the shop-level dataset from the CSV expected next to this notebook.
df = pd.read_csv(filepath_or_buffer='supershops.csv')
# Keep the preview as the cell's last expression so it renders as a table.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
# Discard every row that contains at least one missing value.
# Rebinding `df` (instead of mutating it with inplace=True) follows pandas'
# guidance to avoid in-place operations and keeps the step chain-friendly.
df = df.dropna()

In [4]:
# Feature matrix: everything except the target (`Profit`) and the
# categorical `Area` column, which leaves the three numeric predictors.
x = df.drop(columns=['Profit', 'Area'])

In [5]:
# Preview the first rows of the feature frame (Profit and Area already removed).
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [6]:
# Target as a one-column DataFrame: the list selector keeps it 2-D
# instead of collapsing it to a Series.
y = df.loc[:, ['Profit']]

In [7]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Sanity-check the training split size as (rows, columns).
xtrain.shape

(36, 3)

In [9]:
# Peek at the training features; row labels are the original DataFrame index,
# now in shuffled order.
xtrain.head()

Unnamed: 0,Marketing Spend,Administration,Transport
28,66051.52,182645.56,118148.2
49,0.0,116983.8,45173.06
2,153441.51,101145.55,407934.54
47,0.0,135426.92,0.0
18,91749.16,114175.79,294919.57


In [10]:
# Peek at the training targets that correspond to the rows of xtrain.
ytrain.head()

Unnamed: 0,Profit
28,103282.38
49,14681.4
2,191050.39
47,42559.73
18,124266.9


In [11]:
# Pairwise Pearson correlations between the numeric columns only.
# `Area` holds city names; pandas >= 2.0 no longer drops non-numeric columns
# silently in `corr()` and raises instead, so select numeric dtypes first.
df.select_dtypes(include='number').corr()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.227141,0.718574,0.937853
Administration,0.227141,1.0,0.009534,0.197201
Transport,0.718574,0.009534,1.0,0.782578
Profit,0.937853,0.197201,0.782578,1.0


# OLS (Ordinary Least Squares) Linear Regression

In [12]:
# Baseline: ordinary least squares on the raw features
# (equivalent to a polynomial regression of degree 1).
model = LinearRegression()
model.fit(X=xtrain, y=ytrain)

In [13]:
# Predict profit for the held-out test rows with the OLS model.
y_pred = model.predict(X=xtest)

In [14]:
# Display the OLS predictions for the test split.
y_pred

array([[ 90707.18524202],
       [166377.24276987],
       [124018.59727829],
       [ 93252.51801345],
       [ 97588.41924098],
       [ 68948.39245538],
       [ 89037.14295216],
       [ 73472.98068535],
       [159657.23912108],
       [129149.84623501],
       [128674.69774365],
       [ 88409.42998689],
       [ 96436.18820079]])

In [15]:
# Mean squared error of the OLS predictions on the test split.
mse = mean_squared_error(y_true=ytest, y_pred=y_pred)
print('MSE:', mse)

MSE: 101360809.28512491


In [16]:
# R^2 (coefficient of determination) of the OLS model on the held-out test split.
model.score(xtest, ytest)

0.8744319145336104

# Polynomial Regression

In [17]:
# Correlation matrix of the numeric columns, shown again before adding
# polynomial terms. Numeric dtypes are selected explicitly because `Area` is a
# string column and pandas >= 2.0 raises if `corr()` sees non-numeric data.
df.select_dtypes(include='number').corr()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.227141,0.718574,0.937853
Administration,0.227141,1.0,0.009534,0.197201
Transport,0.718574,0.009534,1.0,0.782578
Profit,0.937853,0.197201,0.782578,1.0


In [18]:
from sklearn.preprocessing import PolynomialFeatures

In [19]:
# Reminder of the raw feature columns that the polynomial expansion starts from.
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport
0,114523.61,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# Degree 2

In [20]:
# Degree-2 expansion of the three features: a bias column, the original
# features, and all squares / pairwise products.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit(x).transform(x)

In [21]:
# Wrap the transformed array in a DataFrame purely to get a readable preview
# of the expanded columns.
pd.DataFrame(X_poly).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,114523.61,136897.8,471784.1,13115660000.0,15678030000.0,54030420000.0,18741010000.0,64586210000.0,222580200000.0
1,1.0,162597.7,151377.59,443898.53,26438010000.0,24613650000.0,72176880000.0,22915170000.0,67196290000.0,197045900000.0
2,1.0,153441.51,101145.55,407934.54,23544300000.0,15519930000.0,62594090000.0,10230420000.0,41260760000.0,166410600000.0
3,1.0,144372.41,118671.85,383199.62,20843390000.0,17132940000.0,55323450000.0,14083010000.0,45475010000.0,146841900000.0
4,1.0,142107.34,91391.77,366168.42,20194500000.0,12987440000.0,52035220000.0,8352456000.0,33464780000.0,134079300000.0


In [22]:
# Re-split using the degree-2 features. The same test_size and random_state as
# the first split are used; note this overwrites the earlier xtrain/xtest/ytrain/ytest.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly,
    y,
    test_size=0.25,
    random_state=0,
)

In [23]:
# Confirm the training matrix now carries the expanded polynomial columns.
xtrain.shape

(36, 10)

In [24]:
# Linear regression on the degree-2 polynomial features.
model2 = LinearRegression()
model2.fit(X=xtrain, y=ytrain)

In [25]:
# Predict profit for the test rows with the degree-2 model.
y_pred2 = model2.predict(X=xtest)

In [26]:
# Display the degree-2 model's predictions for the test split.
y_pred2 

array([[101606.42693299],
       [164653.76826556],
       [131119.04610867],
       [104593.86990985],
       [ 99609.91567357],
       [ 68898.23379729],
       [ 92076.37087874],
       [ 72287.07989273],
       [154325.98721725],
       [128236.41897809],
       [128001.27841803],
       [110500.67879367],
       [ 76394.9778894 ]])

In [27]:
# Test-split mean squared error of the degree-2 model (rebinds `mse`).
mse = mean_squared_error(y_true=ytest, y_pred=y_pred2)
print('MSE:', mse)

MSE: 162866427.0974326


In [28]:
# R^2 of the degree-2 model on the held-out test split.
model2.score(xtest, ytest)  # testing score

0.7982373504946239

In [29]:
# R^2 on the training split; compare it with the test score to gauge overfitting.
model2.score(xtrain, ytrain)  # training score

0.9505396347868063

# Degree 3

In [30]:
# Degree-3 expansion of the same three features (adds cubic and mixed
# third-order terms on top of the degree-2 columns).
poly = PolynomialFeatures(degree=3)  # polynomial regression with degree 3
X_poly_deg3 = poly.fit_transform(x)

In [31]:
# (rows, columns) of the degree-3 design matrix.
X_poly_deg3.shape

(49, 20)

In [32]:
# Re-split using the degree-3 features, with the same test_size and
# random_state as before; this overwrites xtrain/xtest/ytrain/ytest again.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_poly_deg3,
    y,
    test_size=0.25,
    random_state=0,
)

In [33]:
# Linear regression on the degree-3 features. This rebinds `model2`, so the
# degree-2 estimator is no longer reachable under that name.
model2 = LinearRegression()
model2.fit(X=xtrain, y=ytrain)

In [34]:
# R^2 of the degree-3 model on the held-out test split.
model2.score(xtest, ytest)  # testing score

0.61507564077777

In [35]:
# R^2 of the degree-3 model on the training split; the gap to the test score
# indicates how much the model overfits.
model2.score(xtrain, ytrain)  # training score

0.9650079946684086

# Let's talk about Regularization

In [36]:
# Rebuild the raw (non-polynomial) target and features for the regularization section.
y = df.loc[:, ['Profit']]
x = df.drop(columns=['Profit', 'Area'])

In [37]:
# Split the raw features again (25% test, fixed seed) so the regularized models
# are trained and evaluated on the non-polynomial data.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [38]:
# Confirm the training features are back to the three raw columns.
xtrain.head()

Unnamed: 0,Marketing Spend,Administration,Transport
28,66051.52,182645.56,118148.2
49,0.0,116983.8,45173.06
2,153441.51,101145.55,407934.54
47,0.0,135426.92,0.0
18,91749.16,114175.79,294919.57


# L1 Regularization with Linear Regression - Lasso

In [39]:
# L1-penalized linear regression. alpha=1.0 is scikit-learn's default penalty
# strength, written out here so the setting is visible.
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X=xtrain, y=ytrain)

# L2 Regularization with Linear Regression - Ridge

In [40]:
# L2-penalized linear regression. alpha=1.0 is scikit-learn's default penalty
# strength, written out here so the setting is visible.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X=xtrain, y=ytrain)

# Performance

In [41]:
# Score both regularized models on the same held-out test split.
# Step 1: predictions for each model.
lasso_predictions = lasso_model.predict(X=xtest)
ridge_predictions = ridge_model.predict(X=xtest)

# Step 2: mean squared error of each set of predictions.
lasso_mse = mean_squared_error(y_true=ytest, y_pred=lasso_predictions)
ridge_mse = mean_squared_error(y_true=ytest, y_pred=ridge_predictions)

# Step 3: report the two errors for comparison.
print('Lasso MSE:', lasso_mse)
print('Ridge MSE:', ridge_mse)

Lasso MSE: 101360808.35784721
Ridge MSE: 101360809.28872341


In [42]:
# R^2 of the Lasso model on the held-out test split.
lasso_model.score(xtest, ytest)

0.8744319156823431

In [43]:
# R^2 of the Ridge model on the held-out test split.
ridge_model.score(xtest, ytest)

0.8744319145291525

In [None]:
# Same value as the OLS test score printed by model.score(xtest, ytest) earlier;
# presumably pasted here for side-by-side comparison with Lasso and Ridge.
0.8744319145336104

In [None]:
# Recall:
# 1st class -> linear regression
# 2nd class -> linear regression with gradient descent
# 3rd class -> linear vs. non-linear relationships