# Multiple Linear Regression using Scikit-Learn
## CPE 490 590 
### Author: Rahul Bhadani

Note: The multiple linear regression used in this notebook, doesn'thave bias term $w_0$, i.e. $w_0 = 0$.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 15

## Read the data

In [2]:
df = pd.read_csv('Dataset/Concrete_Compressive_Strength/Concrete_Data.csv')
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
df.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

## We are only going to use three features

In [4]:
df_filtered = df[['Cement (component 1)(kg in a m^3 mixture)', 'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
        'Fly Ash (component 3)(kg in a m^3 mixture)']]
y = df[['Concrete compressive strength(MPa, megapascals) ']]
# Separate features and labels
x = df_filtered.values.astype(np.float64)
y = y.values.reshape(-1, 1).astype(np.float64)

In [5]:
df_filtered.shape

(1030, 3)

## Split the Dataset into Training and Testing

In [6]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, test_size = 1/3, random_state = 0)


# Fitting Simple Linear Regression to the training set

In [7]:
regressor = LinearRegression()
regressor.fit(X_Train, Y_Train)

# Predicting the Test set result

In [8]:
Y_Pred = regressor.predict(X_Test)


#  $R^2$, coefficient of determination

In [9]:
regressor.score(x, y)

0.4028652202528764

# Coefficients

In [10]:
regressor.coef_, regressor.intercept_

(array([[0.12429187, 0.08808451, 0.09133734]]), array([-10.59158372]))

# Mean Squared Error

In [11]:
import sklearn.metrics as sm
# error
e= sm.mean_squared_error(Y_Test, Y_Pred)
print("MSE = {}".format(e))

MSE = 159.4968526436981


# L2 Regularization

In [12]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
ridgeModel = Lasso(alpha = 5)
ridgeModel.fit(X_Train, Y_Train)
ridgeModel.score(X_Test, Y_Test)

0.38617537093913956

# Now Use All Features

In [13]:
df_filtered = df[['Cement (component 1)(kg in a m^3 mixture)', 'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
        'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)']]
y = df[['Concrete compressive strength(MPa, megapascals) ']]
# Separate features and labels
x = df_filtered.values.astype(np.float64)
y = y.values.reshape(-1, 1).astype(np.float64)

In [14]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x, y, test_size = 1/3, random_state = 0)



In [15]:
regressor = LinearRegression()
regressor.fit(X_Train, Y_Train)

In [16]:
Y_Pred = regressor.predict(X_Test)


In [17]:
regressor.score(x, y)

0.6139509974771105

In [18]:
regressor.coef_, regressor.intercept_

(array([[ 0.1244788 ,  0.10816318,  0.0940972 , -0.11776245,  0.38679612,
          0.0243998 ,  0.02529243,  0.10774492]]),
 array([-41.80447231]))

In [19]:
import sklearn.metrics as sm
# error
e= sm.mean_squared_error(Y_Test, Y_Pred)
print("MSE = {}".format(e))

MSE = 98.97039111843102


In [20]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
ridgeModel = Lasso(alpha = 5)
ridgeModel.fit(X_Train, Y_Train)
ridgeModel.score(X_Test, Y_Test)

0.6228036684175421