# Multiple linear regression

In [1]:
import numpy as np
import pandas as pd
# `plt` must refer to the pyplot module: the top-level `matplotlib` package
# does not provide plot()/subplots()/show(), so aliasing it as `plt` would
# make any later plotting call fail with AttributeError.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load the tips dataset and preview its first rows.
# NOTE(review): the path looks like a Colab Google Drive mount; no mount step
# is visible in this notebook, so confirm the drive is mounted before running.
Dataset1 = pd.read_csv("/content/drive/MyDrive/MyPersonalDataset/tips.csv")
Dataset1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# (number of rows, number of columns) of the loaded frame
Dataset1.shape

(244, 7)

In [3]:
# Count the missing entries in each column.
Dataset1.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [4]:
# Input side of the input/output split: the predictor columns
# total_bill and size.
X = Dataset1.loc[:, ['total_bill', 'size']]
X

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4
...,...,...
239,29.03,3
240,27.18,2
241,22.67,2
242,17.82,2


In [5]:
# Output side of the split: the tip column, kept as a one-column DataFrame.
y = Dataset1.loc[:, ['tip']]
y

Unnamed: 0,tip
0,1.01
1,1.66
2,3.50
3,3.31
4,3.61
...,...
239,5.92
240,2.00
241,2.00
242,1.75


# Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Keep 75% of the rows for training, which leaves the remaining 25% for
# testing. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [7]:
# One shape per line: train inputs, test inputs, train target, test target.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(183, 2)
(61, 2)
(183, 1)
(61, 1)


In [None]:
# Display the training inputs; the row labels are the original row labels
# in shuffled order.
X_train

Unnamed: 0,total_bill,size
115,17.31,2
181,23.33,2
225,16.27,2
68,20.23,2
104,20.92,2
...,...,...
106,20.49,2
14,14.83,2
92,5.75,2
179,34.63,2


In [8]:
# model building

In [9]:
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator created with scikit-learn's default settings.
model1 = LinearRegression()

In [10]:
# Training phase: fit the model on the training split only.
model1.fit(X=X_train, y=y_train)

In [11]:
# Testing phase: predict the target for the held-out rows.
y_pred = model1.predict(X=X_test)
# Preview the first five predictions.
y_pred[:5]

array([[2.89795206],
       [1.84870778],
       [3.83076077],
       [3.95799945],
       [2.25036419]])

In [15]:
# First five actual target values from the test split, for comparison
# with the predictions.
y_test.head()

Unnamed: 0,tip
24,3.18
6,2.0
153,2.0
211,5.16
198,2.0


In [13]:
# performance checking
from sklearn.metrics import r2_score

In [14]:
# Coefficient of determination (R^2) on the test split; 1.0 is a perfect fit.
print(r2_score(y_true=y_test, y_pred=y_pred))

0.4179983009893019


In [16]:
# Mean squared error between the actual and predicted test targets.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

0.7743095126500432


In [17]:
# Root mean squared error: the square root of the MSE, which expresses the
# error in the same units as the target column.
import math
math.sqrt(mse)

0.8799485852310027

# Homework: Multiple Linear Regression on the Insurance Dataset

In [18]:
import numpy as np
import pandas as pd
# `plt` must refer to the pyplot module: the top-level `matplotlib` package
# does not provide plot()/subplots()/show(), so aliasing it as `plt` would
# make any later plotting call fail with AttributeError.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load the insurance dataset and preview its first rows.
# NOTE: this rebinds `Dataset1`, so the tips frame loaded earlier in the
# notebook is no longer reachable under that name after this cell runs.
# NOTE(review): the path looks like a Colab Google Drive mount; no mount step
# is visible in this notebook, so confirm the drive is mounted before running.
Dataset1 = pd.read_csv("/content/drive/MyDrive/MyPersonalDataset/insurance.csv")
Dataset1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
# (number of rows, number of columns) of the loaded frame
Dataset1.shape

(1338, 7)

In [20]:
# Count the missing entries in each column.
Dataset1.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [21]:
# Input side of the input/output split: the numeric predictors age, bmi and
# children. The text-valued columns (sex, smoker, region) are not used here.
X = Dataset1.loc[:, ['age', 'bmi', 'children']]
X

Unnamed: 0,age,bmi,children
0,19,27.900,0
1,18,33.770,1
2,28,33.000,3
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,3
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [23]:
# Output side of the split: the charges column, kept as a one-column DataFrame.
y = Dataset1.loc[:, ['charges']]
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [24]:
# Train Test Splitting

In [25]:
from sklearn.model_selection import train_test_split

# Keep 75% of the rows for training, which leaves the remaining 25% for
# testing. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [26]:
# One shape per line: train inputs, test inputs, train target, test target.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(1003, 3)
(335, 3)
(1003, 1)
(335, 1)


In [27]:
# Display the training inputs; the row labels are the original row labels
# in shuffled order.
X_train

Unnamed: 0,age,bmi,children
693,24,23.655,0
1297,28,26.510,2
634,51,39.700,1
1022,47,36.080,1
178,46,28.900,2
...,...,...,...
1095,18,31.350,4
1130,39,23.870,5
1294,58,25.175,0
860,37,47.600,2


In [28]:
# model building

In [29]:
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator created with scikit-learn's default settings.
# NOTE: rebinds `model1`, replacing any estimator previously stored under that name.
model1 = LinearRegression()

In [30]:
# Training phase: fit the model on the training split only.
model1.fit(X=X_train, y=y_train)

In [31]:
# Testing phase: predict the target for the held-out rows.
y_pred = model1.predict(X=X_test)
# Preview the first five predictions.
y_pred[:5]

array([[13418.54626725],
       [11622.83868919],
       [17103.27393013],
       [14495.74898025],
       [ 8294.30385086]])

In [33]:
# First five actual target values from the test split, for comparison
# with the predictions.
y_test.head()

Unnamed: 0,charges
764,9095.06825
887,5272.1758
890,29330.98315
1293,9301.89355
259,33750.2918


In [34]:
# performance checking
from sklearn.metrics import r2_score

In [35]:
# Coefficient of determination (R^2) on the test split; 1.0 is a perfect fit.
# NOTE(review): the recorded score for this run is low. The model only sees
# age, bmi and children, so the unused categorical columns may be worth
# encoding and adding -- to be confirmed by experiment.
print(r2_score(y_true=y_test, y_pred=y_pred))

0.12733318587824705


In [36]:
# Mean squared error between the actual and predicted test targets.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

131677690.16518274


In [37]:
# Root mean squared error: the square root of the MSE, which expresses the
# error in the same units as the target column.
import math
math.sqrt(mse)

11475.089985058188