# Multivariate Linear Regression with Gradient Descent

In [None]:

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the insurance dataset from a CSV file in the working directory.
main_df = pd.read_csv(filepath_or_buffer="insurance.csv")


# Data Description

In [None]:
# Preview the first five rows of the raw data.
main_df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
main_df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


# Checking missing values

In [None]:
# Count the missing entries in each column.
main_df.isna().sum(axis=0)


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# One hot encoding

In [None]:
# Categorical columns that get expanded into one-hot indicator columns.
column_names_to_one_hot = [
    "sex",
    "smoker",
    "region",
]


In [None]:
# Replace each categorical column with one 0/1 indicator column per category.
# The dtype is pinned to an integer type: newer pandas versions default to bool
# indicator columns, which do not support the subtraction used later for
# min-max scaling and would not match the 0/1 values shown below.
main_df = pd.get_dummies(main_df, columns=column_names_to_one_hot, dtype=np.uint8)


In [None]:
# Inspect the rows labelled 0 through 20 (label slicing is inclusive), all columns.
main_df.loc[:20]


Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0
5,31,25.74,0,3756.6216,1,0,1,0,0,0,1,0
6,46,33.44,1,8240.5896,1,0,1,0,0,0,1,0
7,37,27.74,3,7281.5056,1,0,1,0,0,1,0,0
8,37,29.83,2,6406.4107,0,1,1,0,1,0,0,0
9,60,25.84,0,28923.13692,1,0,1,0,0,1,0,0


In [None]:
# List the column labels after one-hot encoding.
main_df.columns


Index(['age', 'bmi', 'children', 'charges', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

# Checking for duplicate values

In [None]:
# Index labels of rows that repeat an earlier row exactly.
main_df[main_df.duplicated()].index


Int64Index([581], dtype='int64')

In [None]:
# Number of rows that duplicate an earlier row (the first occurrence is not counted).
main_df.duplicated(keep="first").sum()

1

In [None]:
# Remove every row that repeats an earlier row, keeping the first occurrence.
duplicate_row_labels = main_df.index[main_df.duplicated()]
main_df = main_df.drop(index=duplicate_row_labels)


In [None]:
# Re-count duplicated rows to confirm none are left.
main_df.duplicated(keep="first").sum()


0

# Normalization

In [None]:
# Min-max scale every column (features and target) into [0, 1].
# Cast to float first: depending on the pandas version the one-hot columns may be
# bool or uint8, and boolean subtraction is not supported.
numeric_df = main_df.astype(np.float64)
column_min = numeric_df.min()
column_range = numeric_df.max() - column_min
normalized_df = (numeric_df - column_min) / column_range

# Shuffle the rows reproducibly. Shuffling `normalized_df.values` in place only
# has an effect when `.values` happens to be a writable view of the frame's data
# (it may be a copy or read-only), and it moves row contents without moving the
# index labels. `sample(frac=1)` returns every row in a seeded random order.
RANDOM_STATE = 42
normalized_df = normalized_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

In [None]:
# Feature matrix: every normalized column except the target.
feature_matrix = normalized_df.drop(columns="charges").to_numpy(dtype=np.float64)
# Target as a column vector of shape (n, 1). With a 1-D target of shape (n,),
# `X @ theta - y` for a (k, 1) theta broadcasts (n, 1) - (n,) into an (n, n)
# matrix instead of subtracting element-wise.
y = normalized_df["charges"].to_numpy(dtype=np.float64).reshape(-1, 1)
# Prepend a column of ones so that theta[0] acts as the intercept term.
ones = np.ones([feature_matrix.shape[0], 1])
X = np.concatenate((ones, feature_matrix), axis=1)

In [None]:
# Number of samples (rows) in the design matrix.
X.shape[0]

1337

In [None]:
# Number of target values; should equal the number of rows in X.
y.shape[0]

1337

# Train/test split

In [None]:
# Number of rows used for training: 80% of the available samples. The count is
# derived from the data instead of hard-coding the row total.
TRAIN_FRACTION = 0.8
train_num = int(len(X) * TRAIN_FRACTION)

In [None]:
# The first `train_num` rows (80%) train the model; the remaining rows are held
# out for testing. (The slices were previously swapped, which trained on the
# small 20% tail and "tested" on the 80% head.)
train_X, test_X = X[:train_num], X[train_num:]
train_y, test_y = y[:train_num], y[train_num:]

In [None]:
# train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.9)
#

In [None]:
# Display the training design matrix; its first column is the constant 1 (intercept).
train_X


array([[1.        , 0.95652174, 0.3118106 , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.17391304, 0.37772397, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.04347826, 0.38081786, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.2173913 , 0.21092279, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 0.        , 0.41404358, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.10869565, 0.21213344, ..., 0.        , 0.        ,
        0.        ]])

# Initialising Gradient descent

In [None]:
# Gradient-descent hyperparameters.
alpha = 0.00000066   # learning rate (step size applied to the summed gradient)
iters = 100000       # number of gradient-descent iterations
# One parameter per column of the design matrix (intercept included), stored as
# a column vector; sized from the data rather than hard-coded.
theta = np.zeros([train_X.shape[1], 1])
print (theta)


[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [None]:
  def computeCost(X,y,theta):
    """Return the mean squared error of the linear model ``X @ theta`` against ``y``.

    Parameters
    ----------
    X : array of shape (n, k)
        Design matrix, one row per sample.
    y : array of shape (n,) or (n, 1)
        Target values. They are reshaped to the shape of the predictions so a
        1-D target combined with a column-vector ``theta`` is compared
        element-wise instead of being broadcast into an (n, n) matrix.
    theta : array of shape (k,) or (k, 1)
        Model parameters.

    Returns
    -------
    float
        Sum of squared residuals divided by ``len(y)``.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one value per prediction.
    """
    predictions = X @ theta
    # Align the target with the predictions before subtracting; a size mismatch
    # raises here rather than silently broadcasting.
    residuals = predictions - np.asarray(y).reshape(predictions.shape)
    return np.sum(np.square(residuals))/(len(y))

In [None]:
#gradient descent
def gradientDescent(X,y,theta,iters,alpha,log_every=None):
    """Fit ``theta`` for the linear model ``X @ theta`` by batch gradient descent.

    Each iteration applies ``theta <- theta - alpha * X.T @ (X @ theta - y)``,
    i.e. the step uses the gradient summed over all samples (not averaged), and
    then records the cost returned by ``computeCost``.

    Parameters
    ----------
    X : array of shape (n, k)
        Design matrix, one row per sample.
    y : array of shape (n,) or (n, 1)
        Target values. They are reshaped once to the shape of the predictions so
        a 1-D target with a column-vector ``theta`` is subtracted element-wise
        instead of being broadcast into an (n, n) matrix.
    theta : array of shape (k,) or (k, 1)
        Initial parameters; the input array is not modified.
    iters : int
        Number of iterations to run.
    alpha : float
        Learning rate.
    log_every : int, optional
        When a positive integer, print the cost every ``log_every`` iterations.
        The default is silent, because printing on every iteration floods the
        notebook output.

    Returns
    -------
    theta : ndarray
        Fitted parameters, same shape as the initial ``theta``.
    cost : ndarray of shape (iters,)
        Cost recorded after each update.
    """
    X = np.asarray(X)
    theta = np.asarray(theta)
    # Align the target with the prediction shape once, up front.
    y = np.asarray(y).reshape((X @ theta).shape)
    cost = np.zeros(iters)

    for i in range(iters):
        residuals = X @ theta - y
        theta = theta - alpha * (X.T @ residuals)
        cost[i] = computeCost(X, y, theta)
        if log_every and (i + 1) % log_every == 0:
            print(cost[i])

    return theta,cost

# Fit the parameters on the training split and show them.
g, cost = gradientDescent(X=train_X, y=train_y, theta=theta, iters=iters, alpha=alpha)
print(g)

# Cost of the fitted parameters on the training split.
finalCost = computeCost(X=train_X, y=train_y, theta=g)
print(finalCost)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.24930713194686144
0.2492053088129298
0.2491035788542461
0.24900194197783618
0.24890039809081937
0.24879894710040854
0.2486975889139097
0.24859632343872234
0.24849515058233887
0.24839407025234508
0.24829308235641925
0.2481921868023331
0.24809138349795062
0.2479906723512288
0.24789005327021701
0.24778952616305733
0.24768909093798414
0.24758874750332405
0.2474884957674959
0.24738833563901094
0.24728826702647205
0.24718828983857433
0.24708840398410453
0.24698860937194136
0.24688890591105508
0.24678929351050738
0.24668977207945175
0.2465903415271328
0.24649100176288655
0.2463917526961402
0.24629259423641206
0.2461935262933114
0.24609454877653855
0.24599566159588448
0.24589686466123117
0.24579815788255097
0.24569954116990703
0.24560101443345278
0.2455025775834321
0.2454042305301793
0.2453059731841186
0.24520780545576454
0.24510972725572164
0.24501173849468427
0.24491383908343672
0.24481602893285306
0.24471830795389682
0.24462

KeyboardInterrupt: ignored

In [None]:
# Plot the recorded cost against the iteration number.
fig, ax = plt.subplots()
iteration_axis = np.arange(iters)
ax.plot(iteration_axis, cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')