In [19]:
import pandas as pd   # import pandas to read the csv file data
# problem definition: we need to predict the costs of the health insurance for individuals so we use our dataset to build our ML model that will 
# help us to predict the costs for individuals
from sklearn.linear_model import LinearRegression  # import the model function
from sklearn.model_selection import train_test_split # to split the data to train part and test part
from sklearn.metrics import mean_squared_error, r2_score # import this functions to help us get MSE and R^2

In [20]:
df = pd.read_csv('insurance.csv')  # read the data from the csv file
# becuase we can not act with strings, we need to map each string value with a special number representing it 
df['sex'] = df['sex'].map({'male': 0, 'female': 1}) 
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['region'] = df['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

x = df.drop(['charges'], axis=1) # to split outputs from inputs we should drop 'charges' from y
y = df['charges'] # then take outputs to a special list

In [27]:
# we then split the data to two parts: the first part we use it to train the model and the second part we use it to predict the result
# we use random_state to fix the chosen data for training to make it easier for debugging
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) 

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,1,19.950,2,0,1
1285,47,1,24.320,0,0,0
1142,52,1,24.860,0,0,2
969,39,1,34.320,5,0,2
486,54,1,21.470,3,0,1
...,...,...,...,...,...,...
1095,18,1,31.350,4,0,0
1130,39,1,23.870,5,0,2
1294,58,0,25.175,0,0,0
860,37,1,47.600,2,1,3


In [22]:
our_model = LinearRegression()  # create our model
our_model.fit(x_train, y_train) # pass the training data to it
y_predict = our_model.predict(x_test) # find the model results for the testing data
MSE = mean_squared_error(y_test, y_predict) # find the mean squared error between the predicted results and the actual results
r2 = r2_score(y_test, y_predict) # find the R^2 of the predicted results
print('mean squared error = %.f' % MSE)
print('R-squared: %.2f' % r2)

mean squared error = 33635210
R-squared: 0.78


In [67]:
# we pass to our function the initial steepness of our variables, and the initail intercept value (b) and our data(points), and our desired learning rate(alpha)
def gradient_descent(m1_now, m2_now, m3_now, m4_now, m5_now, m6_now, b_now, points, alpha):
    # firstly initialize every gradient by 0
    m1_gradient = 0   
    m2_gradient = 0
    m3_gradient = 0
    m4_gradient = 0
    m5_gradient = 0
    m6_gradient = 0
    b_gradient = 0

    n = len(points)  # store the length of our data in a variable to iterate with this value 

    for i in range(n):
        # in each iteration, we store every variable to use it in the gradient descent equations 
        x1 = points.iloc[i].age
        x2 = points.iloc[i].sex
        x3 = points.iloc[i].bmi
        x4 = points.iloc[i].children
        x5 = points.iloc[i].smoker
        x6 = points.iloc[i].region
        y = points.iloc[i].charges 
        # then we calculate the value of each gradient for our parameters using the derivative of the cost equation
        m1_gradient += -(2/n) * x1 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        m2_gradient += -(2/n) * x2 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        m3_gradient += -(2/n) * x3 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        m4_gradient += -(2/n) * x4 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        m5_gradient += -(2/n) * x5 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        m6_gradient += -(2/n) * x6 * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
        b_gradient += -(2/n) * (y - (m1_now * x1 + m2_now * x2 + m3_now * x3 + m4_now * x4 + m5_now * x5 + m6_now * x6 + b_now))
    
    # then we update the value of our steepness values using our learning rate
    m1 = m1_now - m1_gradient * alpha
    m2 = m2_now - m2_gradient * alpha
    m3 = m3_now - m3_gradient * alpha
    m4 = m4_now - m4_gradient * alpha
    m5 = m5_now - m5_gradient * alpha
    m6 = m6_now - m6_gradient * alpha
    b = b_now - b_gradient * alpha

    return m1, m2, m3, m4, m5, m6, b

# intialize the steepness values with any values
m1 = 0
m2 = 0
m3 = 0
m4 = 0
m5 = 0
m6 = 0
b = 0
# make our learning rate slow to have better accuracy
alpha = 0.0001 
# we choose to 
T = 600

for i in range(T):
    m1, m2, m3, m4, m5, m6, b = gradient_descent(m1, m2, m3, m4, m5, m6, b, df, alpha)

print(m1, m2, m3, m4, m5, m6, b)

208.53424734426662 -50.79386514678979 171.63849605449636 70.97616017751956 456.3483540214725 -58.13325569738491 -19.380435883477375


In [68]:
# we need to make a list for the y_pred so we take the len(df) to know the length of the list
n = len(df)
# firstly initialize it with zeros 
y_pred = [0] * n 

# then for each row in the data set take the values of the variables and put it in the model equation to predict y
for i in range(n):
    x1 = df.iloc[i].age
    x2 = df.iloc[i].sex
    x3 = df.iloc[i].bmi
    x4 = df.iloc[i].children
    x5 = df.iloc[i].smoker
    x6 = df.iloc[i].region
    y_pred[i] = m1 * x1 + m2 * x2 + m3 * x3 + m4 * x4 + m5 * x5 + m6 * x6 + b
y_pred

[8962.639025360566,
 9485.177676856414,
 11580.310828692158,
 10701.168523697277,
 11552.501989489525,
 10696.095743689964,
 15216.702033651105,
 12561.639956094501,
 12958.315373515054,
 16818.886021976527,
 9694.337114272084,
 17711.406938213673,
 10506.871750217175,
 18326.12195174394,
 13182.255923814357,
 8061.653659683564,
 16127.615629606513,
 8869.627191454121,
 18401.1690392995,
 12577.43448209756,
 18621.72459006787,
 11643.51678454878,
 9470.842220376879,
 13026.015416933315,
 12590.374632439303,
 17087.82737204989,
 17029.757963076576,
 17108.480075748164,
 7773.68541142221,
 13099.559545851364,
 10960.652052159778,
 8200.109174840605,
 8981.31841946484,
 17919.229714410725,
 12420.144493246493,
 7390.353289873291,
 18671.807281585905,
 8798.190945908824,
 14100.626385683228,
 19622.99898427624,
 9500.231630281736,
 12707.19128607847,
 12223.519798081135,
 12957.744338146356,
 14335.103402195264,
 17677.719303791742,
 10461.796921468675,
 11678.521876726652,
 16535.90633644

In [69]:
# find the mean squared error between the actual y and our predicted y
MSE = mean_squared_error(y, y_pred)
print(MSE)
#find the R^2 of our predicted y
r2 = r2_score(y, y_pred)
print(r2)

127027812.75316179
0.13316900047226388


In [None]:
# I don't know why the R^2 is small like that but it seems that my implemented model has a poor accuracy compared to the scikit-learn model