# Machine Learning - Linear Regression Model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from dotenv import load_dotenv
import pandas as pd
import os

load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL").replace("postgres://", "postgresql://")

df = pd.read_sql("ml_4geeks_medical_insurance_cost_clean", DATABASE_URL)

In [2]:
df.head()

Unnamed: 0,age,bmi,children,charges,sex_num,smoker_num,region_num
0,19,27.9,0,16884.924,1,1,0
1,18,33.77,1,1725.5523,0,0,1
2,28,33.0,3,4449.462,0,0,1
3,33,22.705,0,21984.47061,0,0,2
4,32,28.88,0,3866.8552,0,0,2


#### Split data

In [3]:
X = df.drop('charges', axis=1)
y = df['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
print(X_train.shape, X_test.shape)

(1002, 6) (335, 6)


#### Create Model

In [11]:
model = LinearRegression()
model.fit(X_train, y_train)

##### Fitted parameters

In [12]:
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b1, b2): {model.coef_}")

Intercept (a): -11735.80902630741
Coefficients (b1, b2): [2.45899692e+02 3.14061094e+02 5.28365682e+02 1.74679413e+01
 2.30716633e+04 2.03119474e+02]


#### Model Prediction

In [13]:
y_pred = model.predict(X_test)
y_pred

array([ 7993.71982593,  5566.17670809, 14240.9920928 , 31750.64176448,
        9203.73765008, 13353.55834216, 30191.72525637,  1323.13671526,
       10779.9682047 , 11315.02465132, 10311.3275375 , 33121.58876013,
       30904.24313553, 17100.22824699, 10518.39528464,  9399.96186443,
        4074.56658765, 31974.67887212,  3157.5622492 ,  5437.30290064,
        3766.78832116, 30127.52947275, 14994.72213809, 30342.23692689,
       31032.84169081,  5470.39734003, 35619.60650146, 36391.03969721,
       11135.26838888, 13978.62380055,  6270.21566415, 12932.90547095,
         745.77529047, 11937.31855694, 39724.66154073, 12091.79829094,
        4577.72875201,  3915.50448673, 31135.53431231,  9133.94357892,
        6802.01239622, 29888.45341123, 34994.53512151, 12086.97269396,
        7321.53387558,  3269.62650741,  5967.59264807,  8774.13805246,
        4246.62938521,  9270.86556828,  6790.71722028, 11859.91333479,
       31001.05694418,  3852.96054084, 10892.09655427,  9982.1632232 ,
      

In [14]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Coefficient of determination: {r2_score(y_test, y_pred)}")

Mean squared error: 35229015.3273374
Coefficient of determination: 0.7962578620326625
