In [3]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.4/308.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting scipy>=1.8.0
  Downloading scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl (22.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.4/22.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.7.2 scipy-1.15.3 threadpoolctl-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new

In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Load the insurance table and turn its categorical columns into numeric ones.
df = pd.read_csv('insurance.csv')

# Binary columns become 0/1 flags.
df['sex'] = df['sex'].map({'male': 1, 'female': 0})
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})

# Series.map yields NaN for any label missing from the dict (different casing,
# stray whitespace, missing values). Fail here with a clear message instead of
# letting NaNs reach the model fit further down.
assert df[['sex', 'smoker']].notna().all().all(), \
    "unexpected or missing label in 'sex'/'smoker' (mapped to NaN)"

# One-hot encode region; drop_first=True omits one level, which then serves as
# the reference category for the remaining region_* columns.
df = pd.get_dummies(df, columns=['region'], drop_first=True)

## Baseline model

In [6]:
# Separate the target ('charges') from the predictor columns.
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Baseline: linear regression fitted on every predictor column.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Report goodness of fit on the test split (no plot is produced here).
print(f"R-squared: {r2_score(y_test, y_pred)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")

# Tabulate the fitted coefficient for each feature, in column order.
coef_df = pd.DataFrame(dict(Feature=X.columns, Coefficient=model.coef_))
print(coef_df)

R-squared: 0.7835929767120724
Mean Squared Error: 33596915.85136145
            Feature   Coefficient
0               age    256.975706
1               sex    -18.591692
2               bmi    337.092552
3          children    425.278784
4            smoker  23651.128856
5  region_northwest   -370.677326
6  region_southeast   -657.864297
7  region_southwest   -809.799354


In [15]:
# Linear regression restricted to a subset of features.
# The subset is defined once so the columns used for fitting, predicting and
# labelling the coefficients cannot drift apart.
selected_features = ["smoker", "bmi", "age"]

model1 = LinearRegression()
model1.fit(X_train[selected_features], y_train)

# Evaluate on the held-out rows using the same columns, in the same order.
y_pred1 = model1.predict(X_test[selected_features])

print(f"R-squared: {r2_score(y_test, y_pred1)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred1)}")

# Coefficients come back in the order of the fitted columns.
# Note: this rebinds coef_df, replacing any table built by an earlier cell.
coef_df = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model1.coef_
})
print(coef_df)

R-squared: 0.7776932310583375
Mean Squared Error: 34512843.880227886
  Feature   Coefficient
0  smoker  23675.371847
1     bmi    326.450232
2     age    259.410205


In [16]:
# Single-feature model: regress charges on the smoker flag alone.
# Double brackets keep the input two-dimensional (a one-column DataFrame).
model2 = LinearRegression().fit(X_train[["smoker"]], y_train)

# Predict on the held-out rows and report the same metrics as the other models.
y_pred2 = model2.predict(X_test[["smoker"]])

print(f"R-squared: {r2_score(y_test, y_pred2)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred2)}")

# With one input column there is exactly one coefficient.
print(f"Coefficient for smoker: {model2.coef_[0]}")

R-squared: 0.6602486589056529
Mean Squared Error: 52745964.72752624
Coefficient for smoker: 23188.68587068186
