In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
# Read the insurance data from the working directory and preview the first rows.
csv_path = './insurance.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [63]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(1338, 7)

In [64]:
# Remove exact duplicate rows, if any. Rebinding `df` instead of passing
# `inplace=True` avoids mutating the frame object that earlier cells already
# displayed and keeps the step chain-friendly.
df = df.drop_duplicates()

In [65]:
# Total number of missing cells across all columns (0 means nothing to impute).
df.isna().sum().sum()

0

In [66]:
# Frequency of each category in the `region` column.
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64

In [67]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [68]:
# `charges` is the regression target; every other column is a feature.
y = df['charges']
X = df.drop(columns="charges")

# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=32,
)

In [69]:
# (rows, columns) of the training features after the split.
X_train.shape

(1069, 6)

In [70]:
# One-hot encode the categorical features. The encoder is fitted on the
# training split only and then reused, unchanged, on the test split.
categorical_cols = ['sex','region','smoker']
ohe = OneHotEncoder(drop="first", sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[categorical_cols])
X_test_ohe = ohe.transform(X_test[categorical_cols])

In [73]:
# Standardise the numeric features. The scaler is fitted on the training
# split only and then reused, unchanged, on the test split.
numeric_cols = ['age','bmi','children']
stdscaler = StandardScaler()
X_train_scaler = stdscaler.fit_transform(X_train[numeric_cols])
X_test_scaler = stdscaler.transform(X_test[numeric_cols])

In [92]:
# Inspect the scaled numeric training features.
X_train_scaler

array([[ 0.06379712, -0.40254323, -0.08955721],
       [-1.28597713, -2.19036783, -0.08955721],
       [-1.49909938, -2.43052337, -0.91487327],
       ...,
       [ 0.56108237,  1.40195885,  0.73575884],
       [-1.49909938,  1.05339976, -0.91487327],
       [-1.49909938,  1.62543692, -0.91487327]])

In [74]:
# Inspect the one-hot encoded categorical training features.
X_train_ohe

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [79]:
# Compare the shapes of the encoded and scaled training blocks (row counts should match).
X_train_ohe.shape, X_train_scaler.shape

((1069, 5), (1069, 3))

In [80]:
# Join the encoded categorical block and the scaled numeric block column-wise
# (categorical columns first) to build the final feature matrices.
X_train_final = np.concatenate([X_train_ohe, X_train_scaler], axis=1)
X_test_final = np.concatenate([X_test_ohe, X_test_scaler], axis=1)

In [82]:
from sklearn.linear_model import LinearRegression

In [83]:
# Fit a linear regression model on the preprocessed training features.
lr = LinearRegression()
lr.fit(X=X_train_final, y=y_train)

In [85]:
# Predict the target for the held-out test rows.
y_pred = lr.predict(X_test_final)

In [86]:
from sklearn.metrics import r2_score, mean_squared_error

In [87]:
# R^2 of the predictions on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7906993658088287

In [88]:
# Mean squared error of the predictions on the held-out test split
# (expressed in squared units of the target).
mean_squared_error(y_true=y_test, y_pred=y_pred)

30924767.21758346

# ColumnTransformer

In [90]:
# Single preprocessing object that applies the same steps as the manual
# encoder/scaler cells: one-hot encode the categorical columns and
# standardise the numeric ones.
colTransformer = ColumnTransformer(
    transformers=[
        # sparse_output=False matches the manual encoder, so the combined
        # output is always a dense ndarray rather than depending on the
        # ColumnTransformer's sparse_threshold density heuristic.
        ('ohe_transf', OneHotEncoder(drop='first', sparse_output=False), ['sex','region','smoker']),
        ('std_scaler', StandardScaler(), ['age','bmi','children']),
    ],
    # Columns not named above are passed through untouched.
    remainder='passthrough',
)

In [91]:
# Fit the transformer on the training split only, then reuse it on the test
# split. Both results are kept so they can feed a model, instead of being
# discarded after display.
X_train_ct = colTransformer.fit_transform(X_train)
X_test_ct = colTransformer.transform(X_test)

# Sanity check: the ColumnTransformer must reproduce the manually assembled
# matrices (encoded categorical columns first, scaled numeric columns after).
assert np.allclose(X_train_ct, X_train_final)
assert np.allclose(X_test_ct, X_test_final)

X_train_ct

array([[ 0.        ,  0.        ,  0.        , ...,  0.06379712,
        -0.40254323, -0.08955721],
       [ 0.        ,  0.        ,  0.        , ..., -1.28597713,
        -2.19036783, -0.08955721],
       [ 1.        ,  0.        ,  0.        , ..., -1.49909938,
        -2.43052337, -0.91487327],
       ...,
       [ 1.        ,  0.        ,  1.        , ...,  0.56108237,
         1.40195885,  0.73575884],
       [ 0.        ,  0.        ,  1.        , ..., -1.49909938,
         1.05339976, -0.91487327],
       [ 0.        ,  0.        ,  0.        , ..., -1.49909938,
         1.62543692, -0.91487327]])