In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm,t
import statsmodels as sm
import warnings
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
#from sklearn.preprocessing import train
warnings.filterwarnings('ignore')

### Scenario 3: Medical Cost Prediction
    You are provided with the medical cost dataset. You need to predict individual medical costs billed by
    health insurance.
    Dataset Description:
    The dataset contains 7 columns (6 features plus the target, charges):
    
    age: age of primary beneficiary
    sex: gender of the primary beneficiary (female, male)
    bmi: Body mass index, an objective index of body weight relative to height
    (kg / m^2, i.e. weight divided by height squared), indicating whether weight is
    relatively high or low for a given height; ideally 18.5 to 24.9
    children: Number of children covered by health insurance / Number of dependents
    smoker: Smokes or not
    region: the beneficiary's residential area in the US (northeast, southeast, southwest,
    northwest)
    charges: Individual medical costs billed by health insurance
        
    Tasks to be performed:
    1. Load the data, check its shape and check for null values - Beginner
    2. Convert categorical features to numerical values - Intermediate
    3. Split the dataset for training and testing - Beginner
    4. Train the model using sklearn - Beginner
    5. Find the intercept and coefficient from trained model - Beginner
    6. Predict the prices of test data and evaluate the model - Beginner

In [52]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('insurance.csv')
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Column dtypes and non-null counts; this doubles as the null-value check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Distinct region labels present in the data.
unique_regions = df['region'].unique()
unique_regions

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [8]:
# Give each region label an integer code equal to its position in unique_regions.
region_dict = {label: code for code, label in enumerate(unique_regions)}
region_dict

{'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}

In [9]:
def get_region_code(region_name):
    """Return the integer code stored in ``region_dict`` for ``region_name``.

    Raises:
        KeyError: if ``region_name`` is not a key of ``region_dict``.
    """
    return region_dict[region_name]
# Spot check the lookup on one label.
get_region_code('southeast')

1

In [11]:
def get_gender_code(gender):
    """Encode gender as an integer: 0 for 'male', 1 for any other value."""
    if gender == 'male':
        return 0
    return 1


# Spot check: a non-'male' label encodes to 1.
get_gender_code('female')

1

In [16]:
def get_smoker_code(smoker):
    """Encode smoker status as an integer: 0 for 'no', 1 for any other value.

    Anything other than the exact string 'no' (including unexpected or
    missing values) is encoded as 1.
    """
    return 0 if smoker == 'no' else 1
# Spot check this encoder itself. Calling get_gender_code('yes') here would
# also return 1, but only because that function maps every non-'male' value
# to 1, so it would not exercise get_smoker_code at all.
get_smoker_code('yes')

1

In [12]:
# Replace each categorical column with its integer code. The columns are
# overwritten in place, so guard on dtype to keep the cell idempotent:
# re-running it on already-encoded data would otherwise map every `sex`
# value to 1 (no integer equals 'male') and then raise KeyError inside
# get_region_code.
for column, encoder in (
    ('sex', get_gender_code),
    ('region', get_region_code),
    ('smoker', get_smoker_code),
):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].apply(encoder)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,yes,0,16884.924
1,18,0,33.77,1,no,1,1725.5523
2,28,0,33.0,3,no,1,4449.462
3,33,0,22.705,0,no,2,21984.47061
4,32,0,28.88,0,no,2,3866.8552


In [25]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 25% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (1003, 6), X_test shape: (335, 6)
y_train shape: (1003,), y_test shape: (335,)


In [26]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings.
lr = LinearRegression()

# Fit on the training split; `result` holds whatever fit() returns and is
# used below as the fitted model.
result = lr.fit(X_train, y_train)
# Score (R^2) on the training data itself.
result.score(X_train, y_train)

0.7449087316606229

In [27]:
# Report the fitted coefficients (one per column of X_train) and the intercept.
print(f"Coeff: {result.coef_}, \nintercept: {result.intercept_}")

Coeff: [  259.870936     -48.14518497   337.05864381   427.52339986
 23622.50371905   299.04523816], 
intercept: -12966.13974368092


In [28]:
# Score (R^2) on the held-out test split.
result.score(X_test, y_test)

0.7668905583460909

In [29]:
# Predict the target for the test split and show the first five predictions.
y_pred = result.predict(X_test)
y_pred[:5]

array([ 8917.54106359,  7057.659731  , 36899.8708097 ,  9546.15877323,
       26950.91414625])

In [30]:
# R^2 computed from the explicit predictions; cross-checks result.score(X_test, y_test).
r2_score(y_test, y_pred)

0.7668905583460909

In [33]:
from sklearn.preprocessing import OneHotEncoder

In [61]:
# One-hot encode the categorical columns as an alternative to integer codes.
#
# Re-read the raw CSV instead of reusing `df`: an earlier cell overwrites
# sex/smoker/region in `df` with integer codes, so selecting object-dtype
# columns from `df` finds nothing when the notebook is run top to bottom.
categorical_cols = ['sex', 'smoker', 'region']
raw_df = pd.read_csv('insurance.csv')

# Build the encoder with default arguments and densify the result explicitly.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed; this form works on either side of that change.
ohe = OneHotEncoder()
d = raw_df[categorical_cols]
encoded = ohe.fit_transform(d).toarray()

# NOTE: every category gets its own indicator column (no `drop=`), so the
# indicators of each feature sum to 1 and are collinear with the intercept;
# individual coefficients are therefore not uniquely determined.

# Indicator columns first, then the numeric columns, with the target kept last
# so positional X/y slicing still works. ignore_index=True relabels the
# columns 0..n-1.
df_encoded = raw_df.drop(columns=categorical_cols)
df_encoded = pd.concat([pd.DataFrame(encoded), df_encoded], ignore_index=True, axis=1)
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19,27.9,0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18,33.77,1,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28,33.0,3,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33,22.705,0,21984.47061
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,32,28.88,0,3866.8552


In [67]:
# Features are every column except the last; the last column is the target.
X, y = df_encoded.iloc[:, :-1], df_encoded.iloc[:, -1]

# Hold out 25% of the rows for testing. Note the seed here (7) differs from
# the one used for the label-encoded split (42), so the two models are not
# evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

# Fit a fresh linear model on the one-hot encoded features and show its
# score (R^2) on the training data.
lr = LinearRegression()
result = lr.fit(X_train, y_train)
result.score(X_train, y_train)

X_train shape: (1003, 11), X_test shape: (335, 11)
y_train shape: (1003,), y_test shape: (335,)


0.7504740384122718

In [68]:
# Report the fitted coefficients (one per column of X_train) and the intercept.
print(f"Coeff: {result.coef_}, \nintercept: {result.intercept_}")

Coeff: [    97.18113355    -97.18113355 -11905.32547314  11905.32547314
    667.24158209     15.86338029   -233.63531263   -449.46964974
    251.90247991    353.38540435    465.2280675 ], 
intercept: -887.6917181755853


In [69]:
# Score (R^2) on the held-out test split.
result.score(X_test, y_test)

0.7509741262661104

In [70]:
# Predict the target for the test split and show the first five predictions.
y_pred = result.predict(X_test)
y_pred[:5]

array([15248.874306  , 11126.97945225, -2048.68105088, 29282.63519248,
        9070.8295246 ])

In [71]:
# R^2 computed from the explicit predictions; cross-checks result.score(X_test, y_test).
r2_score(y_test, y_pred)

0.7509741262661104

In [74]:
# 10-fold cross-validation of the linear model on the full X, y (not just the training split).
scores = cross_val_score(lr, X,y, cv=10)
# Summarize the per-fold scores by their mean and standard deviation.
print(f"Mean: {scores.mean()}, std: {scores.std()}")

Mean: 0.7445006998667603, std: 0.04294480851618363
