# Task for Today  

***

## Medical Cost Prediction  

Given *patient data*, let's try to predict the **charges** a given patient will incur.  
  
We will fit several linear regression models (ordinary least squares, Ridge, Lasso, ElasticNet, and their cross-validated variants) and compare their predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV

In [None]:
# Load the insurance CSV into a DataFrame (path is relative to the
# notebook's working directory).
data = pd.read_csv('../input/insurance/insurance.csv')

In [None]:
# Display the loaded DataFrame (a bare final expression is rendered by the notebook).
data

In [None]:
# Print the column names, non-null counts and dtypes to check the schema.
data.info()

# Preprocessing

In [None]:
# isna().sum() gives the per-column count of missing entries; the second
# sum() totals those counts over the whole frame.
print("Total missing values:", data.isna().sum().sum())

In [None]:
# Cast 'children' to strings so it is handled as a categorical column:
# it is one-hot encoded (not scaled as a number) in preprocess_inputs.
# NOTE: this modifies the global `data` frame in place.
data['children'] = data['children'].astype(str)

In [None]:
# Count the object-dtype columns, i.e. the ones that still need encoding.
print("Total non-numeric columns:", len(data.select_dtypes('object').columns))

In [None]:
# Map each object-dtype column to its distinct values, to decide which
# columns are two-valued (binary encoding) and which need one-hot encoding.
{column: list(data[column].unique()) for column in data.select_dtypes('object').columns}

In [None]:
def binary_encode(df, column, positive_value):
    """Return a copy of *df* with *column* replaced by a 0/1 indicator.

    Entries equal to *positive_value* become 1 and every other entry
    (including NaN) becomes 0.  The input frame is left unmodified.

    Args:
        df: DataFrame containing *column*.
        column: Name of the column to encode.
        positive_value: The value that should map to 1.

    Returns:
        A new DataFrame in which *column* holds int64 zeros and ones.
    """
    df = df.copy()
    # Vectorized comparison instead of a per-row Python lambda.  The
    # explicit int64 cast keeps the result dtype the same on every
    # platform and also when the column is empty.
    df[column] = (df[column] == positive_value).astype('int64')
    return df

def onehot_encode(df, column, prefix):
    """Replace *column* with one indicator column per distinct value.

    The indicator columns are named ``<prefix>_<value>`` and are appended
    after the remaining original columns.  A new DataFrame is returned;
    the input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    # Append the indicators first, then drop the source column, so the
    # remaining columns keep their original relative order.
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=column)

In [None]:
def preprocess_inputs(df, scaler, train_size=0.7):
    """Encode, split and scale the insurance data for modelling.

    Steps:
      1. Binary-encode 'sex' (1 for 'male') and 'smoker' (1 for 'yes').
      2. One-hot encode 'children' and 'region'.
      3. Separate the 'charges' target from the features.
      4. Split into train and test sets (fixed random_state=123).
      5. Fit *scaler* on the training features only, then apply it to
         both the training and the test features.

    Args:
        df: DataFrame with 'sex', 'smoker', 'children', 'region' and
            'charges' columns; it is not modified.
        scaler: Object exposing ``fit_transform`` and ``transform``
            (e.g. ``StandardScaler()``).  It is fitted in place.
        train_size: Passed to ``train_test_split`` as ``train_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.  The feature frames
        keep the row index of *df*, so they stay aligned with the targets.
    """
    # The encoding helpers return new frames, so the caller's df is untouched.
    df = binary_encode(df, 'sex', 'male')
    df = binary_encode(df, 'smoker', 'yes')

    # One-hot encoding happens before the split so that train and test
    # always share the same set of indicator columns.
    df = onehot_encode(df, 'children', 'ch')
    df = onehot_encode(df, 'region', 're')

    y = df['charges']
    X = df.drop(columns='charges')

    # Split BEFORE scaling: fitting the scaler on the full data would leak
    # test-set statistics into the training features.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=123
    )

    # Learn the scaling parameters from the training rows only and reuse
    # them for the test rows.  Passing index= keeps X and y row labels
    # aligned even when df does not have a default RangeIndex.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test),
        index=X_test.index,
        columns=X_test.columns,
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Display the raw DataFrame again before it is passed to preprocess_inputs.
data

In [None]:
# Build the modelling inputs: encoded and standardized features with a
# 70% train split; the targets are the 'charges' values.
X_train, X_test, y_train, y_test = preprocess_inputs(data, StandardScaler(), train_size=0.7)

# Training

In [None]:
# Candidate linear regressors, all constructed with default hyperparameters.
# The keys are left-padded to a common width so the labels line up when
# the scores are printed.
models = {
    '         OLS Model:': LinearRegression(),
    '          L2 Model:': Ridge(),
    '          L1 Model:': Lasso(),
    '  ElasticNet Model:': ElasticNet(),
    '       L2 CV Model:': RidgeCV(),
    '       L1 CV Model:': LassoCV(),
    'ElasticNetCV Model:': ElasticNetCV()
}

# Fit every model on the same training split.
for model in models.values():
    model.fit(X_train, y_train)

In [None]:
# Report each fitted model's score on the held-out test split; for
# scikit-learn regressors, score() returns the R^2 of the predictions.
print("Model R^2 Scores:\n-----------------")

for name, model in models.items():
    print(name, model.score(X_test, y_test))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/3GCv4Qq5DZQ