<a href="https://colab.research.google.com/github/rd9437/predicting_insurance_charges/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# Path of the training CSV (under /content/sample_data).
insurance_data_path = '/content/sample_data/insurance.csv'
# Read the raw insurance records; cleaning happens later in clean_data().
df = pd.read_csv(insurance_data_path)

In [None]:
def clean_data(df):
    """Return a cleaned copy of the raw insurance frame.

    The input frame is left untouched, so re-running the cell that calls
    this function always starts from the same raw data.

    Steps, in order:
      * normalise the ``sex`` labels 'M'/'man' -> 'male' and
        'F'/'woman' -> 'female';
      * strip '$' from ``charges`` and cast the column to float;
      * keep only rows with ``age`` > 0 (rows with a missing age fail the
        comparison and are dropped as well);
      * replace negative ``children`` counts with 0;
      * lower-case ``region``;
      * drop every row that still contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns ``sex``, ``charges``, ``age``,
        ``children`` and ``region``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the original row index labels are preserved.
    """
    # Work on a private copy: the column assignments below would otherwise
    # overwrite the caller's DataFrame.
    df = df.copy()
    df['sex'] = df['sex'].replace({'M': 'male', 'man': 'male', 'F': 'female', 'woman': 'female'})
    # Raw string: '\$' in a plain literal is an invalid escape sequence.
    df['charges'] = df['charges'].replace({r'\$': ''}, regex=True).astype(float)
    # Explicit copy after filtering so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[df["age"] > 0].copy()
    df.loc[df["children"] < 0, "children"] = 0
    df["region"] = df["region"].str.lower()
    return df.dropna()

In [None]:
def train_model(df):
    """Fit a scaled linear regression on the insurance data and score it.

    The categorical columns are one-hot encoded (first level dropped) and
    joined to the numeric columns. Scaling is done only inside the
    pipeline, so:

      * the returned pipeline expects *unscaled* feature frames and applies
        the scaling learned from the training data when predicting;
      * during cross-validation the scaler is re-fitted on each training
        fold, so the held-out fold does not influence the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing ``charges`` plus the columns ``age``,
        ``bmi``, ``children``, ``sex``, ``smoker`` and ``region``.

    Returns
    -------
    tuple
        ``(pipe, mean_mse, mean_r2)`` where ``pipe`` is the pipeline fitted
        on all rows and the two scores are means over 5-fold CV.
    """
    X = df.drop('charges', axis=1)
    y = df['charges']
    cat_cols = ['sex', 'smoker', 'region']
    num_cols = ['age', 'bmi', 'children']

    # dtype=float keeps the design matrix homogeneous (no bool/float mix).
    X_cat = pd.get_dummies(X[cat_cols], drop_first=True, dtype=float)
    X_all = pd.concat([X[num_cols], X_cat], axis=1)

    # Do NOT pre-scale X_all: a scaler fitted on already-standardised data
    # learns mean ~0 / std ~1 and would pass raw inputs through almost
    # unchanged at predict time.
    pipe = Pipeline([("scaler", StandardScaler()), ("model", LinearRegression())])

    # cross_val_score clones the pipeline per fold, so these calls do not
    # depend on (or alter) the final fit below.
    mse = -cross_val_score(pipe, X_all, y, cv=5, scoring='neg_mean_squared_error')
    r2 = cross_val_score(pipe, X_all, y, cv=5, scoring='r2')

    pipe.fit(X_all, y)

    return pipe, np.mean(mse), np.mean(r2)

In [None]:
# Clean the raw frame, then fit the regression pipeline and collect its
# mean 5-fold cross-validation MSE and R2.
df_clean = clean_data(df)
model, avg_mse, avg_r2 = train_model(df_clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["region"] = df["region"].str.lower()


In [None]:
# Mean squared error averaged over the cross-validation folds.
print("Mean MSE:", avg_mse)

Mean MSE: 37431001.52191915


In [None]:
# R2 score averaged over the cross-validation folds.
print("Mean R2:", avg_r2)

Mean R2: 0.7450511466263761


In [None]:
# Path of the validation CSV (under /content/sample_data).
val_path = '/content/sample_data/validation_dataset.csv'
# Load the validation rows that will be scored with the fitted model.
val_df = pd.read_csv(val_path)

In [None]:
# Preview the first validation rows before encoding them.
val_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,18.0,female,24.09,1.0,no,southeast
1,39.0,male,26.41,0.0,yes,northeast
2,27.0,male,29.15,0.0,yes,southeast
3,71.0,male,65.502135,13.0,yes,southeast
4,28.0,male,38.06,0.0,no,southeast


In [None]:
# One-hot encode the validation set so its columns line up with the features
# the model was trained on. Encoding the two datasets independently with
# drop_first=True is fragile: a level missing from the validation file shifts
# which dummy is dropped or leaves a column out entirely.
val_cat_cols = ['sex', 'smoker', 'region']
train_feature_names = getattr(model, "feature_names_in_", None)
if train_feature_names is not None:
    # The fitted pipeline recorded its training columns: encode every level,
    # then keep exactly those columns in that order. Levels absent from the
    # validation file become all-zero columns.
    val_df_proc = (
        pd.get_dummies(val_df, columns=val_cat_cols)
        .reindex(columns=train_feature_names, fill_value=0)
        .astype(float)
    )
else:
    # No recorded column names (model fitted on a plain array): fall back to
    # the positional encoding and rely on the column order matching training.
    val_df_proc = pd.get_dummies(val_df, columns=val_cat_cols, drop_first=True)
val_preds = model.predict(val_df_proc)



In [None]:
# Attach the predictions to the validation frame, raising any prediction
# below 1000 up to 1000.
val_df['predicted_charges'] = np.maximum(val_preds, 1000)

In [None]:
# Preview the validation rows together with their predicted charges.
val_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,predicted_charges
0,18.0,female,24.09,1.0,no,southeast,128624.195643
1,39.0,male,26.41,0.0,yes,northeast,220740.537449
2,27.0,male,29.15,0.0,yes,southeast,181357.588606
3,71.0,male,65.502135,13.0,yes,southeast,423490.68727
4,28.0,male,38.06,0.0,no,southeast,193247.431989
