In [1]:
import pandas as pd

# Read the raw insurance records and discard exact duplicate rows in one pass
df = pd.read_csv("medical_insurance.csv").drop_duplicates()

# Cap BMI outliers with the 1.5 * IQR rule: values beyond either fence are
# pulled back to that fence (rows are kept, not dropped)
Q1, Q3 = df['bmi'].quantile([0.25, 0.75])
IQR = Q3 - Q1
low_limit, high_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
df = df.assign(bmi=df['bmi'].clip(lower=low_limit, upper=high_limit))

# Report the per-column count of missing values (this cell only inspects them;
# nothing here imputes or drops)
print(df.isna().sum())



age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [3]:
# Binary encoding for sex and smoker.
# Series.map() converts every value that is absent from the mapping into NaN
# without complaint -- e.g. an unexpected label, or this cell being run a
# second time on already-encoded 0/1 data. Validate before overwriting the
# column so the frame is never silently corrupted.
binary_codes = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'no': 0, 'yes': 1},
}
for column, codes in binary_codes.items():
    encoded = df[column].map(codes)
    # Only values that were present but failed to map count as errors;
    # genuinely missing inputs stay missing, exactly as a plain map() would leave them.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Cannot encode '{column}': unexpected value(s) {list(unmapped)}; "
            f"expected one of {list(codes)}"
        )
    df[column] = encoded

# One-hot encoding for region; drop_first=True omits the first category's
# column, which then acts as the baseline level
df = pd.get_dummies(df, columns=['region'], drop_first=True)


In [5]:
# Feature engineering: BMI category
def bmi_category(bmi):
    """Return the weight-class label for a BMI value.

    Upper bounds are exclusive: below 18.5 is "Underweight", below 25 is
    "Normal", below 30 is "Overweight"; everything else is "Obese".
    """
    # Ordered (exclusive upper bound, label) pairs; the first bound the value
    # falls under decides the label.
    bands = ((18.5, "Underweight"), (25, "Normal"), (30, "Overweight"))
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    # Reached for 30 and above, and for any value that compares below no
    # bound at all (e.g. NaN).
    return "Obese"

# Label every row with its BMI band, then expand that label into indicator
# columns (drop_first=True leaves out the first category's column)
df = pd.get_dummies(
    df.assign(BMI_class=df['bmi'].map(bmi_category)),
    columns=['BMI_class'],
    drop_first=True,
)

# Interaction feature: the product of age and the smoker column
df['age_smoker_interaction'] = df['age'].mul(df['smoker'])


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Split the frame into the target column and the predictor matrix
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns that get standardised, and a list of every remaining predictor
numeric_cols = ['age', 'bmi', 'children']
categorical_cols = list(filter(lambda name: name not in numeric_cols, X.columns))

# Standardise only the numeric columns; remainder='passthrough' forwards every
# other column to the model unchanged
scaling_step = ('num_scaler', StandardScaler(), numeric_cols)
preprocessor = ColumnTransformer([scaling_step], remainder='passthrough')

# Linear regression chained behind the preprocessing step
linreg_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('linreg', LinearRegression()),
])

# Fit on the training split (kept as the cell's last expression so the
# notebook displays the fitted pipeline)
linreg_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the fitted linear-regression pipeline on the held-out test split
y_pred = linreg_pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE instead of passing
# squared=False: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")


Linear Regression -> MAE: 4329.17, RMSE: 6008.98, R²: 0.804




In [23]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Assumes the preprocessed dataset is already loaded into 'df'
# Define features and target
X = df.drop("charges", axis=1)
y = df["charges"]

# Split into training and test sets (fixed seed for a reproducible split)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set experiment
mlflow.set_experiment("InsuranceChargesPrediction")

# Single source of truth so the logged parameter cannot drift from the value
# the model is actually built with
n_estimators = 100

# Start MLflow run
with mlflow.start_run(run_name="RandomForest"):
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    rf_model.fit(x_train, y_train)
    preds = rf_model.predict(x_test)

    # Log model parameters and metrics
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", n_estimators)
    # Square root of the MSE rather than squared=False, which scikit-learn
    # deprecated in 1.4 and removed in 1.6.
    mlflow.log_metric("RMSE", mean_squared_error(y_test, preds) ** 0.5)
    mlflow.log_metric("MAE", mean_absolute_error(y_test, preds))
    mlflow.log_metric("R2", r2_score(y_test, preds))

    # Log model
    mlflow.sklearn.log_model(rf_model, artifact_path="model")




In [41]:
import streamlit as st
import joblib
import numpy as np
import pandas as pd
# Dependencies must be installed in the environment before the app is started
# with `streamlit run`, e.g.:
#     pip install pandas numpy scikit-learn joblib streamlit
# (`!pip ...` shell escapes are IPython-only syntax and are a SyntaxError in a
# plain Python / Streamlit script, so the install is not done from here.)

# Load the trained model. joblib.load unpickles the file, which can execute
# arbitrary code -- only load model files you produced yourself.
# NOTE(review): the file is expected to bundle its own preprocessing steps
# (e.g. a fitted sklearn Pipeline) -- confirm against the code that saved it.
model = joblib.load("best_model.pkl")

st.title("Medical Insurance Cost Predictor")
st.write("Enter your details to predict your yearly medical insurance cost.")

# Sidebar for inputs
st.sidebar.header("Input Features")
age = st.sidebar.slider("Age", 18, 100, 30)
sex = st.sidebar.selectbox("Sex", ("male", "female"))
bmi = st.sidebar.slider("BMI", 10.0, 50.0, 25.0)
children = st.sidebar.selectbox("Number of Children", list(range(0, 6)))
smoker = st.sidebar.selectbox("Smoker", ("yes", "no"))
region = st.sidebar.selectbox("Region", ("northeast", "northwest", "southeast", "southwest"))

# The eight columns this app has always sent to the model
BASE_FEATURES = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region_northwest', 'region_southeast', 'region_southwest',
]


def _bmi_class(bmi_value):
    """Return the weight-class label for a BMI value (exclusive upper bounds 18.5 / 25 / 30)."""
    if bmi_value < 18.5:
        return "Underweight"
    if bmi_value < 25:
        return "Normal"
    if bmi_value < 30:
        return "Overweight"
    return "Obese"


# When 'Predict' is clicked
if st.sidebar.button("Predict"):
    is_smoker = 1 if smoker == "yes" else 0
    weight_class = _bmi_class(bmi)
    # NOTE(review): if the model was trained on BMI values capped at IQR
    # fences, the slider can exceed that range; the fences are not available
    # here -- confirm whether the input should be capped too.
    input_data = pd.DataFrame({
        'age': [age],
        'sex': [1 if sex == "male" else 0],
        'bmi': [bmi],
        'children': [children],
        'smoker': [is_smoker],
        'region_northwest': [1 if region == "northwest" else 0],
        'region_southeast': [1 if region == "southeast" else 0],
        'region_southwest': [1 if region == "southwest" else 0],
        # region_northeast is the baseline: all three region flags are 0
        # Engineered features; "Normal" is the baseline BMI class (all flags 0)
        'BMI_class_Obese': [1 if weight_class == "Obese" else 0],
        'BMI_class_Overweight': [1 if weight_class == "Overweight" else 0],
        'BMI_class_Underweight': [1 if weight_class == "Underweight" else 0],
        'age_smoker_interaction': [age * is_smoker],
    })

    # Hand the model exactly the columns it was fitted on, in the same order.
    # scikit-learn estimators fitted on a DataFrame record them in
    # feature_names_in_; without that attribute, fall back to the base columns.
    expected_cols = getattr(model, "feature_names_in_", None)
    if expected_cols is None:
        input_data = input_data[BASE_FEATURES]
    else:
        missing_cols = [col for col in expected_cols if col not in input_data.columns]
        if missing_cols:
            st.error(f"The loaded model expects features this app does not provide: {missing_cols}")
            st.stop()
        input_data = input_data[list(expected_cols)]

    prediction = model.predict(input_data)[0]
    st.subheader(f"Estimated Yearly Charge: **₹{prediction:,.2f}**")
    st.write("(This estimate is based on your inputs. Actual costs may vary.)")

# EDA (optional if df exists)
# avg_charges = df.groupby('smoker')['charges'].mean()
# st.bar_chart(avg_charges)




ModuleNotFoundError: No module named 'mport pandas as pd\r'