In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the raw insurance records from the CSV one directory above the notebook,
# then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data_raw_insurance.csv")
df.head(n=5)

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46.0,male,21.45,5.0,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.307669
1,25.0,female,25.38,2.0,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899218
2,38.0,male,44.88,2.0,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.476302
3,25.0,male,19.89,0.0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.029843
4,49.0,male,38.21,3.0,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309838


In [3]:
# Positional (unshuffled) split of the loaded frame:
#   rows [0, 700_000)        -> training
#   rows [700_000, 900_000)  -> evaluation
#   rows [900_000, end)      -> held back as "production" data
train_df, eval_df, prod_df = (
    df.iloc[:700_000],
    df.iloc[700_000:900_000],
    df.iloc[900_000:],
)

# Show the (rows, columns) shape of each partition as the cell output.
tuple(part.shape for part in (train_df, eval_df, prod_df))

((700000, 12), (200000, 12), (100000, 12))

In [4]:
# Column the downstream model is trained to predict.
target = 'charges'

# For each partition: features are every column except the target,
# and the label is the target column on its own.
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval = eval_df.drop(columns=[target]), eval_df[target]

In [5]:
# Columns routed to the numeric branch of the preprocessing.
num_features = [
    'age',
    'bmi',
    'children',
]

# Columns routed to the categorical branch of the preprocessing.
cat_features = [
    'gender',
    'smoker',
    'region',
    'medical_history',
    'family_medical_history',
    'exercise_frequency',
    'occupation',
    'coverage_level',
]

# Echo both lists as the cell output.
(num_features, cat_features)

(['age', 'bmi', 'children'],
 ['gender',
  'smoker',
  'region',
  'medical_history',
  'family_medical_history',
  'exercise_frequency',
  'occupation',
  'coverage_level'])

Numerical and categorical features were processed using separate pipelines so that each type receives an appropriate transformation: scaling for numerical columns and one-hot encoding for categorical columns.

In [6]:
# Numerical: learn each fill value (the median) from the training split only,
# exactly once and before anything is filled, then reuse that same value for
# both splits. Computing it up front avoids a second median pass over the
# already-imputed training column and keeps the learned values inspectable.
train_medians = {col: X_train[col].median() for col in num_features}
for col, fill_value in train_medians.items():
    X_train[col] = X_train[col].fillna(fill_value)
    X_eval[col]  = X_eval[col].fillna(fill_value)

# Categorical: missing entries become their own explicit 'Unknown' level.
for col in cat_features:
    X_train[col] = X_train[col].fillna('Unknown')
    X_eval[col]  = X_eval[col].fillna('Unknown')

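Median imputation was used for numerical features due to skewed distributions; the median is computed on the training split only and reused for the evaluation split, so no evaluation data leaks into the fill values.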
Missing categorical values were treated as a separate “Unknown” category to preserve information and avoid data loss.

In [7]:
# Numeric branch: fill missing values with the median learned at fit time,
# then standardize. Keeping the imputer inside the pipeline means the fitted
# (and later pickled) preprocessor carries its own fill statistics and can be
# applied directly to new data that still contains missing values.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: missing values become the explicit 'Unknown' level, then
# one-hot encode. Categories not seen during fit are ignored rather than
# raising, and the encoder returns a dense array.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group to its branch; the output places the numeric block
# first, followed by the one-hot block.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

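One-hot encoding was used for categorical variables (categories not seen during fitting are encoded as all zeros via `handle_unknown='ignore'`), and feature scaling was applied to numerical features so that columns measured on different scales contribute comparably to model learning.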

In [8]:
# Fit the preprocessor on the training features only and transform them, then
# apply the already-fitted preprocessor to the evaluation features. The tuple
# is evaluated left to right, so fitting always happens before the eval transform.
X_train_processed, X_eval_processed = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_eval),
)

In [9]:
# Persist the fitted preprocessor so the same transformation can be reloaded
# later. Create the target directory first so this cell also works on a fresh
# checkout where ../models does not exist yet.
Path("../models").mkdir(parents=True, exist_ok=True)

joblib.dump(preprocessor, "../models/preprocessor.pkl")

['../models/preprocessor.pkl']

In [10]:
# Create the output directory up front so saving does not fail on a fresh
# checkout where ../data does not exist yet.
Path("../data").mkdir(parents=True, exist_ok=True)

# Processed feature matrices, as produced by the fitted preprocessor.
np.save("../data/X_train.npy", X_train_processed)
np.save("../data/X_eval.npy", X_eval_processed)

# Targets are saved as plain NumPy arrays taken from the pandas Series.
np.save("../data/y_train.npy", y_train.to_numpy())
np.save("../data/y_eval.npy", y_eval.to_numpy())