In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [3]:
import pandas as pd
# 2. Load the dataset
# The CSV can live in different places depending on where the notebook runs,
# so try each known location in priority order instead of hard-coding one.
from pathlib import Path

candidate_paths = [
    Path("/content/drive/MyDrive/Datasets/insurance.csv"),  # path used originally
    Path("/mnt/data/insurance.csv"),                        # alternative data directory
    Path("insurance.csv"),                                  # same folder as the notebook
]
# Pick the first file that exists. If none does, keep the original path so that
# pandas raises a FileNotFoundError naming it, exactly as before.
data_path = next((p for p in candidate_paths if p.exists()), candidate_paths[0])
df = pd.read_csv(data_path)

print("First 5 rows of data:")
print(df.head())
print("\nColumns:", df.columns)

First 5 rows of data:
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

Columns: Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')


In [5]:
# 3. Split the frame into the prediction target and the predictor columns
y = df["charges"]                  # target: the charges column
X = df.drop(columns="charges")     # features: every remaining column


In [6]:
# 4. Group the feature columns by kind so each group can get its own preprocessing
categorical_features = [
    "sex",
    "smoker",
    "region",
]
numeric_features = [
    "age",
    "bmi",
    "children",
]


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# 5. Preprocessing: one transformer per column group, merged by a ColumnTransformer

# Categorical columns -> one-hot indicator columns, dropping the first level of each feature.
# NOTE(review): with drop="first" the dropped level is encoded as all zeros, which
# is also what handle_unknown="ignore" emits for a category unseen during fit --
# confirm that treating unseen categories like the dropped level is acceptable.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")

# Numeric columns -> standardized values (zero mean, unit variance)
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Apply each transformer to its own column group; "num" stays ahead of "cat"
# so the order of the output columns is unchanged.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [11]:
from sklearn.linear_model import Lasso
# 6. Configure the Lasso (L1-regularized linear regression) estimator
lasso_model = Lasso(
    alpha=0.1,          # regularization strength
    random_state=42,    # fixed seed
    max_iter=10_000,    # cap on solver iterations
)

In [13]:
# 7. Chain preprocessing and the regressor into one estimator, so that
#    fit/predict run the preprocessor first and then the Lasso model
model = Pipeline([("preprocessor", preprocessor), ("lasso", lasso_model)])


In [15]:
from sklearn.model_selection import train_test_split
# 8. Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [17]:
# 9. Fit the whole pipeline (preprocessing + Lasso) on the training portion only
model.fit(X=X_train, y=y_train)



In [19]:
# 10. Generate predictions for the held-out test rows
y_pred = model.predict(X=X_test)


In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
# 11. Score the test-set predictions against the true targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)                          # square root of the MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call; sep="\n" puts each metric on its own line.
print(
    "\nLasso Regression Performance on Test Set:",
    f"R² Score  : {r2:.4f}",
    f"MAE       : {mae:.2f}",
    f"MSE       : {mse:.2f}",
    f"RMSE      : {rmse:.2f}",
    sep="\n",
)


Lasso Regression Performance on Test Set:
R² Score  : 0.7836
MAE       : 4181.30
MSE       : 33597789.30
RMSE      : 5796.36
