In [25]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, LassoCV, Lasso
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import warnings


%matplotlib inline

In [27]:
df = pd.read_csv("C:/Users/sahay/Desktop/ML/Projects/Insurance Premium project/insurance (1).csv")

In [28]:
df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [29]:
#binning age into categories
bins=[0,20,30,40,50,60,float('inf')]
labels=['<20 yrs','20 - 30 yrs','30 - 40 yrs','40 - 50 yrs', '50 - 60 yrs', '>60 yrs']
df['Age_category']=pd.cut(df['Age'],bins=bins,labels=labels)

In [30]:
#binning height into categories
bins=[0,144,150,160,170,float('inf')]
labels =['<144 cm','144-150 cm','150-160 cm','160-170 cm','>170']
df['Height_Category'] = pd.cut(df['Height'], bins=bins, labels=labels)


In [31]:
#binning weight into categories
bins=[0,50,70,90,110,130,140]
labels=['< 50 kgs','50-70 kgs','70-90 kgs','90-110 kgs','110-130 kgs','130-140 kgs']
df['weight_category']=pd.cut(df['Weight'],labels=labels,bins=bins)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [33]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Encode categorical columns (ordinal and nominal features)
le = LabelEncoder()
df['Age_category'] = le.fit_transform(df['Age_category'])
df['Height_Category'] = le.fit_transform(df['Height_Category'])
df['weight_category'] = le.fit_transform(df['weight_category'])



# Scale numeric features
scaler = StandardScaler()
df[['Height', 'Weight', 'NumberOfMajorSurgeries']] = scaler.fit_transform(df[['Height', 'Weight', 'NumberOfMajorSurgeries']])



# Now, we have data split, but the preprocessing has already been applied to the entire dataset.

In [34]:
# Split the data into features and target variable
from sklearn.model_selection import train_test_split

X = df.drop('PremiumPrice', axis=1)  # Features
y = df['PremiumPrice']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Creating a base line model for feature selection

In [35]:
#creating am Evaluate function to give all metrics after Model Training

def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,mse,r2_square

In [36]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_train_pred=lr.predict(X_train)
y_test_pred=lr.predict(X_test)

In [37]:
# Evaluate Train and Test dataset
lr_train_mae , lr_train_rmse, lr_train_r2 = evaluate_model(y_train, y_train_pred)

lr_test_mae , lr_test_rmse, lr_test_r2 = evaluate_model(y_test, y_test_pred)

In [38]:
print('Model performance for Training set')
print("- Root Mean Squared Error: {:.4f}".format(lr_train_rmse))
print("- Mean Absolute Error: {:.4f}".format(lr_train_mae))
print("- R2 Score: {:.4f}".format(lr_train_r2))

print('----------------------------------')

print('Model performance for Test set')
print("- Root Mean Squared Error: {:.4f}".format(lr_test_rmse))
print("- Mean Absolute Error: {:.4f}".format(lr_test_mae))
print("- R2 Score: {:.4f}".format(lr_test_r2))

Model performance for Training set
- Root Mean Squared Error: 14268810.2803
- Mean Absolute Error: 2654.8375
- R2 Score: 0.6251
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12299208.0195
- Mean Absolute Error: 2567.6658
- R2 Score: 0.7116


In [39]:
# Lasso with cross-validation
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

# Get selected features (non-zero coefficients)
lasso_coef = pd.Series(lasso.coef_, index=X_train.columns)
selected_features_lasso = lasso_coef[lasso_coef != 0].index.tolist()

print("Selected features from Lasso:\n", selected_features_lasso)

Selected features from Lasso:
 ['Age', 'Diabetes', 'AnyTransplants', 'AnyChronicDiseases', 'Weight', 'HistoryOfCancerInFamily', 'NumberOfMajorSurgeries', 'Age_category']


In [40]:
X_train_selected=X_train[selected_features_lasso]
X_test_selected=X_test[selected_features_lasso]

In [41]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train_selected,y_train)

    #make predictions
    y_train_pred=model.predict(X_train_selected)
    y_test_pred=model.predict(X_test_selected)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 14284524.9035
- Mean Absolute Error: 2654.2971
- R2 Score: 0.6247
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12323246.7431
- Mean Absolute Error: 2564.1970
- R2 Score: 0.7110


Lasso
Model performance for Training set
- Root Mean Squared Error: 14284568.5702
- Mean Absolute Error: 2654.1086
- R2 Score: 0.6247
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12330491.0843
- Mean Absolute Error: 2564.9953
- R2 Score: 0.7108


Ridge
Model performance for Training set
- Root Mean Squared Error: 14286290.0649
- Mean Absolute Error: 2654.5191
- R2 Score: 0.6247
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12372660.6192
- Mean Absolute Error: 2569.4349
- R2 Score: 0.7099


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 10223553.2995
- Mea

### Results

In [42]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.859581
6,XGBRegressor,0.747401
4,Decision Tree,0.716105
3,K-Neighbors Regressor,0.71594
0,Linear Regression,0.711012
1,Lasso,0.710842
2,Ridge,0.709853
7,AdaBoost Regressor,0.557135
