<a href="https://colab.research.google.com/github/pmargarete/CCADMACL_EXERCISES_COM222/blob/main/Exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [141]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

warnings.filterwarnings("ignore")

In [142]:
!pip install --user xgboost



In [143]:
#LOAD
train = pd.read_csv("train.csv")

In [144]:
test = pd.read_csv("test.csv")

In [145]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [146]:
train.sample()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
84800,84800,25.0,Female,1223.0,Divorced,2.0,Bachelor's,Unemployed,9.371225,Rural,...,2.0,7.0,786.0,7.0,2022-12-06 15:21:39.219432,Good,Yes,Weekly,Apartment,811.0


In [147]:
#EXTRACT FEATURES

train['Premium Amount'] = np.log1p(train['Premium Amount'])

X, y = train.drop(['Customer Feedback', 'Policy Start Date', 'Premium Amount'], axis=1), train[['Premium Amount']]

In [148]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [149]:

categorical_features = [col for col in X_train.select_dtypes(include=["object"]).columns]
numerical_features = [col for col in X_train.select_dtypes(include=["float64", "int64"]).columns]

In [150]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy='mean')), # Impute missing numerical values with the mean
            ('scaler', StandardScaler())
        ]), numerical_features),
    ]
)

In [153]:
# Create regression matrices
# Convert 'object' columns to 'category' before creating DMatrix
#for col in X_train.select_dtypes(include=['object']).columns:
#    X_train[col] = X_train[col].astype('category')
#    X_test[col] = X_test[col].astype('category')

In [154]:
#PREPROCESS
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [155]:
# PROCESSED DATA to DF for DMATRIX
X_train_encoded = pd.DataFrame(X_train_encoded, columns=preprocessor.get_feature_names_out())
X_test_encoded = pd.DataFrame(X_test_encoded, columns=preprocessor.get_feature_names_out())


In [156]:
#Dmatrix
dtrain_reg = xgb.DMatrix(X_train_encoded, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test_encoded, y_test, enable_categorical=True)

In [157]:
# SET XGB PARAMETERS
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.01,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 3,
    "lambda": 1.0,  # L2 regularization
    "alpha": 0.5    # L1 regularization
}

In [158]:

# TRAIN THE MODEL
evals = [(dtrain_reg, "train"), (dtest_reg, "valid")]
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=200,
    evals=evals,
    early_stopping_rounds=50
)

[0]	train-rmse:1.09497	valid-rmse:1.09572
[1]	train-rmse:1.09429	valid-rmse:1.09504
[2]	train-rmse:1.09397	valid-rmse:1.09472
[3]	train-rmse:1.09342	valid-rmse:1.09417
[4]	train-rmse:1.09288	valid-rmse:1.09364
[5]	train-rmse:1.09280	valid-rmse:1.09357
[6]	train-rmse:1.09279	valid-rmse:1.09356
[7]	train-rmse:1.09254	valid-rmse:1.09331
[8]	train-rmse:1.09245	valid-rmse:1.09323
[9]	train-rmse:1.09193	valid-rmse:1.09271
[10]	train-rmse:1.09142	valid-rmse:1.09220
[11]	train-rmse:1.09134	valid-rmse:1.09213
[12]	train-rmse:1.09084	valid-rmse:1.09163
[13]	train-rmse:1.09022	valid-rmse:1.09102
[14]	train-rmse:1.08962	valid-rmse:1.09043
[15]	train-rmse:1.08960	valid-rmse:1.09041
[16]	train-rmse:1.08901	valid-rmse:1.08983
[17]	train-rmse:1.08863	valid-rmse:1.08945
[18]	train-rmse:1.08836	valid-rmse:1.08918
[19]	train-rmse:1.08779	valid-rmse:1.08862
[20]	train-rmse:1.08724	valid-rmse:1.08807
[21]	train-rmse:1.08669	valid-rmse:1.08753
[22]	train-rmse:1.08649	valid-rmse:1.08733
[23]	train-rmse:1.085

In [166]:
original_test = pd.read_csv("test.csv")

In [167]:
# DROP SAME COLUMNS
original_test = original_test.drop(['Policy Start Date', 'Customer Feedback'], axis=1)
print(f"Test size after dropping columns: {original_test.shape}")

Test size after dropping columns: (800000, 18)


In [168]:
test_encoded = preprocessor.transform(original_test)
test_encoded = pd.DataFrame(test_encoded, columns=preprocessor.get_feature_names_out())


In [169]:
#CHECK
print(f"Test size after preprocessing: {test_encoded.shape}")

Test size after preprocessing: (800000, 36)


In [171]:
# CREATE DMATRIX FOR PREDICTION
dtest_pred = xgb.DMatrix(test_encoded, enable_categorical=True, feature_names=list(test_encoded.columns))
print(f"DMatrix rows: {dtest_pred.num_row()}")

DMatrix rows: 800000


In [172]:
preds = model.predict(dtest_pred)


In [173]:
#REVERSE LOG TRANSFORMATION
preds_original = np.expm1(preds)


In [174]:
y_test_original = np.expm1(y_test)
rmse = mean_squared_log_error(y_test_original, np.expm1(model.predict(dtest_reg)), squared=False)  # RMSLE
print(f"RMSLE of the model: {rmse:.4f}")

RMSLE of the model: 1.0620


In [175]:
print(f"Test size: {test_encoded.shape}")

Test size: (800000, 36)


In [176]:
# SAVE PREDICTIONS TO CSV
submission_df = pd.DataFrame({
    "id": original_test["id"],
    "Premium Amount": preds_original
})
submission_df.to_csv("submission_file.csv", index=False)
print("Submission file created: submission_file.csv")

Submission file created: submission_file.csv
