### Goal
The objective of this competition is to predict which customers respond positively to an automobile insurance offer.

### Evaluation
Submissions are evaluated on the area under the ROC curve (ROC-AUC), computed from the predicted probabilities and the ground-truth targets.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    log_loss,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

### Exploratory Data Analysis

In [None]:
# Load the training data.
df = pd.read_csv("train.csv")

# Store these features as pandas categoricals. Driving_License,
# Previously_Insured and Response keep the dtype they were read with.
categorical_columns = ["Gender", "Vehicle_Age", "Vehicle_Damage"]
df[categorical_columns] = df[categorical_columns].astype("category")

# Use the "id" column as the row index so it is not carried as a feature.
df = df.set_index("id")

df.head()

In [3]:
# Column subsets by dtype, used by the plotting cells that follow.
numeric_df = df.select_dtypes(include=["number"])
categorical_df = df.select_dtypes(include=["category", "object"])

In [None]:
# Compare the row count before and after removing exact duplicate rows.
# drop_duplicates() returns a new frame rather than modifying df, so its
# result must be used directly; calling it as a bare statement (as this cell
# previously did) discards the result and prints the same shape twice.
# df itself is deliberately left unchanged so that frames derived from it in
# other cells keep the same rows.
print(df.shape)
print(df.drop_duplicates().shape)

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Skewness of each numeric column (non-numeric columns are excluded).
df.skew(numeric_only=True)

In [8]:
# Row count for every distinct combination of the categorical columns.
class_counts = categorical_df.value_counts()

In [None]:
# Draw one count plot per categorical feature.
for feature in categorical_df:
    plt.figure(figsize=(8, 4))
    sns.countplot(data=categorical_df, x=feature)
    plt.title(f'Distribution of {feature}')
    plt.show()

In [None]:
# One histogram (with KDE overlay) per numeric feature, split by Response.
for column in numeric_df.columns:
    if column == "Response":
        # Response is the hue variable; a histogram of the target coloured by
        # itself carries no information.
        continue
    sns.histplot(
        # Plot the column of the current iteration. The previous hard-coded
        # numeric_df.columns[1] redrew the same feature on every pass.
        x=column,
        hue=df["Response"],
        data=numeric_df,
        kde=True,
    )
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as a heatmap.
corr_matrix = df.corr(numeric_only=True)

# Hide the cells strictly above the diagonal (k=1 keeps the diagonal visible).
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

heatmap_options = {
    "annot": True,
    "cmap": "coolwarm",
    "mask": mask,
    "linewidths": 0.7,
    "center": 0,
}
plt.figure(figsize=(8, 4))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Age distribution in 10 bins, with one overlaid layer per Response value.
sns.histplot(
    data=df,
    x="Age",
    hue="Response",
    bins=10,
    stat="count",
    multiple="layer",
    kde=True,
    color="green",
)
plt.show()

In [None]:
# Annual premium by vehicle age, with separate boxes per Response value.
sns.boxplot(
    data=df,
    x="Vehicle_Age",
    y="Annual_Premium",
    hue="Response",
    legend=True,
    # Unfilled, thin-outlined, narrow boxes in the "cool" palette.
    palette="cool",
    fill=False,
    linewidth=1.5,
    width=0.4,
)
plt.show()

### Preprocessing

In [20]:
# Columns that are passed through the scaler.
scale_columns = [
    "Age",
    "Region_Code",
    "Annual_Premium",
    "Policy_Sales_Channel",
    "Vintage",
]

In [21]:
# Standardise the selected columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so held-out rows influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[scale_columns])

# fit_transform returns a bare array, so the row labels must be reattached.
# Without index=df.index the frame gets a fresh 0..n-1 index, and index-aligned
# operations against df (column-wise concat, passing df["Response"] as a plot
# hue) would only line up if the "id" values happened to be exactly 0..n-1.
scaled_df = pd.DataFrame(X_scaled, columns=scale_columns, index=df.index)

In [None]:
# Preview the first rows of the scaled features.
scaled_df.head()

In [None]:
# Same Age histogram as before, now on the scaled values. The hue is taken
# from df because scaled_df does not contain the Response column.
sns.histplot(
    data=scaled_df,
    x="Age",
    hue=df["Response"],
    bins=10,
    stat="count",
    multiple="layer",
    kde=True,
    color="green",
)
plt.show()

In [None]:
# Scaled annual premium by vehicle age and Response. The grouping variables
# come from df because scaled_df only holds the scaled numeric columns.
sns.boxplot(
    data=scaled_df,
    x=df["Vehicle_Age"],
    y="Annual_Premium",
    hue=df["Response"],
    legend=True,
    palette="cool",
    fill=False,
    linewidth=1.5,
    width=0.4,
)
plt.show()

In [24]:
# Names of all scaled and categorical columns, i.e. the columns of df that
# are already represented by scaled_df or categorical_df.
drop_cols = scaled_df.columns.append(categorical_df.columns)

In [None]:
# Reassemble the table column-wise: scaled numerics, categoricals, then the
# columns of df that belong to neither group. A column-wise concat matches
# rows by index label, so the three frames need to share the same index.
remaining_df = df.drop(columns=drop_cols)
combined_df = pd.concat([scaled_df, categorical_df, remaining_df], axis=1)

combined_df.head()

In [None]:
# List the columns of the combined frame.
combined_df.columns

In [None]:
# List the categorical columns.
categorical_df.columns

In [None]:
# Columns that go into the modelling table, in their final order.
model_columns = [
    "Age",
    "Region_Code",
    "Annual_Premium",
    "Policy_Sales_Channel",
    "Vintage",
    "Gender",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Driving_License",
    "Previously_Insured",
    "Response",
]

# One-hot encode the categorical columns of that selection.
encoded_df = pd.get_dummies(combined_df[model_columns])
encoded_df.head()

In [29]:
# Replace the Vehicle_Age dummy column names with versions free of spaces,
# "-", "<" and ">".
# NOTE(review): presumably needed because XGBoost rejects feature names
# containing "<" — confirm against the installed version.
vehicle_age_renames = {
    "Vehicle_Age_1-2 Year": "Vehicle_Age_1_2_Year",
    "Vehicle_Age_< 1 Year": "Vehicle_Age_less_1_Year",
    "Vehicle_Age_> 2 Years": "Vehicle_Age_greater_2_Years",
}
encoded_df = encoded_df.rename(columns=vehicle_age_renames)

### Train Model

In [30]:
# Separate the target from the features.
y = encoded_df["Response"]
X = encoded_df.drop(columns=["Response"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the XGBoost classifier. enable_categorical=True is the only argument
# passed; every other hyperparameter is left at the library default.
model = xgb.XGBClassifier(
    enable_categorical=True,
)

# Fit on the training split.
model.fit(X_train, y_train)

In [32]:
# Score the held-out rows: hard class labels, plus the probability of the
# positive class (column 1 of predict_proba's output).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
# ROC-AUC on the held-out set, computed from the positive-class probabilities.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.2f}')

# ROC curve, with a dashed diagonal drawn for reference.
fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')

# Axis limits; the y axis extends slightly above 1.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Accuracy of the predicted class labels on the held-out set.
accuracy_score(y_test,y_pred)

In [None]:
# Text classification report for the held-out predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Log loss of the predicted positive-class probabilities on the held-out set.
test_log_loss = log_loss(y_test, y_pred_proba)
print(f"Log Loss: {test_log_loss}")

In [None]:
# Importance score of each training feature, as reported by the fitted model.
features = X_train.columns
feature_importance = model.feature_importances_

# Horizontal bar chart with one bar per feature.
plt.figure(figsize=(10, 6))
plt.barh(features, feature_importance)
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.show()