In [None]:
# Importing libraries

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, auc, RocCurveDisplay
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer

In [None]:
# Load the train / test CSV files from the current working directory
BASE_URL = os.getcwd()  # a local directory path, despite the name

# Build both file paths with os.path.join so the separator is platform-correct
train_file_path, test_file_path = (
    os.path.join(BASE_URL, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Parse each CSV into its own DataFrame
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)


In [None]:
# Legacy loader kept for reference only: everything below sits inside a bare
# triple-quoted string, so none of these statements are executed.
# NOTE(review): it hard-codes a machine-specific absolute path; the portable
# os.getcwd()-based loader is the one actually used. Consider deleting this cell.
"""
BASE_URL = 'C:/Users/stede/OneDrive/locdocs/professionAI/fondamenti di machine learning/progetto finale/'
df_train = pd.read_csv(BASE_URL + 'train.csv')
df_test = pd.read_csv(BASE_URL + 'test.csv')
"""

### Data Preprocessing

In [None]:
# Normalise the column names of both frames (trim surrounding whitespace,
# turn inner spaces into underscores) to guard against typing errors.
for frame in (df_train, df_test):
    frame.columns = frame.columns.str.strip().str.replace(' ', '_')

# Only the training frame loses its 'id' column here; df_test keeps it.
df_train = df_train.drop(columns=['id'])

In [None]:
# Quick look at the first rows of the training frame.
df_train.head()

In [None]:
# Count the missing values in every column of the training frame

df_train.isna().sum()

### Data Visualization

In [None]:
# Distribution of the target variable 'Response'

"""
From the graph we can see that the target variable 'Response' is strongly unbalanced.
"""

# One bar per target class; the title is set on the Axes seaborn returns
response_ax = sns.countplot(data=df_train, x='Response')
response_ax.set_title('Distribution of Response')
plt.show()

In [None]:
# Distribution of 'Age'

# 'Age' is numeric, so it is drawn on a numeric axis with one bar per year.
# A categorical count plot places its bars at positions 0..k-1, so ticks
# computed from the age values would not line up with the bars and the axis
# would be mislabeled.
sns.histplot(data=df_train, x='Age', discrete=True)
plt.title('Age Distribution')

# Tick every 5 years, starting at the multiple of 5 at or below the youngest age
age_min = int(df_train['Age'].min())
age_max = int(df_train['Age'].max())
plt.xticks(range(age_min - age_min % 5, age_max + 1, 5))
plt.show()

In [None]:
# Feature groups used by the plots, the scaler and the oversampler:
# 'cat_var' holds the categorical columns, 'num_var' the numerical ones.

cat_var = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]
num_var = [
    'Age',
    'Vintage',
    'Annual_Premium',
]

In [None]:
# One count plot per categorical feature, each shown as its own figure

for cat_col in cat_var:
    cat_ax = sns.countplot(data=df_train, x=cat_col)
    cat_ax.set_title(f'Distribution of {cat_col}')
    plt.show()

In [None]:
# Relationship between quantitative features and the target variable

"""
We can notice that the Age can be a factor on the forecasted response of 
the customer, as the older ones tend to respond more positively to the offer.
"""

# One box plot per numeric feature, split by the value of 'Response'
for num_col in num_var:
    box_ax = sns.boxplot(data=df_train, x='Response', y=num_col)
    box_ax.set_title(f'{num_col} vs. Response')
    plt.show()


### Feature Encoding

In [None]:
# Encoding the categorical features.
# Every encoder is fitted on the training frame and only applied to the test
# frame, so both frames share one label -> integer mapping.

# Plain text columns: a fresh LabelEncoder per column
for text_col in ('Gender', 'Vehicle_Damage'):
    LabEnc = LabelEncoder()
    df_train[text_col] = LabEnc.fit_transform(df_train[text_col])
    df_test[text_col] = LabEnc.transform(df_test[text_col])

# Region codes are cast to strings first, then label-encoded like the others
for frame in (df_train, df_test):
    frame["Region_Code"] = frame["Region_Code"].astype("str")
LabEnc = LabelEncoder()
df_train['Region_Code'] = LabEnc.fit_transform(df_train['Region_Code'])
df_test['Region_Code'] = LabEnc.transform(df_test['Region_Code'])

# Vehicle age has a natural order, so it gets an explicit ordinal mapping.
# Values missing from the dictionary become NaN under Series.map.
Vehicle_Age = {'< 1 Year':0, '1-2 Year':1, '> 2 Years':2}
for frame in (df_train, df_test):
    frame['Vehicle_Age'] = frame['Vehicle_Age'].map(Vehicle_Age)

In [None]:
# Standardise the numeric features: the scaler is fitted on the training
# frame only and then applied to both frames.
StdScl = StandardScaler().fit(df_train[num_var])

df_train[num_var] = StdScl.transform(df_train[num_var])
df_test[num_var] = StdScl.transform(df_test[num_var])
# NOTE(review): this runs before train_test_split further down, so rows later
# held out for evaluation also contribute to the scaler statistics. Consider
# fitting the scaler after the split.

In [None]:
"""
The features that are most correlated with the target variable seem to be 
'Vehicle_Damage' (positively correlated) and 'Previously_Insured' 
(negatively correlated). This makes sense, as customers who have had an 
incident in the past will be more likely to take out an insurance policy. 
Those who already have an insured vehicle, are more unlikely to have an 
additional vehicle to insure.
"""

# Pairwise correlations of all columns, annotated, on a 10x6 inch canvas
heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df_train.corr(),
    annot=True,
    annot_kws={'size': 9},
    cmap='coolwarm',
    ax=heat_ax,
)
heat_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Checking that the feature encoding and scaling took effect on the dataset:
# the encoded columns should now show integers and the numeric ones scaled values.

df_train.head()

### Training models

In [None]:
# 'RS' (RANDOM_SEED) is the single value passed to every 'random_state' attribute
RS = 25

# Target column on one side, every remaining column as features on the other
y = df_train['Response']
X = df_train.drop(columns=['Response'])

In [None]:
"""
We use the 'stratify' parameter to ensure that the distribution of the target variable 'Response' 
is preserved in both the training and testing sets. 
By setting the 'shuffle' parameter as True, we want to avoid any possible ordering in the dataset,
like taking all the observations for our test set from one specific Region Code.
"""

# 75 / 25 stratified, shuffled split with a fixed seed
split_options = dict(test_size=0.25, random_state=RS, stratify=y, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
"""
By performing oversampling with the SMOTE_NC method (an extension of the SMOTE algorithm) 
we are able to effectively handles datasets with both quantitative and categorical features.
https://imbalanced-learn.org/stable/over_sampling.html
"""

# Boolean mask marking which columns SMOTENC must treat as categorical.
# It is built from the frame that is actually resampled.
cat_var_mask = X_train.columns.isin(cat_var)

smotenc = SMOTENC(categorical_features=cat_var_mask, random_state=RS)

# Oversample ONLY the training split and write the result back to
# X_train / y_train, the names every model below is fitted on.
# Assigning the output to X / y would leave the models training on the
# original imbalanced split and would overwrite the full dataset.
# X_test / y_test stay untouched so evaluation uses real observations only.
X_train, y_train = smotenc.fit_resample(X_train, y_train)

### RandomForestClassifier

In [None]:
# Hyper-parameters gathered in one place; class_weight='balanced' re-weights
# the classes inversely to their frequency.
# NOTE(review): the origin of n_estimators=62 / max_depth=17 is not shown here.
rf_params = dict(n_estimators=62, max_depth=17, random_state=RS, class_weight='balanced')
RanFor = RandomForestClassifier(**rf_params)

RanFor.fit(X_train, y_train)

In [None]:
# Hard labels and class-probability matrices for both splits
y_pred_train, y_proba_train = RanFor.predict(X_train), RanFor.predict_proba(X_train)
y_pred_test, y_proba_test = RanFor.predict(X_test), RanFor.predict_proba(X_test)

# Per-class precision / recall / F1: training split first, then the test split
for report_title, y_true, y_hat in (
    ("TRAIN REPORT - RandomForestClassifier", y_train, y_pred_train),
    ("TEST REPORT - RandomForestClassifier", y_test, y_pred_test),
):
    print(report_title)
    print(classification_report(y_true, y_hat))

In [None]:
# Keep only column 1 of each probability matrix (the second class in classes_)
y_proba_train, y_proba_test = (
    RanFor.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC-AUC on the training split, then on the test split
for y_true, y_score in ((y_train, y_proba_train), (y_test, y_proba_test)):
    print(roc_auc_score(y_true, y_score))

In [None]:
"""
Let's comment on the results obtained with the RandomForestClassifier:

For class 0, we have high precision but low recall. This means that when 
the model predicts a customer is in class 0 (not interested), it is often 
correct, but it misses a large portion of the total actual 0s, so it 
wrongly classifies many actual 0s (not interested) as 1s (interested).

For class 1, we have lower precision but higher recall. This means the model 
identifies a high proportion of actual 1s (interested) correctly, but in 
doing so, it also wrongly identifies many 0s (not interested) as 1s 
(interested), leading to a high number of false positives.

The ROC-AUC score of 92.21% on the test set is actually not a bad result.
Since this model is more liberal in predicting class 1, it could be useful 
for initial broad-based marketing efforts.

Let's see if we can find a more precise model though...
"""

### CatBoostClassifier

In [None]:
# Early stopping picks the number of boosting rounds by watching eval_set.
# Monitoring the test split there would tune the model on the data later used
# to report its score, so a validation part is carved out of the training
# split and the test split stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RS, stratify=y_train
)

# Seeded like the other models in this notebook so runs are reproducible
CatBst = CatBoostClassifier(verbose=100, random_seed=RS)

CatBst.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=10)

In [None]:
# Hard labels and class-probability matrices for both splits
y_pred_train, y_proba_train = CatBst.predict(X_train), CatBst.predict_proba(X_train)
y_pred_test, y_proba_test = CatBst.predict(X_test), CatBst.predict_proba(X_test)

# Classification report per split: training first, then test
catboost_reports = (
    ("TRAIN REPORT - CatBoostClassifier", y_train, y_pred_train),
    ("TEST REPORT - CatBoostClassifier", y_test, y_pred_test),
)
for report_title, y_true, y_hat in catboost_reports:
    print(report_title)
    print(classification_report(y_true, y_hat))

In [None]:
# Keep only column 1 of each probability matrix (the second class in classes_)
y_proba_train, y_proba_test = (
    CatBst.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC-AUC on the training split, then on the test split
for y_true, y_score in ((y_train, y_proba_train), (y_test, y_proba_test)):
    print(roc_auc_score(y_true, y_score))

In [None]:
"""
The CatBoostClassifier gave a ROC-AUC score of 96.27% on the test set.
It suggests that the model can accurately identify positive instances 
while keeping the number of false positives low.
"""

### XGBClassifier

In [None]:
# Disabled: XGBClassifier is not imported in this notebook.
# Kept as '#' comments so the cell parses on Restart & Run All (an unclosed
# triple-quoted string is a SyntaxError). To re-enable, add
# `from xgboost import XGBClassifier` to the imports cell and uncomment.
#
# XGBC = XGBClassifier(random_state=RS)
# XGBC.fit(X_train, y_train)

In [None]:
# Disabled: depends on the XGBC model, which is not fitted in this notebook.
# Kept as '#' comments so the cell parses (an unclosed triple-quoted string
# is a SyntaxError).
#
# y_pred_train = XGBC.predict(X_train)
# y_proba_train = XGBC.predict_proba(X_train)
# y_pred_test = XGBC.predict(X_test)
# y_proba_test = XGBC.predict_proba(X_test)
#
# print("TRAIN REPORT - XGBClassifier")
# print(classification_report(y_train, y_pred_train))
# print("TEST REPORT - XGBClassifier")
# print(classification_report(y_test, y_pred_test))

In [None]:
# Disabled: depends on the XGBC model, which is not fitted in this notebook.
# Kept as '#' comments so the cell parses (an unclosed triple-quoted string
# is a SyntaxError).
#
# y_proba_train = XGBC.predict_proba(X_train)[:, 1]
# y_proba_test = XGBC.predict_proba(X_test)[:, 1]
#
# print(roc_auc_score(y_train, y_proba_train))
# print(roc_auc_score(y_test, y_proba_test))

In [None]:
""" 
With a ROC-AUC of 96.51% on the test set, the XGBClassifier represents
an improvement on the CatBoostClassifier, and it also proved to be
less computationally expensive.
We are going to use this model for our predictions on the test set:
"""

In [None]:
# Disabled: this export step needs the XGBC model, which is not fitted in
# this notebook. Kept as '#' comments so the cell parses (an unclosed
# triple-quoted string is a SyntaxError).
# The output path is built with os.path.join because BASE_URL comes from
# os.getcwd() and has no trailing separator; plain string concatenation would
# glue the file name onto the directory name.
#
# # Applying the model to the test data
# df_test_id = df_test['id']
# df_test = df_test.drop(['id'], axis=1)
# df_test_pred = XGBC.predict(df_test)
# df_test_pred_proba = XGBC.predict_proba(df_test)
#
# # Creating a new dataframe for the predictions
# df_predictions = pd.DataFrame({
#     'id': df_test_id,
#     'Response': df_test_pred,
#     'Probability_0': df_test_pred_proba[:, 0],
#     'Probability_1': df_test_pred_proba[:, 1]
# })
#
# df_predictions.to_excel(os.path.join(BASE_URL, 'vehicle_insurance_predictions.xlsx'), index=False)


### GradientBoostingClassifier

In [None]:
# Gradient boosting with library defaults; only the seed is pinned
gb_params = {'random_state': RS}
GrdBst = GradientBoostingClassifier(**gb_params)

GrdBst.fit(X_train, y_train)

In [None]:
# Hard labels and class-probability matrices for both splits
y_pred_train, y_proba_train = GrdBst.predict(X_train), GrdBst.predict_proba(X_train)
y_pred_test, y_proba_test = GrdBst.predict(X_test), GrdBst.predict_proba(X_test)

# Classification report per split: training first, then test
for split_name, y_true, y_hat in (
    ("TRAIN", y_train, y_pred_train),
    ("TEST", y_test, y_pred_test),
):
    print(split_name + " REPORT - GradientBoostingClassifier")
    print(classification_report(y_true, y_hat))

In [None]:
# Keep only column 1 of each probability matrix (the second class in classes_)
y_proba_train, y_proba_test = (
    GrdBst.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC-AUC on the training split, then on the test split
for y_true, y_score in ((y_train, y_proba_train), (y_test, y_proba_test)):
    print(roc_auc_score(y_true, y_score))

In [None]:
"""
The GradientBoostingClassifier shows many false positives, even more than
the RandomForestClassifier, and it actually doesn't represent an 
improvement over any of the models we saw above.
"""