# Introduction
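This project predicts whether a customer will purchase vehicle insurance. The notebook explores the data, label-encodes the categorical features, scales the inputs, balances the training set with SMOTE, and compares a Random Forest with an XGBoost classifier using classification reports, confusion matrices, and ROC-AUC.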

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

## Load Dataset

In [None]:
# Read the training data from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

## Exploratory Data Analysis (EDA)

### Dataset Overview

In [None]:
# Per-column structural summary of the frame.
df.info()
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each feature occupies one row.
df.describe(include='all').transpose()

### Missing Values & Duplicates

In [None]:
# Missing entries per column, followed by the number of fully duplicated rows.
missing_per_column = df.isna().sum()
print(missing_per_column)
duplicate_rows = df.duplicated().sum()
print('Duplicates:', duplicate_rows)

### Target Distribution

In [None]:
# Bar chart of how many rows fall into each value of the target column.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='Response', ax=ax)
ax.set_title('Target Distribution')
plt.show()

### Numerical Feature Distributions

In [None]:
# One histogram (with a KDE overlay) per int64/float64 column.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(feature)
    fig.tight_layout()
    plt.show()

### Categorical Feature Distributions

In [None]:
# One horizontal count plot per object-dtype column, bars sorted by frequency.
cat_cols = list(df.select_dtypes(include=['object']).columns)
for feature in cat_cols:
    # value_counts() sorts descending, so its index is the most-to-least
    # frequent category order.
    by_frequency = df[feature].value_counts().index
    plt.figure(figsize=(6, 3))
    sns.countplot(data=df, y=feature, order=by_frequency)
    plt.title(feature)
    plt.tight_layout()
    plt.show()

### Correlation Heatmap

In [None]:
# Pairwise correlations of the numeric columns only. At this point the frame
# still holds string-valued columns (they are label-encoded later, in the
# Preprocessing section), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so the input is restricted explicitly.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

### Feature vs Target Example

In [None]:
# Compare the spread of Annual_Premium between the two target values.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='Response', y='Annual_Premium', ax=ax)
ax.set_title('Annual Premium by Response')
plt.show()

## Preprocessing

In [None]:
# Drop rows with any missing value before encoding.
df = df.dropna()

# Encode each categorical column with its own LabelEncoder and keep the fitted
# encoders, so every column's integer codes can be mapped back to labels later.
# Refitting one shared encoder would retain only the last column's mapping.
label_encoders = {}
for col in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# NOTE(review): every remaining column is used as a feature. If train.csv
# carries a row-identifier column it should probably be dropped here as
# well -- confirm against the data.
X = df.drop('Response', axis=1)
y = df['Response']

# Stratify so the train and test splits keep the same class ratio. The target
# is treated as imbalanced (SMOTE is applied below), and an unstratified split
# lets the minority share drift between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training split only; the test split is left as drawn.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

## Train Models

In [None]:
# Random forest with 200 trees and a fixed seed. fit() returns the fitted
# estimator, so construction and training are chained.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# XGBoost classifier using `logloss` as its evaluation metric.
xgb = XGBClassifier(eval_metric='logloss').fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

## Evaluation

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Print a classification report and plot the confusion matrix.

    Args:
        name: Display name of the model; used in the printed header and in
            the plot title.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
    """
    print(f"\n=== {name} ===")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    # Start a fresh figure so the heatmap and its colorbar are never drawn on
    # top of whatever figure happens to be current (for example a previous
    # call, under a backend where plt.show() does not close the figure).
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    # confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Report both models against the same test labels, Random Forest first.
for model_name, predictions in (('Random Forest', rf_pred), ('XGBoost', xgb_pred)):
    evaluate_model(model_name, y_test, predictions)

## ROC-AUC

In [None]:
# Keep the second predict_proba column for each model, i.e. the probability
# of the estimator's second class (presumably Response == 1 -- confirm).
rf_proba, xgb_proba = (model.predict_proba(X_test)[:, 1] for model in (rf, xgb))

# Score each model's probabilities against the test labels.
rf_auc = roc_auc_score(y_test, rf_proba)
xgb_auc = roc_auc_score(y_test, xgb_proba)
print('Random Forest AUC:', rf_auc)
print('XGBoost AUC:', xgb_auc)

## Conclusion

In [None]:
# Derive the conclusion from the scores measured in this run instead of
# asserting a winner up front.
auc_by_model = {
    'Random Forest': roc_auc_score(y_test, rf_proba),
    'XGBoost': roc_auc_score(y_test, xgb_proba),
}
best_model = max(auc_by_model, key=auc_by_model.get)
for model_name, auc in auc_by_model.items():
    print(f'{model_name} ROC-AUC: {auc:.4f}')
print(f'Best model by ROC-AUC on the test split: {best_model}')