# Life Insurance Cross-Sell Prediction

This notebook demonstrates an end-to-end machine learning workflow for predicting which existing insurance customers are most likely to purchase life insurance.
We use a synthetic dataset (`cross_sell_data.csv`) to simulate real-world cross-sell modeling.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Read the synthetic cross-sell CSV (path is relative to the notebook's
# working directory) into `df`, then preview the first five rows as the
# cell's rich output.
df = pd.read_csv(filepath_or_buffer='cross_sell_data.csv')
df.head(n=5)


In [None]:
# Exploratory Data Analysis
# Summary statistics for every column; include='all' asks pandas to report
# on non-numeric columns as well as numeric ones.
summary_stats = df.describe(include='all')
print(summary_stats)

# Class balance of the target: one bar per distinct `cross_sell_flag` value.
fig, ax = plt.subplots()
sns.countplot(data=df, x='cross_sell_flag', ax=ax)
ax.set_title('Cross-Sell Target Distribution')
plt.show()

# Histogram of customer age (20 bins) with a kernel-density overlay.
fig, ax = plt.subplots()
sns.histplot(df['age'], bins=20, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()


In [None]:
# Encode categorical variable
# `income_bracket` is one-hot encoded instead of label-encoded. LabelEncoder
# is meant for target labels; applied to a feature it assigns integer codes
# in sorted (alphabetical) order, which imposes an arbitrary ranking and
# spacing on the brackets that Logistic Regression would treat as a genuine
# numeric scale. get_dummies returns a new frame (`df` itself is untouched)
# with one `income_bracket_<value>` 0/1 column per category.
df_encoded = pd.get_dummies(df, columns=['income_bracket'], dtype=int)

# Features and target
# The identifier, the target itself and the two life-policy count columns are
# removed from the feature matrix (presumably the life-policy counts would
# leak the outcome being predicted; TODO confirm against the data dictionary).
X = df_encoded.drop(columns=['customer_id','cross_sell_flag','life_policies_2024','life_policies_2025'])
y = df_encoded['cross_sell_flag']

# 70/30 train/test split. `stratify=y` keeps the target class proportions the
# same in both partitions, and the fixed `random_state` makes the split
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Candidate classifiers keyed by display name. The fitted estimators stay in
# this dict after the loop, so later cells can look them up by name.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Test-set ROC AUC per model name, filled in by the loop below so the models
# can be compared after all of them have been evaluated.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second entry in
    # model.classes_, used here as the positive-class score for the AUC.
    y_proba = model.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_proba)
    results[name] = auc
    print(f"\n{name} - AUC: {auc:.3f}")
    print(classification_report(y_test, y_pred))
    # Pin row/column order to the fitted classes and label the axes with the
    # same values, so the plot always shows real class labels.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()
    disp.ax_.set_title(f'{name} - Confusion Matrix')
    plt.show()

# Side-by-side comparison of the collected AUCs, best model first.
print("\nAUC summary (best first):")
for model_name, model_auc in sorted(results.items(), key=lambda item: item[1], reverse=True):
    print(f"  {model_name}: {model_auc:.3f}")


In [None]:
# Feature importance from Random Forest
# Look up the forest fitted in the training loop and pair each of its
# importance scores with the matching feature-matrix column name.
rf = models['Random Forest']
importances = pd.Series(data=rf.feature_importances_, index=X.columns)

# Horizontal bars drawn from the ascending-sorted scores.
ranked_importances = importances.sort_values(ascending=True)
ax = ranked_importances.plot.barh()
ax.set_title('Feature Importance - Random Forest')
plt.show()


In [None]:
# Business Insights
# Narrative summary printed for the reader. The closing delimiter is exactly
# three quotes: a fourth quote starts a new, unterminated string literal and
# makes the whole cell a SyntaxError.
# NOTE(review): these statements are hard-coded text, not computed from the
# model outputs above; confirm they agree with the printed AUCs and the
# feature-importance plot before sharing.
KEY_TAKEAWAYS = """Key takeaways:
- Gradient Boosting / Random Forest typically outperform Logistic Regression in predictive power.
- Policy mix (number of home/auto/farm policies) and income bracket are strong indicators of cross-sell success.
- Customers with no recent claims are more receptive to new offers.
"""
print(KEY_TAKEAWAYS)