# Term Deposit Subscription Prediction

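This notebook walks through an end-to-end pipeline for predicting whether a client will subscribe to a term deposit, using marketing campaign data: data loading, exploratory analysis, feature engineering, model training and evaluation, and feature importance.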

In [None]:

import pandas as pd

# Load data
# The CSV is parsed with ';' as the field separator.
DATA_PATH = "bank-additional-full.csv"
df = pd.read_csv(DATA_PATH, sep=';')

# Preview the first rows as the cell output.
df.head()


## Exploratory Data Analysis (EDA)

In [None]:

# Column dtypes and non-null counts; info() writes its report directly.
df.info()

# Relative frequency of each target class. A notebook cell renders only its
# final expression, so this intermediate result must be printed explicitly or
# it is computed and silently discarded.
print(df['y'].value_counts(normalize=True))

# Summary statistics from describe(), rendered as the cell's output.
df.describe()


## Feature Engineering

In [None]:

# This cell rebinds `df`, so every step is written to be safe to re-run:
# executing it a second time must give the same X and y as the first time.

# Binary flag: 0 where pdays == 999 (the value this notebook treats as
# "not contacted before"), 1 otherwise. Vectorised instead of a row-wise apply.
df['contacted_before'] = df['pdays'].ne(999).astype('int64')

# errors='ignore' keeps the cell re-runnable once 'duration' is already gone.
df = df.drop(columns=['duration'], errors='ignore')  # Prevent data leakage

# Encode the target only while it still holds the raw labels. Re-mapping an
# already-numeric column through {'no', 'yes'} would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    y_encoded = df['y'].map({'no': 0, 'yes': 1})
    if y_encoded.isna().any():
        # map() yields NaN for any label outside the dictionary; fail here
        # with a clear message rather than later inside the model code.
        raise ValueError("Column 'y' contains values other than 'no'/'yes'")
    df['y'] = y_encoded

# One-hot encode the remaining object-typed columns; drop_first=True drops the
# first level of each so the dummies are not perfectly collinear.
categorical_cols = df.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split into feature matrix and target vector.
X = df_encoded.drop(columns='y')
y = df_encoded['y']
X.shape, y.shape


## Model Training and Evaluation

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Hold out 20% of the rows for evaluation, splitting in a stratified fashion
# on y with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest of 100 trees with class_weight='balanced' and a fixed seed.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

# Hard class predictions, plus the probability column at index 1
# (the second entry of model.classes_).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Label-based metrics are computed from y_pred; ROC AUC is computed from the
# probability scores.
metric_rows = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall:", recall_score(y_test, y_pred)),
    ("F1 Score:", f1_score(y_test, y_pred)),
    ("ROC AUC:", roc_auc_score(y_test, y_prob)),
]
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

# Confusion matrix followed by the per-class report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


## Feature Importance

In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Importances of the fitted model, paired with the column names of X.
importances = model.feature_importances_
feature_names = X.columns

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Clamp to the number of available features so the bar positions and values
# always have matching lengths, even when there are fewer than 15 features.
top_n = min(15, len(importances))
top_features = feature_names[indices][:top_n]
top_importances = importances[indices][:top_n]

fig, ax = plt.subplots(figsize=(10, 6))
# The title is derived from top_n so it cannot drift from what is plotted.
ax.set_title(f"Top {top_n} Most Important Features")

# Reverse both sequences so the most important feature is drawn at the top
# of the horizontal bar chart.
bar_positions = np.arange(top_n)
ax.barh(bar_positions, top_importances[::-1], align="center")
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features[::-1])
ax.set_xlabel("Feature Importance Score")
fig.tight_layout()
plt.show()
