In [None]:
# Step 1: Load Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2: Load the Dataset
# The CSV path is relative to the notebook's working directory.
file_path = './data/bank.csv'
data = pd.read_csv(filepath_or_buffer=file_path)



In [None]:
# Step 3: Clean and map target column
# Normalise the raw 'deposit' labels (trim whitespace, lower-case) and encode
# them as integers: 'yes' -> 1, 'no' -> 0.
#
# The dtype guard makes this cell safe to re-run: once the column has been
# mapped to numbers, calling `.str` on it would raise AttributeError, so the
# cleaning step is skipped on subsequent executions.
if not pd.api.types.is_numeric_dtype(data['deposit']):
    deposit_labels = data['deposit'].str.strip().str.lower()
    deposit_encoded = deposit_labels.map({'yes': 1, 'no': 0})

    # `.map` silently converts any label missing from the mapping into NaN.
    # Fail here with an explicit message instead of letting NaN targets reach
    # the later split / model-fitting cells.
    unmapped = deposit_labels[deposit_encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected values in 'deposit' column: {list(unmapped)}"
        )

    data['deposit'] = deposit_encoded

print("Cleaned target values:\n", data['deposit'].value_counts())

In [None]:
# Step 4: Basic Info
# Show the frame's structure (columns, dtypes, non-null counts) followed by
# summary statistics of the numeric columns.
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" line after the report.
data.info()
print("\nStatistical Summary:")
print(data.describe())

In [None]:
# Step 5: Encode Categorical Features (FIXED)
# One-hot encode every object-dtype column of `data`. `drop_first=True` omits
# the first level of each column, so a column with k categories yields k-1
# indicator columns. The result is stored in a new frame, `data_encoded`.
categorical_cols = data.select_dtypes(include=['object']).columns
data_encoded = pd.get_dummies(
    data,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Step 6: Scale Numerical Features
# Standardise the listed numeric columns of `data_encoded` in place.
# NOTE(review): 'duration' is not in this list -- if the dataset has such a
# numeric column it is passed to the models unscaled; confirm this is intended.
numerical_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
scaler = StandardScaler()

# Fit the scaler on the training rows only. Fitting on every row (as a plain
# fit_transform over the whole frame does) lets the test set's mean/variance
# influence the preprocessing, i.e. leaks test information into training.
# The split parameters below mirror the train/test split in Step 7
# (test_size=0.2, random_state=42, stratified on 'deposit') so that the same
# rows are selected as training rows; keep the two in sync.
train_rows, _ = train_test_split(
    data_encoded.index.to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=data_encoded['deposit'],
)
scaler.fit(data_encoded.loc[train_rows, numerical_cols])

# Apply the training-set statistics to all rows (train and test alike).
data_encoded[numerical_cols] = scaler.transform(data_encoded[numerical_cols])

In [None]:
# Step 7: Train-Test Split
# Separate the target from the predictors, then hold out 20% of the rows for
# testing. Stratifying on `y` keeps the class proportions equal in both parts;
# the fixed random_state makes the split reproducible.
y = data_encoded['deposit']
X = data_encoded.drop(columns=['deposit'])

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain/Test split completed.")
n_train, n_test = len(X_train), len(X_test)
print(f"Training samples: {n_train}, Test samples: {n_test}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 8: Initialize and fit the Logistic Regression model
# (`fit` returns the estimator itself, so construction and fitting are chained.)
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Step 9: Predict on the test set using Logistic Regression
y_pred = model.predict(X_test)

# Step 10: Evaluate Logistic Regression model
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(baseline_cm)

print("\nClassification Report:")
print(baseline_report)

print(f"\nAccuracy: {baseline_accuracy:.4f}")


In [None]:
# Refit Logistic Regression with the liblinear solver and a higher iteration
# cap. This rebinds `model` and `y_pred`, replacing the values from Steps 8-9.
liblinear_params = {'max_iter': 2000, 'solver': 'liblinear', 'random_state': 42}
model = LogisticRegression(**liblinear_params)
model.fit(X_train, y_train)

# Step 11: Predict and evaluate with tuned Logistic Regression (liblinear)
y_pred = model.predict(X_test)

for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(result)

print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")



In [None]:
# Step 12: Cross-validation score for Logistic Regression
# 5-fold cross-validated accuracy of the current `model` over the full X / y.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
cv_mean, cv_std = scores.mean(), scores.std()
print(f"CV Accuracy: {cv_mean:.4f} ± {cv_std:.4f}")


In [None]:
# Step 13: Cross-validation score for Random Forest Classifier
# 5-fold cross-validated accuracy of a 100-tree forest over the full X / y.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring='accuracy', cv=5)
rf_mean, rf_std = rf_scores.mean(), rf_scores.std()
print(f"Random Forest CV Accuracy: {rf_mean:.4f} ± {rf_std:.4f}")


In [None]:
# Step 14: Feature importance from Random Forest
# Fit the forest on the training split, label each importance with its column
# name, and print the ten largest.
rf_model.fit(X_train, y_train)
importances = pd.Series(data=rf_model.feature_importances_, index=X.columns)
ranked_importances = importances.sort_values(ascending=False)
print(ranked_importances.iloc[:10])

In [None]:
import matplotlib.pyplot as plt

# Data
# Derive the class counts from the encoded target (1 = subscribed,
# 0 = not subscribed, as mapped in Step 3) instead of hard-coding them, so the
# chart always reflects the dataset that was actually loaded.
subscribed = int((data['deposit'] == 1).sum())
not_subscribed = int((data['deposit'] == 0).sum())
subscription_counts = pd.Series({
    'Subscribed (1)': subscribed,
    'Not Subscribed (0)': not_subscribed
})

# Plot
# Bar chart of the two class counts; also written to disk as a PNG.
plt.figure(figsize=(6, 4))
subscription_counts.plot(kind='bar', color=['green', 'red'])
plt.title('Client Subscription Distribution')
plt.ylabel('Number of Clients')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig("plot_subscription_distribution.png")
plt.show()


In [None]:
# Data
# Reuses the `subscribed` / `not_subscribed` counts defined in the bar-chart cell.
labels, colors = ['Subscribed', 'Not Subscribed'], ['green', 'red']
sizes = [subscribed, not_subscribed]

# Plot
# Pie chart of the class shares with one-decimal percentage labels; also
# written to disk as a PNG.
plt.figure(figsize=(5, 5))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Subscription Percentage')
plt.tight_layout()
plt.savefig("plot_subscription_pie.png")
plt.show()


In [None]:
import seaborn as sns

# Metrics
# Compute the scores from the held-out predictions rather than hard-coding
# numbers copied from an earlier run, so the chart cannot drift from the model.
# `y_pred` holds the predictions of whichever model cell ran last (on a
# top-to-bottom run, the liblinear Logistic Regression).
from sklearn.metrics import precision_score, recall_score

model_accuracy = accuracy_score(y_test, y_pred)
# Precision / recall are reported for the positive class (label 1 = subscribed).
precision_1 = precision_score(y_test, y_pred, pos_label=1)
recall_1 = recall_score(y_test, y_pred, pos_label=1)

# Plot
# Bar chart of the three scores on a fixed 0-1 axis; also written to disk as a PNG.
plt.figure(figsize=(6, 4))
sns.barplot(
    x=['Accuracy', 'Precision (Subscribed)', 'Recall (Subscribed)'],
    y=[model_accuracy, precision_1, recall_1],
    palette='Blues_d'
)
plt.ylim(0, 1)
plt.ylabel('Score')
plt.title('Model Performance Metrics')
plt.tight_layout()
plt.savefig("plot_model_metrics.png")
plt.show()


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Single-feature baseline: train Logistic Regression on only the feature whose
# absolute correlation with the target is highest.
# `data_encoded` is the processed dataset and `deposit` its binary target column.
#
# The split is performed BEFORE the feature is chosen, and the correlation is
# measured on the training rows only. Ranking features with correlations
# computed over all rows would let the test set influence feature selection.
y = data_encoded['deposit']
all_features = data_encoded.drop(columns='deposit')

# Step 17: Train/Test split (same parameters as the main split in Step 7)
X_train_all, X_test_all, y_train_s, y_test_s = train_test_split(
    all_features, y, test_size=0.2, random_state=42, stratify=y
)

# Step 15: Find correlation with target (training rows only)
# Cast to float so indicator columns are treated as numeric regardless of
# whether get_dummies produced bool or integer dtypes.
correlation = (
    X_train_all.astype(float)
    .corrwith(y_train_s)
    .abs()
    .sort_values(ascending=False)
)
most_correlated_feature = correlation.idxmax()
print(f"Most correlated feature: {most_correlated_feature} (Correlation: {correlation.max():.4f})")

# Step 16: Prepare single-feature dataset
# Double brackets keep each selection a one-column DataFrame, as the estimator
# expects 2-D input.
X_single = data_encoded[[most_correlated_feature]]
X_train_s = X_train_all[[most_correlated_feature]]
X_test_s = X_test_all[[most_correlated_feature]]

# Step 18: Train logistic regression model
single_feature_model = LogisticRegression(max_iter=1000)
single_feature_model.fit(X_train_s, y_train_s)

# Step 19: Evaluate model
y_pred_s = single_feature_model.predict(X_test_s)
accuracy_s = accuracy_score(y_test_s, y_pred_s)

print("\nSingle Feature Model Performance:")
print(f"Feature Used: {most_correlated_feature}")
print(f"Accuracy: {accuracy_s:.4f}")
print(classification_report(y_test_s, y_pred_s))
