In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline

# Load the data
# The CSV is read with ';' as the field separator.
data = pd.read_csv('bank-full.csv', sep=';')

# Visualizing categorical variables
# Each entry is (column name, panel title); panels fill the 2x3 grid row by
# row, so the order here is the on-screen order.
categorical_panels = [
    ('job', 'Job Distribution'),
    ('marital', 'Marital Status Distribution'),
    ('education', 'Education Level Distribution'),
    ('housing', 'Housing Loan Status'),
    ('loan', 'Personal Loan Status'),
    ('y', 'Subscription Status'),
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))

for ax, (column, title) in zip(axes.flat, categorical_panels):
    sns.countplot(x=column, data=data, ax=ax)
    ax.set_title(title)

# Rotate the 'job' labels through tick_params rather than re-assigning the
# label texts with set_xticklabels(get_xticklabels(), ...): the latter sets
# labels without pinning tick positions, which matplotlib warns about.
axes[0, 0].tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()

# Visualizing continuous variables
# Each entry is (column name, panel title, x-axis label); panels fill the
# 2x2 grid row by row in the order listed.
numeric_panels = (
    ('age', 'Age Distribution', 'Age'),
    ('balance', 'Balance Distribution', 'Balance'),
    ('duration', 'Duration of Call Distribution', 'Duration (seconds)'),
    ('campaign', 'Number of Contacts During Campaign', 'Number of Contacts'),
)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

for panel, (feature, heading, x_label) in zip(axes.ravel(), numeric_panels):
    # 30-bin histogram with a KDE curve overlaid.
    sns.histplot(data[feature], bins=30, kde=True, ax=panel)
    panel.set_title(heading)
    panel.set_xlabel(x_label)

plt.tight_layout()
plt.show()

# Data preprocessing
def _encode_target(answer):
    """Map the target label 'yes' to 1 and every other value to 0."""
    if answer == 'yes':
        return 1
    return 0


data['y'] = data['y'].apply(_encode_target)

# 'y' is numeric at this point, so only the remaining object-dtype columns
# are picked up here. One fitted encoder is kept per column in
# `label_encoders`.
# NOTE(review): LabelEncoder is documented for target labels; on features it
# yields integer codes that the models below will read as ordered values.
# Consider one-hot encoding for nominal columns -- confirm intent.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for column, encoder in label_encoders.items():
    data[column] = encoder.fit_transform(data[column])

# Splitting the data
# The target is the 'y' column; the features are every other column.
y = data['y']
X = data.drop(columns='y')

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardization
# The scaler is fitted on the training rows only and then applied to both
# splits, so the test rows do not influence the fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Modeling and evaluation
# Display name -> unfitted estimator. Every model is trained on the same
# standardized training split and scored on the same test split.
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded with the same value as the train/test split: an unseeded tree
    # permutes features randomly at each split, so its scores would
    # otherwise change from run to run.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC()
}

# One dict per model (name, accuracy, F1), collected for the summary table.
evaluation_metrics = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # f1_score is called with its defaults, i.e. scored for the label 1.
    f1 = f1_score(y_test, y_pred)

    evaluation_metrics.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "F1-Score": f1
    })

    print(f"{model_name} Classification Report:\n")
    print(classification_report(y_test, y_pred))
    print("="*60)

# Convert evaluation metrics to DataFrame
evaluation_df = pd.DataFrame(evaluation_metrics)
print(evaluation_df)

# Plotting the evaluation metrics
# Each entry is (metric column in evaluation_df, panel title); the panels
# are drawn left to right in the order listed.
metric_panels = (
    ('Accuracy', 'Model Accuracy Comparison'),
    ('F1-Score', 'Model F1-Score Comparison'),
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

for panel, (metric, heading) in zip(axes, metric_panels):
    sns.barplot(x='Model', y=metric, data=evaluation_df, ax=panel)
    panel.set_title(heading)
    # Both panels share a fixed 0-1 y-range.
    panel.set_ylim(0, 1)

plt.tight_layout()
plt.show()




: 