In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
data = pd.read_csv("datasets/deposit.csv")

In [None]:
# Quick structural overview: dimensions, row index, column labels and column count.
for overview_item in (data.shape, data.index, data.columns, len(data.columns)):
    print(overview_item)

# Keep the preview as the last expression so the notebook renders it as a table.
data.head()

In [None]:
# Remove the 'contact' and 'duration-sec' columns; no later cell shown here uses them.
data = data.drop(columns=['contact', 'duration-sec'])

In [None]:
# Preview the frame after 'contact' and 'duration-sec' were dropped.
data.head()

In [None]:
# Per-column count of missing values prior to imputation.
print("Missing values before handling:", data.isna().sum(), sep="\n")

In [None]:
# Values used for imputation: the average age and the most frequent
# job / education category (first entry when several modes tie).
mean = data["age"].mean()
mod_job = data["job"].mode().iloc[0]
mod_edu = data["education"].mode().iloc[0]

# Show the chosen fill values, one per line.
for fill_value in (mean, mod_job, mod_edu):
    print(fill_value)

In [None]:
# Impute missing values: "age" gets the column mean, while "job" and
# "education" get their most frequent value.
# The result is assigned back instead of calling
# `data[col].fillna(..., inplace=True)`: that form modifies a column selection
# in place (chained assignment), which recent pandas versions warn about and
# which does not update `data` when Copy-on-Write is enabled.
data = data.fillna({"age": mean, "job": mod_job, "education": mod_edu})

In [None]:
# Per-column count of missing values once imputation has been applied.
print("Missing values after handling:", data.isna().sum(), sep="\n")

>## Label encoding

In [None]:
# Preview the frame before the categorical columns are encoded.
data.head()

In [None]:
# For each categorical column print, in order: its name, the distinct values
# (in order of first appearance) and the value counts (most frequent first).
# Note the two lists are ordered differently, so they do not line up by position.
for column_name in ("job", "marital", "education", "default", "housing", "loan", "deposit"):
    column_values = data[column_name]
    print(column_name)
    print(column_values.unique().tolist())
    print(column_values.value_counts().tolist())

In [None]:
# Single encoder instance reused by the cells below; each fit_transform call
# re-fits it, so it only remembers the mapping of the last column encoded.
label_encoder = LabelEncoder()

In [None]:
# Replace each of these categorical columns with integer codes.
# The shared encoder is re-fitted on every column, in this order.
for encoded_column in ("deposit", "loan", "housing", "default"):
    data[encoded_column] = label_encoder.fit_transform(data[encoded_column])

In [None]:
# Preview the frame after "deposit", "loan", "housing" and "default" were label-encoded.
data.head()

In [None]:
# One-hot encode the multi-valued categorical columns in a single pass.
# The indicator columns are appended after the remaining columns, grouped as
# job_*, marital_*, education_*.
data = pd.get_dummies(data, columns=['job', 'marital', 'education'])

In [None]:
# Preview the frame after 'job', 'marital' and 'education' were one-hot encoded.
data.head()

In [None]:
# Indicator columns that are kept, listed in the order they are encoded.
indicator_columns = [
    "job_admin",
    "job_blue-collar",
    "job_entrepreneur",
    "job_housemaid",
    "job_management",
    "job_retired",
    "job_self-employed",
    "job_services",
    "job_student",
    "job_technician",
    "job_unemployed",
    "marital_married",
    "marital_single",
    "education_primary",
    "education_secondary",
    "education_tertiary",
]

# Convert every kept indicator column to integer codes with the shared encoder.
for indicator_column in indicator_columns:
    data[indicator_column] = label_encoder.fit_transform(data[indicator_column])

# Drop one indicator per one-hot group (presumably used as the reference
# level — confirm the choice of levels against the modelling intent).
data = data.drop(columns=["job_unknown", "marital_divorced", "education_unknown"])

In [None]:
# Preview the fully encoded frame that X and y are built from.
data.head()

In [None]:
# Every column except the target forms the feature matrix.
X = data.drop(columns="deposit")  # Feature matrix
# The encoded "deposit" column is what the model predicts.
y = data["deposit"]  # Target vector

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Standardise all feature columns with StandardScaler.
# Alternative scaler kept for experimentation:
# scaler = MinMaxScaler()
scaler = StandardScaler().fit(X)

# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test-set statistics leak into preprocessing.
# Consider splitting first and fitting the scaler on the training rows only.
# By default the transform returns a NumPy array, so X loses its column names here.
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition.
for caption, partition in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, partition.shape)

In [None]:
import matplotlib.pyplot as plt

# Number of rows in the training and testing partitions, in that order.
sizes = [len(partition) for partition in (X_train, X_test)]
labels = ['Training Set', 'Testing Set']

# Draw the train/test proportions as a pie chart on an explicit Axes.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
ax.set_title('Proportion of Data in Training and Testing Sets')
ax.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle
plt.show()

># Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Train a random forest on the training split (seeded for reproducibility).
# Alternative model kept for experimentation; probability=True is needed
# because the ROC cell requests predict_proba:
# model = SVC(kernel='linear',probability=True)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test rows.
y_pred = model.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 on the test split.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt

# Function to plot ROC curves for each class
def plot_roc_curves_for_classes(model, X, y):
    """Plot one-vs-rest ROC curves from cross-validated class probabilities.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``predict_proba``; it is cloned and re-fitted on
        each of the 5 cross-validation folds by ``cross_val_predict``.
    X : array-like
        Feature matrix passed to ``cross_val_predict``.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    classes = np.unique(y)

    # One indicator column per class, in the same sorted order as the
    # columns returned by predict_proba.
    y_bin = label_binarize(y, classes=classes)
    if len(classes) == 2:
        # For a binary target label_binarize returns a single column (the
        # indicator of classes[1]). Expand it to two columns so column i lines
        # up with the probability column of classes[i]; otherwise class-0
        # scores would be evaluated against class-1 labels.
        y_bin = np.hstack([1 - y_bin, y_bin])

    # Out-of-fold probability estimates for every class.
    y_scores_cv = cross_val_predict(model, X, y, cv=5, method='predict_proba')

    n_classes = y_bin.shape[1]

    # For each class, compute the ROC curve and its AUC.
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_bin[:, i], y_scores_cv[:, i])
        roc_auc = auc(fpr, tpr)
        # x = false positive rate, y = true positive rate, matching the axis labels.
        plt.plot(fpr, tpr, lw=2, label='Class {} (AUC = {:.2f})'.format(classes[i], roc_auc))

    # Plot diagonal line (random classifier)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Set plot parameters
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves for each class')
    plt.legend(loc="lower right")
    plt.show()

# Plot the ROC curves using 5-fold cross-validated probabilities on the
# training split (the model is re-fitted per fold inside the function).
plot_roc_curves_for_classes(model, X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels):
    """Show an annotated heatmap of the confusion matrix.

    Rows are the actual classes and columns the predicted classes, both
    ordered as given in ``labels``; each cell shows an integer count.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    heatmap_ax = sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        xticklabels=labels,
        yticklabels=labels,
    )
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('Actual')
    heatmap_ax.set_title('Confusion Matrix')
    plt.show()

# Plot distribution of actual vs predicted values
def plot_actual_vs_predicted(y_true, y_pred, labels, xlabel='Class label'):
    """Overlay histograms of the actual and the predicted class values.

    Parameters
    ----------
    y_true : array-like
        Actual class values.
    y_pred : array-like
        Predicted class values.
    labels : list
        Class values used as x-axis tick positions.
    xlabel : str, optional
        Text for the x-axis. Defaults to a generic 'Class label'; the previous
        hard-coded text described an unrelated survey question.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    sns.histplot(y_true, label='Actual', kde=True, alpha=0.5)
    sns.histplot(y_pred, label='Predicted', kde=True, alpha=0.5)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.title('Actual vs Predicted')
    plt.legend()
    # Set tick positions and rotation in one call so the rotation applies to
    # the final ticks rather than to ticks that are replaced afterwards.
    plt.xticks(labels, rotation=45)
    plt.show()

# Distinct encoded deposit classes, in order of first appearance in the data.
deposit_labels = data["deposit"].unique().tolist()

# Confusion matrix of the test-set predictions.
plot_confusion_matrix(y_test, y_pred, labels=deposit_labels)

# Distribution of actual vs predicted classes on the test set.
plot_actual_vs_predicted(y_test, y_pred, labels=deposit_labels)