In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## <b>Step 1: Load the Telco data</b>

In [2]:
# Load the churn dataset
def Load_telco():
    data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    pd.set_option('display.max_columns', None)
    # Preview the data
    # data.head(10)
    data.drop('customerID', axis=1, inplace=True) # Drop the customerID column
    Features = data.drop('Churn', axis=1) # Features
    Labels = data['Churn'] # Labels
    map_Labels = {'Yes': 1, 'No': 0} # Map the labels to 0 and 1
    Labels = [map_Labels[i] for i in Labels] # Replace the labels with 0 and 1
    Labels = pd.Series(Labels) # Convert the labels to a pandas series
    Features['TotalCharges'] = pd.to_numeric(Features['TotalCharges'], errors='coerce') # Convert TotalCharges to numeric
    Features['MultipleLines'] = Features['MultipleLines'].replace('No phone service', 'No') # Replace 'No phone service' with 'No'
    # Features.isnull().sum() # Count the number of missing values in each column
    Features.fillna(Features.mean(numeric_only=True), inplace=True) # Fill the missing values with the mean of the column
    # Features
    categorical_columns = Features.select_dtypes(include=['object']).columns # Select the categorical columns
    for column in categorical_columns:
        Features[column] = Features[column].astype('category')
    Features = pd.get_dummies(Features) # One-hot encode the categorical columns
    scaler = MinMaxScaler() # Create a MinMaxScaler object
    Features = pd.DataFrame(scaler.fit_transform(Features), columns=Features.columns) # Normalize the features
    Labels_array = Labels.to_numpy() # Convert the labels to a numpy array
    return Features, Labels_array

## <b>Step 1: Load the Adult data</b>

In [3]:
# Load the adult.data file
def Load_adult():
    train = pd.read_csv('adult/adult.data', header=None, skipinitialspace=True)
    test = pd.read_csv('adult/adult.test', header=None, skipinitialspace=True, skiprows=1)
    print(train.shape)
    print(test.shape)
    # un = test['income'].unique() 
    # print(un)
    data = pd.concat([train, test])
    data.columns =  ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                        'marital-status', 'occupation', 'relationship', 'race',
                        'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                        'native-country', 'income']
    # for column in data.columns:
    #     if data[column].dtype == 'object':
    #         print (column)
    #         print(data[column].unique()) # Print the unique values in the column.
    #         There are missing values in the dataset. The missing values are represented by a question mark '?' in workclass, occupation and native-country columns.
    data = data.replace('?', np.nan) # Replace '?' with np.nan
    data = data.apply(lambda x: x.str.rstrip('.') if x.dtype == "object" else x) # Remove the '.' at the end of the values in the columns
    # data
    Features = data.drop('income', axis=1) # Features
    Labels = data['income'] # Labels
    map_Labels = {'>50K': 1, '<=50K': 0} # Map the labels to 0 and 1
    Labels = [map_Labels[i] for i in Labels] # Replace the labels with 0 and 1
    # Features.isnull().sum() # Count the number of missing values in each column
    Features.fillna(Features.mode().iloc[0], inplace=True) # Fill the missing values with the mode of the column
    # Features.isnull().sum() # Count the number of missing values in each column
    categorical_columns = Features.select_dtypes(include=['object']).columns # Select the categorical columns
    for column in categorical_columns:
        Features[column] = Features[column].astype('category')
    Features = pd.get_dummies(Features) # One-hot encode the categorical columns
    scaler = MinMaxScaler() # Create a MinMaxScaler object
    Features = pd.DataFrame(scaler.fit_transform(Features), columns=Features.columns) # Normalize the features  
    # Features
    Labels_array = np.array(Labels) # Convert the labels to a numpy array
    # Labels_array
    return Features, Labels_array

## <b>Step 1: Load the CreditCard data</b>

In [4]:
# Load the creditcard.csv file
def Load_creditcard():
    data = pd.read_csv('creditcard.csv')
    # data['Class'].value_counts()
    # data
    pd.set_option('display.max_columns', None)
    frauds = data[data['Class'] == 1]
    not_frauds = data[data['Class'] == 0]
    not_frauds = not_frauds.sample(n=20000, random_state=42)
    data = pd.concat([frauds, not_frauds])
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    Features = data.drop('Class', axis=1) # Features
    Labels = data['Class'] # Labels
    scaler = MinMaxScaler() # Create a MinMaxScaler object
    Features = pd.DataFrame(scaler.fit_transform(Features), columns=Features.columns) # Normalize the features
    Labels_array = np.array(Labels) # Convert the labels to a numpy array
    return Features, Labels_array
    # Features

## <b>Logistic Regression Class</b>

In [6]:
class LogisticRegression:
    
    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))
    
    @staticmethod
    def initialize_params(n_features):
        weights = np.zeros(n_features) # Initialize weights to zeros n_features is the number of features
        bias = 0
        return weights, bias # rerturn the initial weights and bias
    
    @staticmethod
    def compute_loss(y, y_pred):
        n_samples = y.shape[0]
        loss = -(1 / n_samples) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) # Compute the loss
        return loss
    
    @staticmethod
    def gradient_descent(X, y, weights, bias, learning_rate, n_iters):
        n_samples = X.shape[0]
        
        for i in range(n_iters):
            # Linear model: z = X * weights + bias
            z = np.dot(X, weights) + bias
            y_pred = LogisticRegression.sigmoid(z)
            
            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            # print("iteration:", i) # Print the iteration    
            # print(LogisticRegression.compute_loss(y, y_pred)) # Print the loss
            # Update weights and bias
            weights -= learning_rate * dw
            bias -= learning_rate * db
            
        return weights, bias
    
    @staticmethod
    def fit(X, y, learning_rate=0.001, n_iters=1000):
        n_features = X.shape[1] # Taking only the number of columns
        
        # Initialize weights and bias
        weights, bias = LogisticRegression.initialize_params(n_features)
        
        # Perform gradient descent
        weights, bias = LogisticRegression.gradient_descent(X, y, weights, bias, learning_rate, n_iters)
        
        return weights, bias
    
    @staticmethod
    def predict(X, weights, bias):
        # Linear model and applying sigmoid to get probabilities
        z = np.dot(X, weights) + bias
        y_pred = LogisticRegression.sigmoid(z)
        
        # Convert probabilities to binary class (0 or 1)
        return [1 if i > 0.5 else 0 for i in y_pred]



## <b> Splitting the dataset into Train, Validation & Test Dataset </b>
Here for 3 datasets, we have to comment the 2 datasets not needed and uncomment the target dataset in 2,3 & 4 number lines

In [7]:
from sklearn.model_selection import train_test_split

# Features, Labels_array = Load_telco()
Features, Labels_array = Load_adult()
# Features, Labels_array = Load_creditcard()

# Split the data into training+validation and testing sets
X_tr_val, X_test, y_tr_val, y_test = train_test_split(Features, Labels_array, test_size=0.2, random_state=42)

# X_tr_val = Features[:32561]
# X_test = Features[32561:]
# y_tr_val = Labels_array[:32561]
# y_test = Labels_array[Labels_array[32561:]]
# Split the training+validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_tr_val, y_tr_val, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

# Training the model
# weights, bias = LogisticRegression.fit(X_train, y_train, learning_rate=0.95, n_iters=1000) # lr = 0.95 (creditcard), lr = 0.8 (adult), lr = 0.5 (telco)

# # Predicting
# predictions = LogisticRegression.predict(X_test, weights, bias)

# print("Predictions:", predictions)
# # Evaluate the model
# accuracy = np.mean(predictions == y_test)
# print(f"Accuracy: {accuracy * 100:.2f}%")
# print(np.unique(y_test))

(32561, 15)
(16281, 15)
(31258, 105) (31258,)
(7815, 105) (7815,)
(9769, 105) (9769,)


## <b> Bagging </b>

In [8]:
from sklearn.utils import resample

def bagging(X_train, y_train):
    logistic_models = []
    for i in range(9):
        X_train_resampled, y_train_resampled = resample(X_train, y_train, replace=True, random_state=i)
        # print(X_train_resampled.shape, y_train_resampled.shape)
        weights, bias = LogisticRegression.fit(X_train_resampled, y_train_resampled, learning_rate=0.5, n_iters=1000)
        logistic_models.append((weights, bias))
        # print("Model", i, "trained weights and bias", weights)
        # print("Model", i, "trained bias", bias)
    return logistic_models

def create_prediction_matrix(logistic_models, X_val):
    n_models = len(logistic_models)
    n_samples = X_val.shape[0]
    
    # Initialize an empty matrix to hold predictions
    prediction_matrix = np.zeros((n_models, n_samples))
    
    # Fill the matrix with predictions from each model
    for i, (weights, bias) in enumerate(logistic_models):
        # Get predictions from the ith model and assign it to the ith row
        predictions = LogisticRegression.predict(X_val, weights, bias)
        prediction_matrix[i, :] = predictions
    
    return prediction_matrix

logistic_models = bagging(X_train, y_train)
# prediction_matrix_df = pd.DataFrame(prediction_matrix)
# print(prediction_matrix_df)




## <b> testing all the 9 models on test set </b>

In [9]:
from sklearn.metrics import confusion_matrix as cm, roc_auc_score, precision_recall_curve, auc
accuracy = []
senstivity = []
specificity = []
precision = []
f1_score = []
auroc = []
aupr = []
for weights,bias in logistic_models:
    predictions = LogisticRegression.predict(X_test, weights, bias)
    accuracy.append(np.mean(predictions == y_test))
    tn, fp, fn, tp = cm(y_test, predictions).ravel()
    senstivity.append(tp/(tp+fn))
    specificity.append(tn/(tn+fp))
    precision.append(tp/(tp+fp))
    # print(precision, senstivity)
    f1_score.append(2*tp/(2*tp+fp+fn))
    # auroc.append(roc_auc_score(y_test, predictions))
    precision_, recall_, x = precision_recall_curve(y_test, predictions)
    # print(precision_, recall_)
    aupr.append(auc(recall_, precision_))

## <b> Violin Plots </b>

In [10]:
metrics = pd.DataFrame({
    'Accuracy': accuracy,
    'Precision': precision,
    'Sensitivity': senstivity,
    'Specificity': specificity,
    'F1 Score': f1_score,
    'AUROC': auroc,
    'AUPR': aupr
})

import seaborn as sns
import matplotlib.pyplot as plt

# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Create a figure
plt.figure(figsize=(12, 8))

# Melt the DataFrame to long format
metrics_melted = metrics.melt(var_name='Metric', value_name='Score')

# Define a color palette for the metrics
color_palette = {
    'Accuracy': 'skyblue',
    'Precision': 'lightgreen',
    'Sensitivity': 'salmon',
    'Specificity': 'red',
    'F1 Score': 'gold',
    'AUROC': 'violet',
    'AUPR': 'deepskyblue'
}

# Draw the violin plot with the specified colors
sns.violinplot(x='Metric', y='Score', data=metrics_melted, palette=color_palette)

# Set plot title and labels
plt.title('Performance Metrics for Bagging LR Learners', fontsize=16)
plt.ylabel('Score', fontsize=14)
plt.xlabel('Metrics', fontsize=14)

# Show the plot
plt.xticks(rotation=45)
plt.show()

ValueError: All arrays must be of the same length

## <b> Taking the average and std deviation </b>

In [11]:
# print(f"Accuracy: {np.mean(accuracy)} ± {np.std(accuracy)}")
# print(f"Senstivity: {np.mean(senstivity)} ± {np.std(senstivity)}")
# print(f"Specificity: {np.mean(specificity)} ± {np.std(specificity)}")
# print(f"Precision: {np.mean(precision)} ± {np.std(precision)}")
# print(f"F1 Score: {np.mean(f1_score)} ± {np.std(f1_score)}")
# print(f"AUROC: {np.mean(auroc)} ± {np.std(auroc)}")
# print(f"AUPR: {np.mean(aupr)} ± {np.std(aupr)}")

accuracy = str(np.mean(accuracy)) + ' ± ' + str(np.std(accuracy))
senstivity = str(np.mean(senstivity)) + ' ± ' + str(np.std(senstivity))
specificity = str(np.mean(specificity)) + ' ± ' + str(np.std(specificity))
precision = str(np.mean(precision)) + ' ± ' + str(np.std(precision))
f1_score = str(np.mean(f1_score)) + ' ± ' + str(np.std(f1_score))
auroc = str(np.mean(auroc)) + ' ± ' + str(np.std(auroc))
aupr = str(np.mean(aupr)) + ' ± ' + str(np.std(aupr))

lrdata = {
    '': ['LR'],
    'Accuracy': [accuracy],
    'Sensitivity': [senstivity],
    'Specificity': [specificity],
     'Precision': [precision],
    'F1 Score': [f1_score],
    'AUROC': [auroc],
    'AUPR': [aupr]
}

lrdata = pd.DataFrame(lrdata)
# print(lrdata)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [12]:
from scipy import stats

def majority_voting(logistic_models, X_test):
    prediction_matrix = create_prediction_matrix(logistic_models, X_test)
    prediction_matrix = prediction_matrix.T
    majority_voting_predictions = stats.mode(prediction_matrix, axis=1)[0].flatten()
    pm = pd.DataFrame(prediction_matrix)
    mv = pd.DataFrame(majority_voting_predictions)
    dt = pd.concat([pm, mv], axis=1)
    # print(dt.head(20))
    return majority_voting_predictions


## <b> Majority Voting </b>

In [13]:
from sklearn.metrics import confusion_matrix as cm, f1_score, roc_auc_score, precision_recall_curve, auc

y_pred = majority_voting(logistic_models, X_test)
# ydf = pd.DataFrame(y_pred)
# ydf.columns = ['Class']
# yt = pd.DataFrame(y_test)
# yt.columns = ['Class_test']
# dt = pd.concat([ydf, yt], axis=1)
# print(dt)
accuracy = np.mean(y_pred == y_test)
tn, fp, fn, tp = cm(y_test, y_pred).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
specificity = tn / (tn+fp)
f1_score = f1_score(y_test, y_pred)
roc_auc_score = roc_auc_score(y_test, y_pred)
precision_, recall_, x_ = precision_recall_curve(y_test, y_pred)
aupr_voting = auc(recall_, precision_)

# print("Output of Voting Ensemble :")
data = {
    '': ['Voting ensemble'],
    'Accuracy' : [accuracy], 
    'Sensitivity' : [recall],
    'Specificity' : [specificity],
    'Precision' : [precision],
    'F1 Score' : [f1_score],
    'AUROC' : [roc_auc_score], 
    'AUPR' : [aupr_voting]
}

# Convert to a DataFrame
data = pd.DataFrame(data)
# metrics_df = metrics_df.style.set_table_styles(
#     [{'selector': 'table', 'props': [('border', '1px solid black')]},
#      {'selector': 'th', 'props': [('border', '1px solid black')]},
#      {'selector': 'td', 'props': [('border', '1px solid black')]}]
# )
# # Display the table
# metrics_df

## <b> Stacking </b>

In [14]:
def train_meta_classifier(y_val, prediction_matrix):
    prediction_matrix = np.concatenate((X_val,prediction_matrix.T),axis=1)
    prediction_matrix_df = pd.DataFrame(prediction_matrix)
    # print(prediction_matrix_df)
    weights, bias = LogisticRegression.fit(prediction_matrix, y_val, learning_rate=0.5, n_iters=1000)
    return weights, bias

prediction_matrix = create_prediction_matrix(logistic_models, X_val)
weights, bias = train_meta_classifier(y_val, prediction_matrix)
# print("Meta classifier trained weights and bias", weights, bias)

In [15]:
def test_meta_classifier(logistic_models, weights, bias, X_test):
    prediction_matrix = create_prediction_matrix(logistic_models, X_test)
    prediction_matrix = np.concatenate((X_test,prediction_matrix.T),axis=1) 
    y_pred = LogisticRegression.predict(prediction_matrix, weights, bias)
    return y_pred


In [16]:
from sklearn.metrics import confusion_matrix as cm , f1_score, roc_auc_score, precision_recall_curve, auc

y_pred = test_meta_classifier(logistic_models, weights, bias, X_test)
accuracy = np.mean(y_pred == y_test)
tn, fp, fn, tp = cm(y_test, y_pred).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
specificity = tn / (tn+fp)
f1_score = f1_score(y_test, y_pred)
roc_auc_score = roc_auc_score(y_test, y_pred)
precision_, recall_, x_ = precision_recall_curve(y_test, y_pred)
aupr_stack = auc(recall_, precision_)
stdata = {
    '': ['Stacking ensemble'],
    'Accuracy' : [accuracy], 
    'Sensitivity' : [recall],
    'Specificity' : [specificity],
    'Precision' : [precision],
    'F1 Score' : [f1_score],
    'AUROC' : [roc_auc_score], 
    'AUPR' : [aupr_stack]
}

stdata = pd.DataFrame(stdata)

combined_df = pd.concat([ lrdata, data, stdata], ignore_index=True)
# Convert to a DataFrame
metrics_df = pd.DataFrame(combined_df)
metrics_df = metrics_df.style.set_table_styles(
    [{'selector': 'table', 'props': [('border', '1px solid black')]},
     {'selector': 'th', 'props': [('border', '1px solid black')]},
     {'selector': 'td', 'props': [('border', '1px solid black')]}]
)
# Display the table
metrics_df

Unnamed: 0,Unnamed: 1,Accuracy,Sensitivity,Specificity,Precision,F1 Score,AUROC,AUPR
0,LR,0.8338053479828482 ± 0.0009628215072652968,0.5111582920500118 ± 0.0071637384870898045,0.9362917003866558 ± 0.0018406159059849558,0.7182449000656574 ± 0.0037950714417612176,0.5972164896409535 ± 0.004345605749975847,nan ± nan,0.673623810318364 ± 0.002367444058341465
1,Voting ensemble,0.833965,0.508705,0.937281,0.720385,0.596317,0.722993,0.673763
2,Stacking ensemble,0.834067,0.515499,0.935258,0.716647,0.599654,0.725378,0.674472
