In [95]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve,accuracy_score
from sklearn.utils import resample


LR helper functions

In [96]:
dataframe=None

def sigmoid(z):
    z = np.array(z, dtype=float)
    return 1 / (1 + np.exp(-z))

def logisticRegression(X, y, theta, bias, learning_rate, maxIteration, noFeatures):
    # print('hi')
    m=len(X)
    for iteration in range(maxIteration):
        # matrix multiplication
        h = sigmoid(np.dot(X,theta)+bias)
        gradient = np.dot(X.T,(h - y))/m
        db=np.sum(h-y)/m
        theta -= learning_rate * gradient
        bias -= learning_rate * db
    return theta,bias

def predict(X, theta,bias):
    probabilities = sigmoid(np.dot(X,theta)+bias)
    predictions = [1 if prob >= 0.5 else 0 for prob in probabilities]
    return np.array(predictions)

def normalize(X):
    return (X - X.mean()) / X.std()



Preprocessing helpers

In [97]:
def scalingFunction(scaling='standard'):
    if scaling == 'standard':
        scaler = StandardScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    
    return scaler

def preprocessing(target_col_name):
    # drop null and fill null
    dataframe.dropna(subset=[target_col_name], inplace=True)
    dataframe.fillna(dataframe.mean(numeric_only=True),inplace=True)
    # fill null for non-numeric columns
    non_numerical_columns = dataframe.select_dtypes(include=['object']).columns
    for column in non_numerical_columns:
        mode_value = dataframe[column].mode()[0]
        dataframe[column].fillna(mode_value,inplace=True)
    # drop duplicates
    dataframe.drop_duplicates(inplace=True)
    # feature and target
    features=dataframe.drop(target_col_name,axis=1)
    target=dataframe[target_col_name]
    # label encoding the target
    encoder=LabelEncoder()
    target=encoder.fit_transform(target)
    # categorization and one-hot encoding
    categorical_columns=features.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        features[col]=features[col].astype('category')
    features=pd.get_dummies(features,columns=categorical_columns)
    # scaling
    candidate_columns=features.select_dtypes(exclude=['bool']).columns
    # scaler=scalingFunction('standard')
    scaler=scalingFunction('minmax')
    features_scaled=features.copy()
    features_scaled[candidate_columns]=scaler.fit_transform(features[candidate_columns])
    return features_scaled,target


Train with LR

In [98]:
def split(features,target):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    return X_train,X_test,X_val,y_train,y_test,y_val

In [99]:
def output(X_test,y_test,theta_final,bias_final):
    X_test_np = X_test.to_numpy()
    y_test_np = y_test.to_numpy().flatten()

    predictions = predict(X_test_np, theta_final,bias_final)
    accuracy = np.mean(predictions == y_test_np)
    print(f"Accuracy: {accuracy}")
    sensitivity = recall_score(y_test, predictions)
    print("Sensitivity/Recall: ", sensitivity)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    specificity = tn / (tn + fp)
    print("Specificity: ", specificity)
    precision = precision_score(y_test, predictions)
    print("Precision: ", precision)
    f1 = f1_score(y_test, predictions)
    print("F1 Score: ", f1)
    y_prob = sigmoid(np.dot(X_test_np,theta_final))
    auroc = roc_auc_score(y_test, y_prob)
    print("AUROC: ", auroc)
    precision_values, recall_values, _ = precision_recall_curve(y_test, y_prob)
    aupr = auc(recall_values, precision_values)
    print("AUPR: ", aupr)

In [100]:
def train(features_scaled,target,target_col_name):
    # transform to dataframe
    features_df=pd.DataFrame(features_scaled,columns=features_scaled.columns)
    target_df=pd.DataFrame(target,columns=[target_col_name])
    features_df_normalized=normalize(features_df)

    X_train, X_test, X_val,y_train, y_test, y_val = split(features_df_normalized, target_df)
    # to numpy array
    X = X_train.to_numpy()
    y = y_train.to_numpy().reshape(-1,1)
    maxIteration=1000
    # noFeatures=features_df.shape[1]
    noFeatures=features_df_normalized.shape[1]
    # I was stuck here....
    theta = np.zeros((noFeatures,1))
    # print(theta.shape)
    bias=0
    learning_rate = 0.01

    theta_final,bias_final= logisticRegression(X, y, theta,bias, learning_rate, maxIteration, noFeatures)
    output(X_test,y_test,theta_final,bias_final)
    

In [101]:
def stacking(X_val,X_test,y_val,y_test,theta_list,bias_list,n_base_learners,learning_rate,maxIteration):
    meta_features = np.zeros((X_val.shape[0], n_base_learners))
    for i in range(n_base_learners):
        # Make predictions on the validation set using each base learner
        meta_features[:, i] = predict(X_val.to_numpy(), theta_list[i], bias_list[i])
    print(meta_features.shape)
    # Step 3: Train a meta-model (another Logistic Regression) using the meta-features
    theta_meta = np.zeros((n_base_learners, 1))  # Initialize theta for meta-model (number of base learners as features)
    bias_meta = 0

    # Train the meta-model on meta-features and validation labels
    theta_meta_final, bias_meta_final = logisticRegression(meta_features, y_val.to_numpy(), theta_meta, bias_meta, learning_rate, maxIteration, n_base_learners)

    # Step 5: Make final predictions on the test set using the base learners and the meta-model
    # Generate meta-features from the test set
    meta_features_test = np.zeros((X_test.shape[0], n_base_learners))

    for i in range(n_base_learners):
        # Use each base learner to predict on the test set
        meta_features_test[:, i] = predict(X_test.to_numpy(), theta_list[i], bias_list[i])

    # Make final predictions using the meta-model
    stacking_predictions = predict(meta_features_test, theta_meta_final, bias_meta_final)

    # Step 6: Evaluate the performance of the stacking ensemble
    stacking_accuracy = np.mean(stacking_predictions == y_test.to_numpy())

    print(f"Stacking Ensemble Accuracy: {stacking_accuracy}")

In [102]:
def bagging(features_scaled,target,target_col_name):
    features_df=pd.DataFrame(features_scaled,columns=features_scaled.columns)
    target_df=pd.DataFrame(target,columns=[target_col_name])
    features_df_normalized=normalize(features_df)

    X_train, X_test, X_val,y_train, y_test, y_val = split(features_df_normalized, target_df)
    theta_list = []
    bias_list = []
    n_base_learners = 9
    maxIteration = 1000 
    learning_rate = 0.01
    noFeatures = X_train.shape[1]
    for i in range(n_base_learners):
        X_bootstrap, y_bootstrap = resample(X_train, y_train, random_state=i)
        X = X_bootstrap.to_numpy()
        y = y_bootstrap.to_numpy().reshape(-1,1)
        # Initialize theta and bias for the current model
        theta = np.zeros((noFeatures, 1))
        bias = 0

        # Train Logistic Regression on the bootstrap sample
        theta_final, bias_final = logisticRegression(X, y, theta, bias, learning_rate, maxIteration, noFeatures)
        
        # Store the parameters (theta and bias) of the trained base learner
        theta_list.append(theta_final)
        bias_list.append(bias_final)
        output(X_test,y_test,theta_final,bias_final)
    stacking(X_val,X_test,y_val,y_test,theta_list,bias_list,n_base_learners,learning_rate,maxIteration)
    


In [103]:
# model = LogisticRegression(max_iter=maxIteration)
    # model.fit(X_train, y_train)
    # y_pred = model.predict(X_test_np)
    # print(accuracy_score(y_test,y_pred))

Read datasets

In [104]:
def read_2():
    global dataframe
    column_file = 'adult/adult.names'
    data_file = 'adult/adult.data'
    columns = []
    with open(column_file, 'r') as f:
        for line in f:
            if '|' not in line:  # Ignore lines starting with '|'
                if ':' in line:
                    # Extract the column name before the ':' punctuation mark
                    col_name = line.split(':')[0].strip()
                    columns.append(col_name)
    columns.append('income-exceeds')
    dataframe = pd.read_csv(data_file, header=None)
    # print(dataframe)
    dataframe.columns = columns
    # print(dataframe)
    dataframe.replace(' ?', np.nan, inplace=True)

def read(input):
    global dataframe
    target_col_name=None
    # file 1
    if input == 1:
        dataframe = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
        target_col_name='Churn'
    # file 2
    elif input == 2:
        read_2()
        target_col_name='income-exceeds'
    # file 3
    elif input == 3:
        dataframe = pd.read_csv('creditcard.csv')
        target_col_name='Class'
    else:
        print("Invalid input")
    feature,target=preprocessing(target_col_name)
    # train(feature,target,target_col_name)
    bagging(feature,target,target_col_name)


Input

In [105]:
user_input = int(input("Enter 1, 2 or 3: "))
read(user_input)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(mode_value,inplace=True)


Accuracy: 0.8409649661954518
Sensitivity/Recall:  0.6412975670617592
Specificity:  0.9062181447502549
Precision:  0.6908602150537635
F1 Score:  0.6651569071497897
AUROC:  0.8948738444672102
AUPR:  0.7419782770899328
Accuracy: 0.8401966810079902
Sensitivity/Recall:  0.6188396756082346
Specificity:  0.9125382262996942
Precision:  0.6980999296270233
F1 Score:  0.656084656084656
AUROC:  0.8937780397737933
AUPR:  0.7386554270721246
Accuracy: 0.8380454824830977
Sensitivity/Recall:  0.6138490330630069
Specificity:  0.9113149847094801
Precision:  0.693446088794926
F1 Score:  0.6512243547319656
AUROC:  0.8949412512090289
AUPR:  0.7414784590188622
Accuracy: 0.8394283958205285
Sensitivity/Recall:  0.620711166562695
Specificity:  0.910907237512742
Precision:  0.6948324022346368
F1 Score:  0.6556836902800659
AUROC:  0.8947783303858782
AUPR:  0.7419707488490423
Accuracy: 0.8388137676705593
Sensitivity/Recall:  0.6350592638802246
Specificity:  0.9054026503567788
Precision:  0.6869095816464238
F1 Scor