In [129]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve,accuracy_score


Helper functions for the from-scratch logistic regression, followed by data reading and preprocessing

In [130]:
# Module-level holder for the dataset currently being processed.
# read() and read_2() rebind it via `global dataframe`; preprocessing() reads it.
dataframe = None

def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)), computed without overflow.

    Args:
        z: Scalar or array-like of values; converted to a float numpy array.

    Returns:
        A numpy float for scalar input, otherwise an ndarray with the same
        shape as ``z`` holding values in [0, 1].
    """
    z = np.array(z, dtype=float)
    # exp(-|z|) is always in (0, 1], so neither branch below can overflow,
    # unlike the naive exp(-z) which blows up for large negative z.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where returns a 0-d array for scalar input; [()] unwraps it to a
    # numpy scalar and is a no-op view for real arrays.
    return result[()]

def logisticRegression(X, y, theta, bias, learning_rate, maxIteration, noFeatures):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        X: Feature matrix with one row per sample.
        y: Labels, one per sample; a flat vector or a column vector is accepted.
        theta: Initial weights. Not modified; a float copy is updated instead.
        bias: Initial intercept (scalar).
        learning_rate: Step size applied to every gradient update.
        maxIteration: Number of gradient-descent iterations to run.
        noFeatures: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(theta, bias)`` with the fitted weights and intercept.
    """
    m = len(X)
    # Copy as float so the caller's initial weights are left untouched and an
    # integer-typed theta does not break the in-place float update below.
    theta = np.array(theta, dtype=float)
    # Force y into the shape of the model output. Otherwise a flat y against a
    # column-shaped prediction broadcasts (h - y) to an (m, m) matrix.
    y = np.asarray(y, dtype=float).reshape(np.dot(X, theta).shape)
    for _ in range(maxIteration):
        h = sigmoid(np.dot(X, theta) + bias)
        error = h - y
        gradient = np.dot(X.T, error) / m
        db = np.sum(error) / m
        theta -= learning_rate * gradient
        bias -= learning_rate * db
    return theta, bias

def predict(X, theta, bias, threshold=0.5):
    """Predict 0/1 class labels from fitted logistic-regression parameters.

    Args:
        X: Feature matrix with one row per sample.
        theta: Fitted weights.
        bias: Fitted intercept.
        threshold: Probability at or above which a sample is labelled 1.
            Defaults to 0.5.

    Returns:
        1-D integer numpy array of 0/1 predictions, one per row of ``X``.
    """
    probabilities = np.asarray(sigmoid(np.dot(X, theta) + bias))
    # Vectorised comparison; ravel() flattens a column of probabilities so the
    # result is always one label per sample.
    return (probabilities >= threshold).astype(int).ravel()

def scalingFunction(scaling='standard'):
    """Return a new, unfitted scikit-learn scaler selected by name.

    Args:
        scaling: ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
            ``MinMaxScaler``.

    Returns:
        The requested scaler instance.

    Raises:
        ValueError: If ``scaling`` is not one of the supported names.
    """
    if scaling == 'standard':
        return StandardScaler()
    if scaling == 'minmax':
        return MinMaxScaler()
    # Fail with a clear message instead of an UnboundLocalError on `scaler`.
    raise ValueError(
        f"Unknown scaling {scaling!r}; expected 'standard' or 'minmax'"
    )

def normalize(X):
    """Standardise ``X`` as ``(X - X.mean()) / X.std()``.

    The statistics are whatever ``X.mean()`` and ``X.std()`` return for the
    given object (for example per-column values for a pandas DataFrame, or a
    single value over all elements for a numpy array).

    A standard deviation of exactly zero is treated as 1, so constant data is
    mapped to 0 instead of NaN from a 0/0 division.

    Args:
        X: Object exposing ``mean()`` and ``std()`` and supporting arithmetic.

    Returns:
        The standardised data, same type and shape as ``X``.
    """
    mean = X.mean()
    std = X.std()
    if np.ndim(std) == 0:
        # Single overall statistic.
        if std == 0:
            std = 1.0
    else:
        # One statistic per column: patch only the zero entries.
        std = std.where(std != 0, 1.0)
    return (X - mean) / std



In [None]:
def preprocessing(target_col_name):
    """Clean the global ``dataframe``, train the from-scratch model and print test metrics.

    Pipeline: drop rows with a missing target, impute remaining missing values
    (column mean for numeric columns, mode for object columns), drop duplicate
    rows, label-encode the target, one-hot encode object-typed features,
    min-max scale the non-boolean columns, standardise every column with
    ``normalize``, split 64/16/20 into train/validation/test, fit
    ``logisticRegression`` on the training part and print accuracy, recall,
    specificity, precision, F1, AUROC and AUPR on the test part. A
    scikit-learn ``LogisticRegression`` is then fitted on the same training
    data and its test accuracy is printed last as a reference.

    Args:
        target_col_name: Name of the label column in the global ``dataframe``.
            The metrics used here presumably require a two-class label.

    Side effects:
        Mutates the module-level ``dataframe`` in place (rows dropped, missing
        values filled) and prints the results. Nothing is returned.
    """
    # ---- cleaning ----
    dataframe.dropna(subset=[target_col_name], inplace=True)
    dataframe.fillna(dataframe.mean(numeric_only=True), inplace=True)
    non_numerical_columns = dataframe.select_dtypes(include=['object']).columns
    for column in non_numerical_columns:
        mode_value = dataframe[column].mode()[0]
        # Assign the result back: `dataframe[column].fillna(..., inplace=True)`
        # operates on an intermediate object and stops working in pandas 3.0.
        dataframe[column] = dataframe[column].fillna(mode_value)
    dataframe.drop_duplicates(inplace=True)

    # ---- features / target ----
    features = dataframe.drop(target_col_name, axis=1)
    target = LabelEncoder().fit_transform(dataframe[target_col_name])

    categorical_columns = features.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        features[col] = features[col].astype('category')
    features = pd.get_dummies(features, columns=categorical_columns)

    # ---- scaling ----
    # NOTE(review): the scaler and normalize() are fitted on the full data set
    # before the split, so test-set statistics leak into training. Kept as-is
    # so the reported numbers stay comparable; consider fitting on train only.
    candidate_columns = features.select_dtypes(exclude=['bool']).columns
    scaler = scalingFunction('minmax')
    features_scaled = features.copy()
    if len(candidate_columns) > 0:
        features_scaled[candidate_columns] = scaler.fit_transform(features[candidate_columns])
    target_df = pd.DataFrame(target, columns=[target_col_name])
    features_df_normalized = normalize(features_scaled)

    # ---- split: 20% test, then 20% of the remainder as validation ----
    X_train, X_test, y_train, y_test = train_test_split(
        features_df_normalized, target_df, test_size=0.2, random_state=42)
    # The validation part is split off but not used further in this function.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    # ---- train the from-scratch model ----
    X = X_train.to_numpy(dtype=float)
    y = y_train.to_numpy().reshape(-1, 1)
    maxIteration = 1000
    noFeatures = X.shape[1]
    theta = np.zeros((noFeatures, 1))
    bias = 0
    learning_rate = 0.01
    theta_final, bias_final = logisticRegression(
        X, y, theta, bias, learning_rate, maxIteration, noFeatures)

    # ---- evaluate on the test split ----
    X_test_np = X_test.to_numpy(dtype=float)
    y_test_np = y_test.to_numpy().ravel()
    predictions = predict(X_test_np, theta_final, bias_final)
    print(predictions)
    print(y_test_np)
    accuracy = np.mean(predictions == y_test_np)
    print(f"Accuracy: {accuracy}")
    sensitivity = recall_score(y_test_np, predictions)
    print("Sensitivity/Recall: ", sensitivity)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # the test split, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test_np, predictions, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
    print("Specificity: ", specificity)
    precision = precision_score(y_test_np, predictions)
    print("Precision: ", precision)
    f1 = f1_score(y_test_np, predictions)
    print("F1 Score: ", f1)
    # Include the bias so these probabilities match the ones predict() thresholds.
    y_prob = sigmoid(np.dot(X_test_np, theta_final) + bias_final).ravel()
    auroc = roc_auc_score(y_test_np, y_prob)
    print("AUROC: ", auroc)
    precision_values, recall_values, _ = precision_recall_curve(y_test_np, y_prob)
    aupr = auc(recall_values, precision_values)
    print("AUPR: ", aupr)

    # ---- scikit-learn reference model ----
    model = LogisticRegression(max_iter=maxIteration)
    # Fit on the same numpy arrays used for prediction, with a flat y, to avoid
    # the column-vector and feature-name warnings.
    model.fit(X, y.ravel())
    y_pred = model.predict(X_test_np)
    print(accuracy_score(y_test_np, y_pred))

In [131]:
def read_2():
    """Load the Adult data set into the global ``dataframe``.

    Column names are taken from ``adult/adult.names``: every line that
    contains a ':' and does not contain a '|' contributes the text before its
    first ':' (stripped). The label column ``'income-exceeds'`` is appended
    last. The data file is read without a header row, the names are applied,
    and the ``' ?'`` placeholder is replaced by NaN.
    """
    global dataframe
    names_path = 'adult/adult.names'
    data_path = 'adult/adult.data'

    with open(names_path, 'r') as handle:
        header = [
            row.split(':')[0].strip()
            for row in handle
            if '|' not in row and ':' in row
        ]
    header.append('income-exceeds')

    dataframe = pd.read_csv(data_path, header=None)
    dataframe.columns = header
    dataframe.replace(' ?', np.nan, inplace=True)

def read(input):
    """Load the data set selected by ``input`` and run ``preprocessing`` on it.

    Args:
        input: 1 for the Telco churn CSV, 2 for the Adult data set (loaded via
            ``read_2``), 3 for the credit-card CSV. Any other value prints
            "Invalid input" and does nothing else.
    """
    global dataframe
    if input == 1:
        dataframe = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
        target_column = 'Churn'
    elif input == 2:
        read_2()
        target_column = 'income-exceeds'
    elif input == 3:
        dataframe = pd.read_csv('creditcard.csv')
        target_column = 'Class'
    else:
        print("Invalid input")
        return
    # Every valid choice ends the same way: run the pipeline on its label column.
    preprocessing(target_column)



In [132]:
# Ask which data set to run. A non-numeric answer is passed on as None so that
# read() reports "Invalid input" instead of the cell failing with a ValueError.
raw_choice = input("Enter 1, 2 or 3: ")
try:
    user_input = int(raw_choice)
except ValueError:
    user_input = None
read(user_input)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(mode_value,inplace=True)


[0 0 0 ... 0 0 1]
[0 0 0 ... 0 0 1]
Accuracy: 0.83819913952059
Sensitivity/Recall:  0.6238303181534622
Specificity:  0.908256880733945
Precision:  0.6896551724137931
F1 Score:  0.6550933508024893
AUROC:  0.895106842865346
AUPR:  0.7426320747552857


  y = column_or_1d(y, warn=True)


0.8538721573448064


