In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve,accuracy_score


Mainly reading and preprocessing

In [4]:
dataframe=None

def sigmoid(z):
    z = np.array(z, dtype=float)
    return 1 / (1 + np.exp(-z))

def logisticRegression(X, y, theta, learning_rate, maxIteration, noFeatures):
    # print('hi')
    m=len(y)
    for iteration in range(maxIteration):
        # matrix multiplication
        h = sigmoid(np.dot(X,theta))
        gradient = np.dot(X.T,(h - y))
        theta = theta - learning_rate * gradient
    return theta

def predict(X, theta):
    probabilities = sigmoid(X @ theta)
    predictions = [1 if prob >= 0.5 else 0 for prob in probabilities]
    return np.array(predictions)

def scalingFunction(scaling='standard'):
    if scaling == 'standard':
        scaler = StandardScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    
    return scaler

def normalize(X):
    # m,n=X.shape
    # for i in range(n):
    #     X=(X-X.mean(axis=0))/X.std(axis=0)
    return (X - X.mean()) / X.std()

def preprocessing(target_col_name):
    # drop null and fill null
    dataframe.dropna(subset=[target_col_name], inplace=True)
    dataframe.fillna(dataframe.mean(numeric_only=True),inplace=True)
    # fill null for non-numeric columns
    non_numerical_columns = dataframe.select_dtypes(include=['object']).columns
    for column in non_numerical_columns:
        mode_value = dataframe[column].mode()[0]
        dataframe[column].fillna(mode_value,inplace=True)
    # drop duplicates
    dataframe.drop_duplicates(inplace=True)
    # feature and target
    features=dataframe.drop(target_col_name,axis=1)
    target=dataframe[target_col_name]
    # label encoding the target
    encoder=LabelEncoder()
    target=encoder.fit_transform(target)
    # categorization and one-hot encoding
    categorical_columns=features.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        features[col]=features[col].astype('category')
    features=pd.get_dummies(features,columns=categorical_columns)
    # scaling
    candidate_columns=features.select_dtypes(exclude=['bool']).columns
    scaler=scalingFunction('standard')
    # scaler=scalingFunction('minmax')
    features_scaled=features.copy()
    features_scaled[candidate_columns]=scaler.fit_transform(features[candidate_columns])
    # transform to dataframe
    features_df=pd.DataFrame(features_scaled,columns=features.columns)
    target_df=pd.DataFrame(target,columns=[target_col_name])
    # adding for x0
    features_df_normalized=normalize(features_df)
    features_df_normalized.insert(0, 'x0', 1)
    # split into datasets
    X_train, X_test, y_train, y_test = train_test_split(features_df_normalized, target_df, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    # to numpy array
    X = X_train.to_numpy()
    y = y_train.to_numpy()
    maxIteration=1000
    noFeatures=features_df_normalized.shape[1]
    # I was stuck here....
    theta = np.zeros((noFeatures,1))
    learning_rate = 0.01

    theta_final= logisticRegression(X, y, theta, learning_rate, maxIteration, noFeatures)
    X_test_np = X_test.to_numpy()
    predictions = predict(X_test_np, theta_final)
    y_test_np = y_test.to_numpy()
    accuracy = np.mean(predictions == y_test_np)
    print(f"Accuracy: {accuracy}")
    sensitivity = recall_score(y_test, predictions)
    print("Sensitivity/Recall: ", sensitivity)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    specificity = tn / (tn + fp)
    print("Specificity: ", specificity)
    precision = precision_score(y_test, predictions)
    print("Precision: ", precision)
    f1 = f1_score(y_test, predictions)
    print("F1 Score: ", f1)
    print(X_test_np.shape)
    print(theta_final.shape)
    print(X_test_np)
    print(theta_final)
    y_prob = sigmoid(np.dot(X_test_np,theta_final))  # Predicted probabilities
    print(np.isnan(y_prob).sum())
    print(np.isnan(y_test).sum())
    auroc = roc_auc_score(y_test, y_prob)
    print("AUROC: ", auroc)
    precision_values, recall_values, _ = precision_recall_curve(y_test, y_prob)
    aupr = auc(recall_values, precision_values)
    print("AUPR: ", aupr)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test_np)
    print(accuracy_score(y_test_np,y_pred))

def read_2():
    global dataframe
    column_file = 'adult/adult.names'
    data_file = 'adult/adult.data'
    columns = []
    with open(column_file, 'r') as f:
        for line in f:
            if '|' not in line:  # Ignore lines starting with '|'
                if ':' in line:
                    # Extract the column name before the ':' punctuation mark
                    col_name = line.split(':')[0].strip()
                    columns.append(col_name)
    columns.append('income-exceeds')
    dataframe = pd.read_csv(data_file, header=None)
    # print(dataframe)
    dataframe.columns = columns
    # print(dataframe)
    dataframe.replace(' ?', np.nan, inplace=True)

def read(input):
    global dataframe
    # file 1
    if input == 1:
        dataframe = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
        preprocessing('Churn')
    # file 2
    elif input == 2:
        read_2()
        preprocessing('income-exceeds')
    # file 3
    elif input == 3:
        dataframe = pd.read_csv('creditcard.csv')
        preprocessing('Class')
    else:
        print("Invalid input")
    # dataframe.head()

user_input = int(input("Enter 1, 2 or 3: "))
read(user_input)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column].fillna(mode_value,inplace=True)


Accuracy: 0.41100025638687926
Sensitivity/Recall:  0.9544235924932976
Specificity:  0.4063706563706564
Precision:  0.3666323377960865
F1 Score:  0.5297619047619048
(1409, 13619)
(13619, 1)
[[ 1.         -0.43988526 -1.27735389 ... -0.01191574 -0.01191574
  -0.01191574]
 [ 1.         -0.43988526  0.35134502 ... -0.01191574 -0.01191574
  -0.01191574]
 [ 1.         -0.43988526  0.79923722 ... -0.01191574 -0.01191574
  -0.01191574]
 ...
 [ 1.         -0.43988526 -0.62587433 ... -0.01191574 -0.01191574
  -0.01191574]
 [ 1.         -0.43988526  1.49143426 ... -0.01191574 -0.01191574
  -0.01191574]
 [ 1.         -0.43988526 -1.27735389 ... -0.01191574 -0.01191574
  -0.01191574]]
[[-15.14531688]
 [  0.71263846]
 [ -4.17499125]
 ...
 [  0.18046772]
 [ -0.27959666]
 [ -0.38726435]]
0
Churn    0
dtype: int64
AUROC:  0.8268940138913328
AUPR:  0.6872713528915502


  y = column_or_1d(y, warn=True)


In [3]:
# rows,cols=dataframe.shape
# cols,rows
# dataframe.describe()
# dataframe.isnull().sum()
# dataframe.duplicated().sum()
# Data cleaning
# dataframe['Attrition'].isnull().sum()
# # drop the rows where the target value is null
# dataframe.dropna(subset=['Attrition'], inplace=True)
# dataframe['Attrition'].isnull().sum()
# dataframe.isnull().sum()
# dataframe.shape
# dataframe.fillna(dataframe.mean(numeric_only=True),inplace=True)
# non_numerical_columns = dataframe.select_dtypes(exclude=['int64', 'float64']).columns
# for column in non_numerical_columns:
#     mode_value = dataframe[column].mode()[0]
#     dataframe[column].fillna(mode_value,inplace=True)
# dataframe.isnull().sum()
# dataframe.duplicated().sum()
# dataframe.drop_duplicates(inplace=True)
# dataframe.duplicated().sum()
# # Creation of input and output features
# features=dataframe.drop('Attrition',axis=1)
# target=dataframe['Attrition']
# #Conversion of features into numeric values
# from sklearn.preprocessing import LabelEncoder
# encoder=LabelEncoder()
# target=encoder.fit_transform(target)
# target
# categorical_columns=features.select_dtypes(include=['object']).columns
# for col in categorical_columns:
#     features[col]=features[col].astype('category')
# features=pd.get_dummies(features,columns=categorical_columns)
# features
# features.dtypes
# Scaling of the features
# candidate_columns=features.select_dtypes(exclude=['bool']).columns
# candidate_columns
# from sklearn.preprocessing import StandardScaler,MinMaxScaler
# def scalingFunction(scaling='standard'):
#     if scaling == 'standard':
#         scaler = StandardScaler()
#     elif scaling == 'minmax':
#         scaler = MinMaxScaler()
    
#     return scaler

# scaler=scalingFunction('standard')
# # scaler=scalingFunction('minmax')
# features_scaled=features.copy()
# features_scaled[candidate_columns]=scaler.fit_transform(features[candidate_columns])
# features_scaled

#  Correlation Analysis
# # transform from numpy array to dataframe
# features_df=pd.DataFrame(features_scaled,columns=features.columns)
# target_df=pd.DataFrame(target,columns=['Attrition'])
# target_df
# target_series=target_df['Attrition']
# correlation=features_df.corrwith(target_series)
# correlation
# from matplotlib import pyplot as plt
# import seaborn as sns
# features_with_target = pd.concat([features_df, target_series], axis=1)
# correlation_matrix = features_with_target.corr()
# plt.figure(figsize=(30, 30))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.1f')
# plt.title('Correlation Matrix')
# plt.show()
# import numpy as np

# features_df.reset_index(drop=True, inplace=True)
# target_df.reset_index(drop=True, inplace=True)

# class_0=features_df.loc[target_df['Attrition']==0]
# class_1=features_df.loc[target_df['Attrition']==1]

# def plot(column):
#     plt.plot(class_0[column],np.zeros_like(class_0[column]),'o',label='No')
#     plt.plot(class_1[column],np.zeros_like(class_1[column]),'x',label='Yes')

#     plt.legend()
#     plt.xlabel(column)
#     plt.title(f"1D scatter plot of {column}")
#     plt.show()

# top_20_features = correlation.abs().nlargest(20).index
# print(top_20_features)
# for feature in top_20_features:
#     plot(feature)

# Validating the pipeline (Bonus Task)
# top_20_features = correlation.index.tolist() #standard scaling
# features_df = features_df[top_20_features]
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.2, random_state=42)

# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# accuracy