In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time

# Read the training and test sets, which are stored as separate (pre-split)
# CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_df.csv', './data/test_df.csv')
)

# data preprocessing function
def normalize_data(arr):
    """Scale the values in *arr* so that they sum to 1.

    Every element is divided by the sum of all elements.

    Args:
        arr: One-dimensional sequence or array of numbers.

    Returns:
        A list of floats with the same length as *arr*. An empty input
        yields an empty list.

    Raises:
        ZeroDivisionError: If the elements sum to zero, because the scaled
            values would be undefined (NaN/inf) and would only fail later,
            far from the cause.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return []

    # One vectorized reduction instead of a Python-level loop over the array.
    total = values.sum()
    if total == 0:
        raise ZeroDivisionError("cannot normalize data whose sum is zero")

    return (values / total).tolist()

# Separate each frame into its feature matrix (everything except the
# 'km_diagnosis' column) and its label column.
X_train, y_train = train_df.drop(columns=['km_diagnosis']), train_df['km_diagnosis']
X_test, y_test = test_df.drop(columns=['km_diagnosis']), test_df['km_diagnosis']

# Columns rescaled with normalize_data (noted by the author as the ones that
# are not already in the 0~1 range).
columns_to_normalize = [
    'bmi',
    'forehead_circumference',
    'neck_circumference',
    'armpit_circumference',
    'bust',
    'rib_cage',
    'waist_circumference',
    'iliac_circumference',
    'femur_circumference',
    'urinenighttime_urination',
]

# NOTE(review): each frame is normalized from its own column values, so the
# train and test features are scaled independently of each other. Presumably
# this mirrors the HE pipeline -- confirm that this is intended.
for features in (X_train, X_test):
    for col in columns_to_normalize:
        features[col] = normalize_data(features[col].values)


# train logistic regression model
# Draw the initial parameters uniformly from [-1, 1): one weight per feature
# plus one trailing entry for the intercept (meant to mirror the HE model).
# NOTE(review): the draw is unseeded, so it differs between runs -- confirm
# how the HE model's initial beta is supposed to be shared.
n_features = X_train.shape[1]
initial_beta = 2 * np.random.rand(n_features + 1) - 1

# warm_start=True is what makes fit() start from the coef_/intercept_ assigned
# below; without it scikit-learn ignores any pre-set coefficients.
# The (1, n_features) coef_ shape assumes a binary target, as does the
# predict_proba(...)[:, 1] thresholding further down.
model = LogisticRegression(C=1e5, solver='lbfgs', max_iter=1000,
                           fit_intercept=True, warm_start=True)
model.coef_ = initial_beta[:-1].reshape(1, -1)
model.intercept_ = np.array([initial_beta[-1]])

# perf_counter is monotonic and high-resolution, so it is the right clock for
# measuring a short elapsed interval.
start_time = time.perf_counter()
model.fit(X_train, y_train)
execution_time = time.perf_counter() - start_time
print('model training execution time: ', execution_time)

# model prediction (default decision threshold of 0.5)
y_pred = model.predict(X_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Plaintext model: {accuracy}")

# set threshold to 0.6 and then calculate accuracy again
# Column 1 of predict_proba is the probability of model.classes_[1]; map the
# thresholded decision back onto the actual class labels instead of assuming
# that the labels are the integers 0 and 1.
y_pred_proba = model.predict_proba(X_test)[:, 1]
threshold = 0.6
y_pred_adjusted = np.where(y_pred_proba >= threshold,
                           model.classes_[1], model.classes_[0])
adjusted_accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Accuracy of Plaintext model with adjusted threshold(0.6): {adjusted_accuracy}")

model training execution time:  0.09090709686279297
Accuracy of Plaintext model: 0.5466666666666666
Accuracy of Plaintext model with adjusted threshold(0.6): 0.5466666666666666
