In [136]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
from tqdm.notebook import trange, tqdm

from sklearn.linear_model import (
	LogisticRegression, LogisticRegressionCV
)
from sklearn.metrics import (
	confusion_matrix, classification_report, f1_score,
	roc_curve, roc_auc_score, auc, RocCurveDisplay,
    brier_score_loss
)
from sklearn.model_selection import (
	train_test_split, RandomizedSearchCV, GridSearchCV, 
	cross_val_score, cross_val_predict, KFold, StratifiedKFold,
    RepeatedStratifiedKFold
)
from sklearn.preprocessing import (
	LabelEncoder, OneHotEncoder, StandardScaler,
	RobustScaler, QuantileTransformer,
)

# Random seed passed as `random_state` to train_test_split
SEED = 123
# Fraction of samples held out as the test split
TEST_SIZE = 0.3

In [137]:
# Load the pickled DataFrame.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(Path('../data/df.pkl'), 'rb') as f:
    df = pickle.load(f)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a spurious trailing "None".
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   pid                   148 non-null    object  
 1   study                 148 non-null    object  
 2   sample_id             148 non-null    object  
 3   class                 148 non-null    category
 4   age_years             148 non-null    float64 
 5   gender                148 non-null    category
 6   smoking_status        148 non-null    category
 7   packyears             148 non-null    float64 
 8   artery_number_5       148 non-null    int64   
 9   artery_volume_5       148 non-null    float64 
 10  artery_tortuosity_5   148 non-null    float64 
 11  artery_number_10      148 non-null    int64   
 12  artery_volume_10      148 non-null    float64 
 13  artery_tortuosity_10  148 non-null    float64 
 14  artery_number_15      148 non-null    int64   
 15  artery

Unnamed: 0,pid,study,sample_id,class,age_years,gender,smoking_status,packyears,artery_number_5,artery_volume_5,...,artery_tortuosity_15,vein_number_5,vein_volume_5,vein_tortuosity_5,vein_number_10,vein_volume_10,vein_tortuosity_10,vein_number_15,vein_volume_15,vein_tortuosity_15
0,2008-420,Cooper,420-1,Malignant,65.0,M,former,5.0,1,0.005662,...,1.0,1,0.013388,1.0,1,0.021113,1.0,1,0.034588,1.02139
1,03745-2,PLuSS,LS15-0098,Benign,69.4,M,former,41.0,0,0.00125,...,1.02196,0,0.0,1.0,1,0.055875,1.0,3,0.111875,1.0
2,03533-3,PLuSS,LS15-0162,Benign,66.5,F,former,60.0,1,0.034375,...,1.23261,0,0.0,1.0,1,0.03275,1.0,2,0.091125,1.41931
3,03336-8,PLuSS,LS14-0337,Benign,69.6,M,former,75.0,1,0.0145,...,1.0373,2,0.03875,1.0358,4,0.106625,1.0385,9,0.262,1.04822
4,03244-8,PLuSS,LS15-0063,Benign,70.4,M,former,37.0,4,0.25575,...,1.17149,5,0.388542,1.13853,10,0.520625,1.10964,15,0.892933,1.08346


In [138]:
# Column groups: demographics plus every column whose name matches the
# artery / vein patterns.
demo_cols = ['age_years', 'gender', 'smoking_status', 'packyears']
artery_cols = df.filter(regex='artery_', axis=1).columns.tolist()
vein_cols = df.filter(regex='vein_', axis=1).columns.tolist()
all_features = [*demo_cols, *artery_cols, *vein_cols]

# Feature matrix restricted to the selected columns
X = df[all_features]

# Integer-encode the class labels
le = LabelEncoder()
y = le.fit_transform(df['class'])

# Show which integer code corresponds to which class label
print("\n".join(f"{code} = {name}" for code, name in enumerate(le.classes_)))

y

0 = Benign
1 = Malignant


array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0])

In [139]:
# Hold out TEST_SIZE of the samples; SEED fixes the shuffle so the split is repeatable
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [140]:
# Preprocessing: every transformer is fitted on the TRAINING split only and
# then applied unchanged to the test split, so no test information leaks into
# the encoding or scaling.

# Categorical
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

# Column lists are taken from the training split and reused for the test split
# so both splits are guaranteed to be encoded from the same columns.
cat_cols = X_tr.select_dtypes('category').columns

X_cat_tr = pd.DataFrame(
    ohe.fit_transform(X_tr[cat_cols]),
    index=X_tr.index,
    columns=ohe.get_feature_names_out()
)

# transform (NOT fit_transform): re-fitting on the test split would learn the
# categories from test data and could yield different or misaligned columns.
X_cat_te = pd.DataFrame(
    ohe.transform(X_te[cat_cols]),
    index=X_te.index,
    columns=ohe.get_feature_names_out()
)


# Numeric
scaler = StandardScaler()

num_cols = X_tr.select_dtypes(['int', 'float']).columns

X_num_tr = pd.DataFrame(
    scaler.fit_transform(X_tr[num_cols]),
    index=X_tr.index,
    columns=num_cols
)

X_num_te = pd.DataFrame(
    scaler.transform(X_te[num_cols]),
    index=X_te.index,
    columns=num_cols
)

Xp_tr = pd.concat([X_cat_tr, X_num_tr], axis=1)
Xp_te = pd.concat([X_cat_te, X_num_te], axis=1)

# Train and test must expose identical features in identical order
assert list(Xp_tr.columns) == list(Xp_te.columns), "train/test feature mismatch"

print(f"Shape(train): {Xp_tr.shape}\nShape(test): {Xp_te.shape}")
Xp_tr.head()

Shape(train): (103, 22)
Shape(test): (45, 22)


Unnamed: 0,gender_M,smoking_status_former,age_years,packyears,artery_number_5,artery_volume_5,artery_tortuosity_5,artery_number_10,artery_volume_10,artery_tortuosity_10,...,artery_tortuosity_15,vein_number_5,vein_volume_5,vein_tortuosity_5,vein_number_10,vein_volume_10,vein_tortuosity_10,vein_number_15,vein_volume_15,vein_tortuosity_15
53,1.0,1.0,-0.333474,0.222065,-0.840531,-0.593044,-0.579218,-0.648643,-0.677647,-0.809787,...,-0.903001,-0.653706,-0.352061,0.345673,-0.652965,-0.467049,-0.169009,-0.410321,-0.548741,1.119312
19,1.0,1.0,0.849212,-0.466482,0.180999,0.718436,-0.310359,0.074934,0.382382,-0.561598,...,-0.643161,-0.910697,-0.569273,-0.294839,-0.520049,-0.432649,-0.395586,-0.163649,-0.275604,-0.661644
38,0.0,0.0,0.915842,-0.595585,-0.840531,-0.54225,-0.00427,-0.407451,-0.111597,1.367383,...,0.534424,0.374259,0.408626,-0.505027,0.676193,0.17852,-0.436323,0.165246,-0.092385,0.147801
120,0.0,1.0,0.399458,-0.724687,-0.840531,-0.628279,-0.579218,-0.76924,-0.725954,-0.822673,...,-1.108695,-0.910697,-0.598087,0.290817,-0.652965,-0.658619,-0.038213,-0.82144,-0.713526,-0.287473
23,0.0,1.0,0.416115,-1.327165,-0.840531,-0.628279,-0.579218,-0.889836,-0.726149,-0.822673,...,-0.621532,-0.653706,-0.565284,-0.733975,-0.652965,-0.685397,-0.922639,-0.574768,-0.640038,-0.856368


In [146]:
# Demographic column names as they appear after one-hot encoding
demo_cols_enc = ['age_years', 'gender_M', 'smoking_status_former', 'packyears']

# Demographic subset
X_demo_tr = Xp_tr[demo_cols_enc]
X_demo_te = Xp_te[demo_cols_enc]

# Artery subset
X_arte_tr = Xp_tr[artery_cols]
X_arte_te = Xp_te[artery_cols]

# Vein subset
X_vein_tr = Xp_tr[vein_cols]
X_vein_te = Xp_te[vein_cols]

# All feature groups side by side (demo, artery, vein)
X_all_tr = pd.concat((X_demo_tr, X_arte_tr, X_vein_tr), axis=1)
X_all_te = pd.concat((X_demo_te, X_arte_te, X_vein_te), axis=1)

In [147]:
# DEFINE MODEL
# L1-penalised logistic regression; the regularisation strength is chosen by
# 5-fold CV over a grid of 200 C values. random_state is fixed because the
# liblinear solver shuffles the data, which otherwise makes results vary
# between runs.
logit_cv = LogisticRegressionCV(
    Cs=200,
    fit_intercept=True,
    cv=5,
    penalty='l1',
    solver='liblinear',
    random_state=SEED,
)

# Feature groups to compare: output column name -> (train matrix, test matrix).
# Insertion order determines the column order of the result tables.
feature_sets = {
    'Demo': (X_demo_tr, X_demo_te),
    'Artery': (X_arte_tr, X_arte_te),
    'Vein': (X_vein_tr, X_vein_te),
    'All': (X_all_tr, X_all_te),
}

# FIT AND PREDICT
# Collect one Series per model and concatenate once at the end instead of
# growing the DataFrames step by step.
prediction_columns = []
coefficient_columns = []
for set_name, (X_fit, X_eval) in feature_sets.items():
    # fit() discards any previous fit, so the estimator can be reused; after
    # the loop `logit_cv` holds the model fitted on the last ('All') set.
    logit_cv.fit(X_fit, y_tr)
    prediction_columns.append(
        pd.Series(
            logit_cv.predict_proba(X_eval)[:, 1],  # probability of class 1
            index=X_eval.index,
            name=set_name,
        )
    )
    coefficient_columns.append(
        pd.Series(
            # ravel() keeps a 1-D array even when there is a single feature
            logit_cv.coef_.ravel(),
            index=logit_cv.feature_names_in_,
            name=set_name,
        )
    )

# GROUND TRUTH
prediction_columns.append(pd.Series(y_te, name='GroundTruth', index=X_te.index))

# COMBINE
# Coefficients are outer-joined on feature name: a feature that is not part of
# a given model's feature set is NaN in that model's column.
predictions = pd.concat(prediction_columns, axis=1)
coefficients = pd.concat(coefficient_columns, axis=1)

# EXPORT AND SHOW
# Make sure the output directory exists so the export cannot fail on a fresh checkout
Path('../output').mkdir(parents=True, exist_ok=True)
predictions.to_csv('../output/logit_predictions.csv', index=False)
predictions.round(3).head()



Unnamed: 0,Demo,Artery,Vein,All,GroundTruth
87,0.472,0.909,0.588,0.854,1
125,0.454,0.873,0.619,0.756,1
124,0.42,0.989,0.72,0.967,1
103,0.524,0.572,0.502,0.447,1
42,0.598,0.232,0.446,0.336,0


In [148]:
# Fitted coefficients, one column per model; NaN marks features that were not
# part of that model's feature set.
coefficients

Unnamed: 0,Demo,Artery,Vein,All
age_years,-0.418889,,,-0.20222
gender_M,0.0,,,0.0
smoking_status_former,0.0,,,0.0
packyears,0.0,,,-0.254038
artery_number_5,,2.110092,,1.037282
artery_volume_5,,0.0,,0.0
artery_tortuosity_5,,0.0,,0.0
artery_number_10,,0.0,,0.0
artery_volume_10,,0.0,,0.0
artery_tortuosity_10,,-0.075722,,0.0
