In [11]:
import pandas as pd
import os, json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression  # Using Logistic instead of Lasso for classification
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer

## 1 - Load the training data

In [12]:
file_path = os.path.join('..', 'data', 'cleaned')

train_df = pd.read_csv(os.path.join(file_path, 'train.csv'), low_memory=False)
train_df.head()

Unnamed: 0,file,VMONTH,VYEAR,VDAYR,YEAR,AGE,SEX,ETHNIC,RACE,USETOBAC,...,PHYSASST,NPNMW,RNLPN,OTHPROV,MHP,NODISP,REFOTHMD,RETAPPT,OTHDISP,ERADMHOS
0,opd2006.csv,December,2006.0,Friday,2006.0,55.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
1,opd2006.csv,November,2006.0,Thursday,2006.0,66.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
2,opd2006.csv,November,2006.0,Wednesday,2006.0,71.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,Yes,No,No,No
3,opd2006.csv,November,2006.0,Tuesday,2006.0,1.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
4,opd2006.csv,November,2006.0,Monday,2006.0,21.0,Female,Not Hispanic or Latino,White Only,Current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No


## 2 - Load the vairiables dictionary and define features for clustering

In [13]:
# Load the variables dictionary
with open(os.path.join(file_path, 'variables.json'), 'r') as f:
    variables = json.load(f)

print(f'Variable Categories:\n')
for category, list in variables.items():
    print(f'{category}')
    print(f'{list}')

Variable Categories:

dateOfVisit
['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']
demographics
['AGE', 'SEX', 'ETHNIC', 'RACE', 'USETOBAC']
payment
['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPER']
visitReason
['INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3']
patientClinicHistory
['SENBEFOR', 'PASTVIS']
vitalSigns
['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
imputedFields
['BDATEFL', 'SEXFL', 'SENBEFL', 'PASTFL']
physicianDiagnoses
['DIAG1', 'DIAG2', 'DIAG3']
differentialDiagnoses
['PRDIAG1', 'PRDIAG2', 'PRDIAG3']
presentSymptomsStatus
['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']
services
['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'P

### 2.1 Defining features

In [14]:
# Defining the independent variables as features
features = [item for item in variables['demographics'] if item not in ['USETOBAC']]\
           + variables['vitalSigns']  \
         + variables['patientClinicHistory']\
        + [item for item in variables['presentSymptomsStatus'] if item not in ['NOCHRON', 'TOTCHRON']]
#+ variables['healthEducation']+ variables['visitReason']+ variables['vitalSigns']
print(f'Features: {features}')
print(f'Number of Features: {len(features)}')

Features: ['AGE', 'SEX', 'ETHNIC', 'RACE', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS', 'SENBEFOR', 'PASTVIS', 'ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'DMP']
Number of Features: 28


In [15]:
targets = variables['services']
        #+ variables['differentialDiagnoses']  + variables['physicianDiagnoses'] 
print(f'Ftargets: {targets}')
print(f'Number of Ftargets: {len(targets)}')

Ftargets: ['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'PAPUNSP', 'HPVDNA', 'EKG', 'SPIRO', 'URINE', 'HTTAKE', 'WTTAKE', 'TEMPTAKE', 'BLODPRES', 'CAM', 'DME', 'HOMEHLTH', 'HOSPICE', 'PT', 'RADTHER', 'SPOCTHER', 'PSYCHOTH', 'OTHMNTL', 'EXCISION', 'ORTHO', 'WOUND', 'ECHOCARD', 'OTHULTRA', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5', 'PROC6', 'PROC7', 'PROC8', 'PROC9', 'CATSCAN', 'PREGTEST', 'FOOT', 'RETINAL', 'HIVTEST', 'CAST', 'SPLINT']
Number of Ftargets: 61


In [16]:
train_df['BMI'].isna().sum()

65377

In [17]:
65377/103486

0.6317472894884332

In [18]:
# Calculate the median of the 'BMI' column
bmi_median = train_df['BMI'].median()

# Fill NaN values in the 'BMI' column with the calculated median
train_df['BMI'] = train_df['BMI'].fillna(bmi_median)

In [19]:
# Preprocessing
numeric_features = ['AGE', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS','PASTVIS']
categorical_features = sorted(set(features)- set(numeric_features))

In [20]:
# Iterate over the list of numeric features
for column in numeric_features:
    # Calculate the median of each column
    median_value = train_df[column].median()
    
    # Fill NaN values in each column with its calculated median
    train_df[column] = train_df[column].fillna(median_value)

# Optionally, check if there are any NaN values left in the numeric features
for column in numeric_features:
    print(f"NaN count in {column}: {train_df[column].isna().sum()}")  # This should print 0 for each column if all NaN values are filled

NaN count in AGE: 0
NaN count in HTIN: 0
NaN count in WTLB: 0
NaN count in BMI: 0
NaN count in TEMPF: 0
NaN count in BPSYS: 0
NaN count in BPDIAS: 0
NaN count in PASTVIS: 0


In [21]:
# Iterate over the list of categorical features
for column in categorical_features:
    # Fill NaN values in each column with 'unknown'
    train_df[column] = train_df[column].fillna('unknown')

# Optionally, check if there are any NaN values left in the categorical features
for column in categorical_features:
    print(f"NaN count in {column}: {train_df[column].isna().sum()}")  # This should print 0 for each column if all NaN values are filled

NaN count in ARTHRTIS: 0
NaN count in ASTHMA: 0
NaN count in CANCER: 0
NaN count in CASTAGE: 0
NaN count in CEBVD: 0
NaN count in CHF: 0
NaN count in COPD: 0
NaN count in CRF: 0
NaN count in DEPRN: 0
NaN count in DIABETES: 0
NaN count in DMP: 0
NaN count in ETHNIC: 0
NaN count in HTN: 0
NaN count in HYPLIPID: 0
NaN count in IHD: 0
NaN count in OBESITY: 0
NaN count in OSTPRSIS: 0
NaN count in RACE: 0
NaN count in SENBEFOR: 0
NaN count in SEX: 0


In [22]:
X = train_df[features]
y = train_df[targets]

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [24]:
# Adjust y_train imputation strategy
# If y_train is a DataFrame with multiple target columns, impute each column
for column in y_train.columns:
    y_train[column].fillna('unknown', inplace=True)

  y_train[column].fillna('unknown', inplace=True)


In [25]:
# Define Models
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
}

In [41]:
# MultiOutput Wrapper

for name, model in models.items():
    print(next(iter(models.keys())))
    wrapped_model = MultiOutputClassifier(model, n_jobs=-1)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', wrapped_model)])
    #print(X_train.sample(5))
    #print(y_train.sample(5))
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, preds)}")

Random Forest


KeyboardInterrupt: 