In [None]:
import numpy as np
import pandas as pd
import healthcareai as hai
from sklearn.model_selection import train_test_split

In [None]:
# Load the PFT results extract; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./PFTResultsDatasetv2.csv')

In [None]:
"""
      'TestTakenDTS'              # Same as VisitDate except with time. discarding for now
    
    , 'HeightInches'              # Information captured in BMI
    , 'WeightLbs'                 # Information captured in BMI
    
    , 'AssumedPatientEncounterID' # ID column. no semantic significance
    , 'PatientID'                 # ID column. no semantic significance
    , 'PAT_DemId'                 # ID column. no semantic significance
    , 'VISITINFO_ID'              # ID column. no semantic significance
    , 'LEVEL_ID'                  # ID column. no semantic significance
    , 'MEAS_Id'                   # ID column. no semantic significance
    , 'SUBMEAS_Id'                # ID column. no semantic significance
    
    , 'IsCOPDFLG'                 # This is based on the patient's problem list, which is unreliable:
                                  # per the problem documentation, patients were mistakenly diagnosed unless given a test
    
    , 'IsFEV1FVCUnder70FLG'       # Removing this feature because it is derived from the target column FEV1FVC_PRE
"""

# Columns removed from the raw frame before modelling. The per-column rationale
# (redundant timestamp, BMI inputs, ID columns, unreliable or target-derived flags)
# is recorded in the note directly above this list.
col_exclude = [
    'TestTakenDTS',
    'HeightInches',
    'WeightLbs',
    'AssumedPatientEncounterID',
    'PatientID',
    'PAT_DemId',
    'VISITINFO_ID',
    'LEVEL_ID',
    'MEAS_Id',
    'SUBMEAS_Id',
    'IsCOPDFLG',
    'IsFEV1FVCUnder70FLG',
]

In [None]:
# The healthcareai package seems not to like "Is" at the start of column names
# (it appears to confuse it with Python syntax), so each affected column is
# mapped to the same name without the "Is" prefix.
alias = {
    'IsChlamydialInfectionNBR': 'ChlamydialInfectionNBR',
    'IsDiabetesMellitusNBR': 'DiabetesMellitusNBR',
    'IsHeartFailureNBR': 'HeartFailureNBR',
    'IsUpperRespiratoryNBR': 'UpperRespiratoryNBR',
    'IsBronchitisNBR': 'BronchitisNBR',
    'IsCOPDNBR': 'COPDNBR',
    'IsAsthmaNBR': 'AsthmaNBR',
    'IsPneumoniaNBR': 'PneumoniaNBR',
    'IsCoughNBR': 'CoughNBR',
    'IsDyspneaNBR': 'DyspneaNBR',
    'IsRespiratorySymptomsNBR': 'RespiratorySymptomsNBR',
    'IsChestPainNBR': 'ChestPainNBR',
    'IsFatigueNBR': 'FatigueNBR',
    'IsSleepDisorderNBR': 'SleepDisorderNBR',
    'IsChlamydialInfectionFLG': 'ChlamydialInfectionFLG',
    'IsDiabetesMellitusFLG': 'DiabetesMellitusFLG',
    'IsHeartFailureFLG': 'HeartFailureFLG',
    'IsUpperRespiratoryFLG': 'UpperRespiratoryFLG',
    'IsBronchitisFLG': 'BronchitisFLG',
    'IsAsthmaFLG': 'AsthmaFLG',
    'IsPneumoniaFLG': 'PneumoniaFLG',
    'IsCoughFLG': 'CoughFLG',
    'IsDyspneaFLG': 'DyspneaFLG',
    'IsRespiratorySymptomsFLG': 'RespiratorySymptomsFLG',
    'IsChestPainFLG': 'ChestPainFLG',
    'IsFatigueFLG': 'FatigueFLG',
    'IsSleepDisorderFLG': 'SleepDisorderFLG',
    'IsTobaccoUserDSC': 'TobaccoUserDSC',
    'IsMaleFLG': 'MaleFLG',
}

# Keep every column except those on the exclusion list, then apply the renames.
# Alternate selection method: df.loc[:, ~df.columns.isin(col_exclude)]
copd = df.drop(columns=col_exclude).rename(columns=alias)

In [None]:
def round_nan(number):
    """Round a scalar to the nearest integer, passing missing values through.

    Parameters
    ----------
    number : scalar
        The value to round. May be a float, NaN, or None.

    Returns
    -------
    int or same type as input
        ``round(number)`` when the value is present; the input itself,
        unchanged, when it is missing (NaN or None).

    Notes
    -----
    Built-in ``round`` rounds halves to the nearest even integer
    (e.g. 2.5 -> 2), so this is not "round half up".
    """
    # pd.isna treats None as missing too; np.isnan would raise TypeError on None,
    # which can occur in object-dtype columns.
    if pd.isna(number):
        return number
    return round(number)

In [None]:
# Whole numbers are precise enough for these measurements: the decimal places
# are not significant on their own. Example: for a BMI of 29.24107, the extra
# 0.24107 would not determine a person's BMI level.
for vital_column in ('BMI', 'BloodPressureDiastolicNBR', 'BloodPressureSystolicNBR'):
    copd[vital_column] = copd[vital_column].apply(round_nan)

In [None]:
# Idea under consideration: bin this value into ordinal decades and regress to the bin
# (0-9 -> 0, 10-19 -> 1, 20-29 -> 2, 30-39 -> 3, 40-49 -> 4, ...).
# NOTE(review): the original note promised "10 values" but labelled the last bin
# "90-100 = 10"; ten bins would end at 90-100 -> 9 - confirm the intended scheme.
target = copd.loc[:, 'FEV1FVC_PRE']

In [None]:
# Count the distinct values in every column and drop the columns that have
# fewer than two, since they would not help with the training.
# (nunique() leaves NaN out of the count by default.)
distinct_counts = copd.nunique()
low_cardinality = distinct_counts[distinct_counts < 2].index.tolist()

copd.drop(columns=low_cardinality, inplace=True)

In [None]:
# Configure the healthcareai trainer to predict FEV1FVC_PRE as a regression target.
# (The variable keeps its "classification" name because later cells refer to it.)
# NOTE(review): the original note here described a 70% train / 20% validation /
# 10% final-evaluation split over 4,783 rows, but no explicit split is performed
# in this cell - confirm whether SupervisedModelTrainer handles the split itself.
classification_trainer = hai.SupervisedModelTrainer(
    dataframe=copd,
    predicted_column='FEV1FVC_PRE',
    model_type='regression',
    grain_column='MRN',
    impute=True,
    imputeStrategy='RandomForest',
    verbose=True,
)

In [None]:
# Build the random forest regression model through the configured trainer.
# Presumably this trains on the dataframe handed to the trainer - confirm against
# the healthcareai documentation. The result's `.metrics` is printed in the next cell.
rfr = classification_trainer.random_forest_regression()

In [None]:
# Show the metrics stored on the random forest regression result.
print(rfr.metrics)