In [None]:
import os, collections, json
from os import listdir, makedirs
from os.path import join, isdir
import pandas as pd
import numpy as np
import seaborn as sns

# Lift pandas' display truncation so that wide and long frames are rendered in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
from data_manipulations import *

### Load data

In [None]:
# Read the merged workbook once. `raw_data` is kept untouched as a reference;
# all following cells operate on the independent working copy `data`.
data_path = join('data', '04_merged', 'all_data_combined.xlsx')
raw_data = pd.read_excel(io=data_path)
data = raw_data.copy(deep=True)

### Global variables

In [None]:
# Switch for writing the preprocessed CSV exports at the end of the notebook.
save = True

# Target directory for all preprocessed exports.
preprocessed_folder = join('data', '05_preprocessed')

# `exist_ok=True` makes the creation idempotent and removes the
# check-then-create race of a separate `isdir` test.
makedirs(preprocessed_folder, exist_ok=True)

In [None]:
# Template names; they are passed to `check_template` to group the columns of `data`.
kis = "Arztbrief/KIS Angaben"
beh_plan = "Behandlungsplan"
ct = "CT"
kai = "Klinisch-anamnestische Information"
labor = "Laborparameter"

In [None]:
columns = list(data.columns)


def _columns_of(template):
    """Return the column names for which `check_template(col, template)` is truthy."""
    return [col for col in columns if check_template(col, template)]


# One column list per template.
arztbrief_columns = _columns_of(kis)
behandlungsplan_columns = _columns_of(beh_plan)
ct_columns = _columns_of(ct)
kai_columns = _columns_of(kai)
laborparameter_columns = _columns_of(labor)

### Replace NaNs

In [None]:
# Answer options that carry no information and are therefore treated as missing.
nan_values = ['Nicht beantwortet', 'Unbekannt', 'Nicht evaluierbar']

# Map all placeholder answers to NaN in a single pass over the frame.
data = data.replace(to_replace=nan_values, value=np.nan)

### Feature selection

In [None]:
# Load numerical, nominal and ordinal features.
# The feature names contain non-ASCII characters (e.g. umlauts), so the file is
# decoded explicitly as UTF-8 instead of relying on the platform default encoding.
with open('data/features/full_features.json', 'r', encoding='utf-8') as fp:
    features = json.load(fp)

# These names reference the lists stored inside `features`; in-place changes
# made during feature selection are therefore visible through `features` as well.
num_features = features['num']
ordinal_features = features['ordinal']
nominal_features = features['nominal']

#### Nominal feature selection

In [None]:
# Percentage of missing values per nominal column
data[nominal_features].isna().sum().mul(100).div(len(data))

Remove nominal (non-ordinal) columns where more than 30% of the data is missing; columns with 'Laborparameter' in their name are kept regardless.

In [None]:
# Share of missing entries per nominal column, in percent.
percent_missing = data[nominal_features].isna().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({
    'column_name': nominal_features,
    'percent_missing': percent_missing,
})

# Columns with more than 30 % missing values are candidates for removal.
too_sparse = missing_value_df['percent_missing'] > 30
drop_features = missing_value_df.loc[too_sparse, 'column_name'].tolist()
for feature in drop_features:
    if 'Laborparameter' in feature:
        continue  # columns with 'Laborparameter' in their name are kept regardless
    nominal_features.remove(feature)

#### Ordinal feature selection

In [None]:
# Percentage of missing values per ordinal column
data[ordinal_features].isna().sum().mul(100).div(len(data))

Remove ordinal columns where more than 30% of the data is missing; columns with 'Laborparameter' in their name are kept regardless.

In [None]:
# Share of missing entries per ordinal column, in percent.
percent_missing = data[ordinal_features].isna().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({
    'column_name': ordinal_features,
    'percent_missing': percent_missing,
})

# Columns with more than 30 % missing values are candidates for removal.
too_sparse = missing_value_df['percent_missing'] > 30
drop_features = missing_value_df.loc[too_sparse, 'column_name'].tolist()
for feature in drop_features:
    if 'Laborparameter' in feature:
        continue  # columns with 'Laborparameter' in their name are kept regardless
    ordinal_features.remove(feature)

#### Numerical feature selection

In [None]:
# Transform datatype of numerical features.
# The conversion is best-effort: a column that cannot be converted is reported
# (together with the reason) and left unchanged.
for num in num_features:
    try:
        data[num] = pd.to_numeric(data[num])
    except Exception as exc:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still interrupt the notebook.
        print(f'{num} ({type(exc).__name__}: {exc})')

In [None]:
# Percentage of missing values per numerical column
data[num_features].isna().sum().mul(100).div(len(data))

Remove numerical columns where more than 30% of the data is missing; columns with 'Laborparameter' in their name are kept regardless.

In [None]:
# Share of missing entries per numerical column, in percent.
percent_missing = data[num_features].isna().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({
    'column_name': num_features,
    'percent_missing': percent_missing,
})

# Columns with more than 30 % missing values are candidates for removal.
too_sparse = missing_value_df['percent_missing'] > 30
drop_features = missing_value_df.loc[too_sparse, 'column_name'].tolist()
for feature in drop_features:
    if 'Laborparameter' in feature:
        continue  # columns with 'Laborparameter' in their name are kept regardless
    num_features.remove(feature)

### Feature encoding

#### Ordinal feature encoding

In [None]:
# Mapping: ordinal feature name -> encoding passed to `encode`.
# Decoded explicitly as UTF-8 because the feature names contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open('data/features/ordinal_encoding.json', 'r', encoding='utf-8') as fp:
    ordinal_encoding = json.load(fp)

In [None]:
# List every ordinal feature that has no entry in the encoding file.
unmapped_ordinals = [cat for cat in ordinal_features if cat not in ordinal_encoding]
for cat in unmapped_ordinals:
    print(cat)

In [None]:
# Encode each ordinal column with its mapping; features that raise a KeyError
# (e.g. no entry in `ordinal_encoding`) are printed and left unchanged.
for feature in ordinal_features:
    try:
        encoded_column = data.apply(
            lambda row: encode(row[feature], ordinal_encoding[feature]),
            axis=1,
        )
    except KeyError:
        print(feature)
    else:
        data[feature] = encoded_column

In [None]:
# Transform datatype of ordinal features.
# Best-effort: a column that cannot be converted is reported (with the reason)
# and left unchanged.
for feature in ordinal_features:
    try:
        data[feature] = pd.to_numeric(data[feature])
    except Exception as exc:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still interrupt the notebook.
        print(f'{feature} ({type(exc).__name__}: {exc})')

#### Non-ordinal feature encoding

Variante 1: Label encoding

In [None]:
# Name of the label-encoded companion column for every nominal feature.
nominal_features_code = ['{}##CODE'.format(col) for col in nominal_features]

In [None]:
# Label-encode every nominal feature into its '##CODE' companion column.
for col, code_col in zip(nominal_features, nominal_features_code):
    data[col] = data[col].astype('category')
    codes = data[col].cat.codes
    # pandas encodes missing values as code -1. Left as -1 they look like a
    # regular class and are invisible to the `isna()`-based imputation further
    # down, so they are turned back into NaN. Columns without missing values
    # keep their integer codes unchanged.
    data[code_col] = codes.where(codes >= 0) if (codes < 0).any() else codes

### Feature generation
* Anzahl betroffener Lungenlappen
* Summe Schweregrad aller Lungenlappen
* Mittelwert Schweregrad aller Lungenlappen

In [None]:
# Collects the names of all engineered columns created below.
generated_features = list()

In [None]:
# CT pathologies for which severity features are generated.
pathologies = [
    'Konsolidierung',
    'Milchglasareal',
    'Bronchuswandverdickungen',
]

# Lung lobes as they appear in the 'Schweregrad (<lobe>)' column names.
lobes = [
    'Oberlappen rechts',
    'Mittellappen rechts',
    'Unterlappen rechts',
    'Oberlappen links',
    'Lingula',
    'Unterlappen links',
]

In [None]:
# Severity labels in ascending order; the code is the position in the sequence (0-based).
encoding_schweregrad = {
    label: code
    for code, label in enumerate(('subsegmental', 'segmental', 'mehrsegmental bis lobär'))
}

# Bronchial wall thickening uses its own labels; codes start at 1.
encoding_bronchi = {
    label: code
    for code, label in enumerate(('vereinzelt', 'mehrsegmental', 'generalisiert'), start=1)
}

In [None]:
schweregrad_columns = []  # initialised here but not filled in this cell
for patho in pathologies:
    # Bronchial wall thickening has its own label set; the other pathologies share one.
    severity_mapping = encoding_bronchi if patho == 'Bronchuswandverdickungen' else encoding_schweregrad

    # Encode the per-lobe severity columns in place.
    for lobe in lobes:
        col = f'CT//{patho}::Schweregrad ({lobe})'
        data[col] = data.apply(lambda row: encode(row[col], severity_mapping), axis=1)

    # Derive the three aggregate features of this pathology (order matters:
    # each later row-wise call sees the columns created before it).
    count_col = f'CT//{patho}::Anzahl betroffener Lappen'
    sum_col = f'CT//{patho}::Schweregrad Summe'
    mean_col = f'CT//{patho}::Schweregrad Mittelwert'

    data[count_col] = data.apply(
        lambda row: count_affected_lobes(row[f'CT//{patho}::Lokalisation Lappen']),
        axis=1,
    )
    data[sum_col] = data.apply(lambda row: get_agg_severity(row, patho, 'sum'), axis=1)
    data[mean_col] = data.apply(lambda row: get_agg_severity(row, patho, 'avg'), axis=1)

    generated_features.extend([count_col, sum_col, mean_col])

In [None]:
target_variable = 'Target_3'

# NOTE(review): `plt` is not imported in the notebook's visible import cell;
# presumably it arrives via `from data_manipulations import *` - verify.
figure, axis = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))

# One row per pathology, one column per generated CT feature.
ct_metrics = ('Anzahl betroffener Lappen', 'Schweregrad Summe', 'Schweregrad Mittelwert')
for i, patho in enumerate(pathologies):
    for j, metric in enumerate(ct_metrics):
        sns.boxplot(data=data, x=target_variable, y=f'CT//{patho}::{metric}', ax=axis[i, j])

* Anzahl an Komorbiditäten

In [None]:
# Comorbidity column -> values that count as "comorbidity present".
_comorb_prefix = 'Klinisch-anamnestische Information//Komorbiditäten aus Arztbrief::'
comorbs = {
    _comorb_prefix + name: present_values
    for name, present_values in (
        ('Emphysem', [2, 3]),
        ('Lungenfibrose', [2, 3]),
        ('Chronisch obstruktive Lungenerkrankung', [1]),
        ('Bluthochdruck', [1]),
        ('Herzerkrankungen', [1]),
        ('Stauung/Ödem', [1]),
        ('Dialyse', [1]),
        ('Diabetes mellitus', ['Typ I', 'Typ II']),
        ('Tabak rauchen', ['Aktuell']),
    )
}

In [None]:
# One 0/1 flag column per comorbidity: 1 if the row's value is among the
# values listed for that comorbidity in `comorbs`.
for comorb, occurence in comorbs.items():
    occurrence_col = 'Occurrence_' + comorb.split('::')[1]

    def _is_present(row):
        return int(row[comorb] in occurence)

    data[occurrence_col] = data.apply(_is_present, axis=1)

In [None]:
def _comorbidity_count(row):
    """Delegate to `count_comorbs` for a single row, using the `comorbs` mapping."""
    return count_comorbs(row, comorbs)


data['Number_comorbidities'] = data.apply(_comorbidity_count, axis=1)

In [None]:
# Register the new column as a generated feature (note: re-running this cell adds it again).
generated_features += ['Number_comorbidities']

In [None]:
# Distribution of the comorbidity count per target class.
sns.boxplot(x=target_variable, y='Number_comorbidities', data=data)

### Data modifications

In [None]:
# Drop columns that should not be included in the model.
# A feature that is not (or no longer) in the corresponding list is printed instead.
drop_num_features = [
    'Laborparameter//Entzündungsparameter::High-sensitivity C-reactive protein (hs-CRP): Wert',
    'Laborparameter//Gerinnungsfunktion::D-Dimer: Wert',
    'Laborparameter//Blutbild::Lymphozyten: Wert',
    'Laborparameter//Entzündungsparameter::Interleukin-6 (IL-6): Wert',
    'Arztbrief/KIS Angaben//Arztbrief/KIS Angaben::Tage seit Aufnahme',
]
for feature in drop_num_features:
    if feature in num_features:
        num_features.remove(feature)
    else:
        print(feature)

drop_ordinal_features = [
    'CT//Gesamtbeurteilung::Lungenparenchym',
    'Arztbrief/KIS Angaben//Outcome Parameter::Die letzte dokumentierte Patientenoutcomeerfassung beschreibt',
    'Arztbrief/KIS Angaben//Outcome Parameter::Innerhalb des erfassten Aufenthaltes war der höchste Behandlungsstatus'
]
for feature in drop_ordinal_features:
    if feature in ordinal_features:
        ordinal_features.remove(feature)
    else:
        print(feature)

Remove patients with NaN values in age column

In [None]:
age_col = 'Klinisch-anamnestische Information//Demographische Informationen::Alter'

# Number of rows without an age value (these rows are removed next).
print(data[age_col].isna().sum())

# Keep only rows with a known age and renumber the index from 0.
data = data.loc[data[age_col].notna()].reset_index(drop=True)

Build prediction models for missing values

In [None]:
# Model input columns that still contain at least one missing value.
missing_values_cols = [
    col
    for col in ordinal_features + nominal_features_code + num_features + generated_features
    if data[col].isna().any()
]

In [None]:
# Columns imputed with a regressor; every other column is imputed with a classifier.
# NOTE(review): any other continuous column in `missing_values_cols` would also be
# handled as 'classifier' - confirm that this is intended.
regression_targets = (
    'Klinisch-anamnestische Information//Klinische Symptome::Syst. RR',
    'Klinisch-anamnestische Information//Klinische Symptome::Sauerstoff-Sättigung',
)

for col in missing_values_cols:
    before = data[col].isna().sum()
    pred_type = 'regressor' if col in regression_targets else 'classifier'

    # Re-select the feature frame on every pass so it reflects the values
    # written back by earlier iterations.
    model_input = data[ordinal_features + nominal_features_code + num_features + generated_features]
    model, missing_indices, predictions = predict_missing_values(model_input, col, pred_type=pred_type)

    # Write each prediction back to the row it was made for.
    for i, ind in enumerate(missing_indices):
        data.loc[ind, col] = predictions[i]

    after = data[col].isna().sum()
    print(f'Missing values: {before} -> {after}')
    print('#########################################')

Remove patients with CT//StudyDate > 20

In [None]:
# Summary statistics of the CT study date column before filtering.
data.loc[:, 'CT//StudyDate'].describe()

In [None]:
# Number of rows that the `< 21` filter in the following cell removes
# (the exact complement of that filter, so rows with a missing value count too).
len(data[~(data['CT//StudyDate'] < 21)])

In [None]:
# Keep only rows whose CT//StudyDate value is below 21 (rows with a missing value are dropped too).
data = data.loc[data['CT//StudyDate'].lt(21)]

### Save data

In [None]:
# Export: identifiers, targets and selected features with label-encoded nominal columns.
label_encoded_columns = (
    ['PatientID', 'Location']
    + [f'Target_{i}' for i in (0, 2, 3)]
    + ordinal_features
    + nominal_features_code
    + num_features
)
preprocessed_data = data[label_encoded_columns]

if save:
    preprocessed_data.to_csv(f'{preprocessed_folder}/all_data_combined_nominal-label-encoded.csv', index=False)

In [None]:
# Export: identifiers, targets and selected features with one-hot encoded nominal columns.
# Selecting a column list already yields a new frame, so no full copy of `data`
# is made beforehand.
ohenc_data = data[['PatientID', 'Location'] + [f'Target_{i}' for i in [0,2,3]]
                  + ordinal_features + nominal_features + num_features]
ohenc_data = pd.get_dummies(ohenc_data, columns=nominal_features)

if save:
    ohenc_data.to_csv(f'{preprocessed_folder}/all_data_combined_nominal-one-hot.csv', index=False)

In [None]:
# Export: label-encoded variant including the generated features ("GF").
label_encoded_gf_columns = (
    ['PatientID', 'Location']
    + [f'Target_{i}' for i in (0, 2, 3)]
    + ordinal_features
    + nominal_features_code
    + num_features
    + generated_features
)
preprocessed_data = data[label_encoded_gf_columns]

if save:
    preprocessed_data.to_csv(f'{preprocessed_folder}/all_data_combined_nominal-label-encoded_GF.csv', index=False)

In [None]:
# Export: one-hot encoded variant including the generated features ("GF").
# Selecting a column list already yields a new frame, so no full copy of `data`
# is made beforehand.
ohenc_data = data[['PatientID', 'Location'] + [f'Target_{i}' for i in [0,2,3]]
                  + ordinal_features + nominal_features + num_features + generated_features]
ohenc_data = pd.get_dummies(ohenc_data, columns=nominal_features)

if save:
    ohenc_data.to_csv(f'{preprocessed_folder}/all_data_combined_nominal-one-hot_GF.csv', index=False)

In [None]:
# Persist the feature lists as they stand after selection and feature generation.
selected_features = dict(
    num=num_features,
    ordinal=ordinal_features,
    nominal=nominal_features,
    generated=generated_features,
)

with open('data/features/selected_features.json', mode='w') as fp:
    json.dump(selected_features, fp)