In [1]:
# Import libraries
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [2]:
data = pd.read_csv('cov_merged.csv')
target = pd.read_csv('outcomes.csv')
print(data)

            rowId  covariateId  covariateValue  \
0          3938.0   4083311102               1   
1        175397.0     81380102               1   
2         23031.0   4145418102               1   
3        152472.0    135473102               1   
4         60021.0   4213101102               1   
...           ...          ...             ...   
5411125   67932.0    378253104               1   
5411126  135007.0   4211852104               1   
5411127  127381.0    197988104               1   
5411128   20208.0    201826104               1   
5411129  161935.0    440005104               1   

                                             covariateName  analysisId  \
0        condition_occurrence during day -365 through -...         102   
1        condition_occurrence during day -365 through -...         102   
2        condition_occurrence during day -365 through -...         102   
3        condition_occurrence during day -365 through -...         102   
4        condition_occurrence

In [3]:
# Create windows to represent analysis ids
analysisId_to_window = {101: "all", 102: '365d', 103: '180d', 104: '030d', 105: 'all',
                        106: '365d', 107: '180d', 108: '030d'}

# Create observations data file from data file
observation = pd.DataFrame()
observation['conceptId'] = data['conceptId']
observation['analysisId'] = data['analysisId']
observation['window'] = data['analysisId'].apply(lambda x: analysisId_to_window.get(x, 'Unknown'))
observation['covariateValue'] = 1
observation['patient'] = data['rowId']
observation['covariateId'] = data['covariateId']
print(observation)

         conceptId  analysisId window  covariateValue   patient  covariateId
0          4083311         102   365d               1    3938.0   4083311102
1            81380         102   365d               1  175397.0     81380102
2          4145418         102   365d               1   23031.0   4145418102
3           135473         102   365d               1  152472.0    135473102
4          4213101         102   365d               1   60021.0   4213101102
...            ...         ...    ...             ...       ...          ...
5411125     378253         104   030d               1   67932.0    378253104
5411126    4211852         104   030d               1  135007.0   4211852104
5411127     197988         104   030d               1  127381.0    197988104
5411128     201826         104   030d               1   20208.0    201826104
5411129     440005         104   030d               1  161935.0    440005104

[5411130 rows x 6 columns]


In [4]:
observation_all = observation[observation['window'] == "all"]
print(len(observation_all))
observation_365d = observation[observation['window'] == "365d"]
print(len(observation_365d))
observation_180d = observation[observation['window'] == "180d"]
print(len(observation_180d))
observation_030d = observation[observation['window'] == "030d"]
print(len(observation_030d))

# Pick time window
chosen_window = "365d"
observation = observation[observation['window'] == chosen_window]

4257791
656937
385185
111217


In [5]:
risk_window_end = 1825
# Creating outcomes list
outcomes = []
has_outcome = 0
no_outcome = 0
for i in observation['patient']:
    if i in target['rowId'].values:
        if target.loc[target['rowId'] == i, 'daysToEvent'].values[0] < risk_window_end:
            outcomes.append(1)
            has_outcome += 1
        else:
            outcomes.append(0)
            no_outcome += 1
    else:
        outcomes.append(0)
        no_outcome += 1

print(f'Has outcome {has_outcome}')
print(f'No outcome {no_outcome}')

Has outcome 1174
No outcome 655763


In [6]:
random_state = 42
test_size = 0.25
# Split the data in train and test set
X_train, X_test, y_train, y_test = train_test_split(observation, outcomes, stratify=outcomes, random_state=random_state, test_size=test_size)

In [7]:
# Load ancestor data
ancestry = pd.read_csv("relations_ancestor_total.csv")
parentage = ancestry[ancestry['MIN_LEVELS_OF_SEPARATION'] == 1]

print(len(set(parentage)))
observed_concepts_train = X_train['conceptId']

6


In [8]:
# Identify unique ancestor concepts related to observed concepts
relevant_concept_train = ancestry[ancestry['DESCENDANT_CONCEPT_ID'].isin(observed_concepts_train)][
    'ANCESTOR_CONCEPT_ID'].unique()

In [9]:
# Filter ancestry dataframe to include only relevant ancestor descendant relationships
relevant_ancestry_train = ancestry[
    ancestry['ANCESTOR_CONCEPT_ID'].isin(relevant_concept_train) | ancestry['DESCENDANT_CONCEPT_ID'].isin(
        relevant_concept_train)]
relevant_ancestry_train = relevant_ancestry_train.rename(
    columns={'ANCESTOR_CONCEPT_ID': 'ancestor', 'DESCENDANT_CONCEPT_ID': 'descendent',
             'MIN_LEVELS_OF_SEPARATION': 'minimum separation', 'MAX_LEVELS_OF_SEPARATION': 'maximum separation'}) \
    .drop(columns=['Unnamed: 0', 'CTID'])


In [10]:
# Filter parentage dataframe to include only relevant parent-child relationships
relevant_parentage_train = parentage[
    parentage['ANCESTOR_CONCEPT_ID'].isin(relevant_concept_train) | parentage['DESCENDANT_CONCEPT_ID'].isin(
        relevant_concept_train)]
relevant_parentage_train = relevant_parentage_train.rename(columns={'ANCESTOR_CONCEPT_ID': 'parent', 'DESCENDANT_CONCEPT_ID': 'child'})\
    .drop(columns=['Unnamed: 0', 'MIN_LEVELS_OF_SEPARATION', 'MAX_LEVELS_OF_SEPARATION', 'CTID'])

In [11]:
children_train = set(relevant_parentage_train['child'])
parents_train = set(relevant_parentage_train['parent'])

In [12]:
# Identify leaf concepts
leaf_concept_train = children_train - parents_train
leaf_concepts_train = pd.DataFrame({"id": list(leaf_concept_train)})

In [13]:
# Group by patient and count cumulative concept count
observed_concepts_per_patient = X_train.groupby(["patient", "window"]).agg(conceptcount=("conceptId", "nunique")).reset_index()


In [14]:
# Count the number of unique patients per window
train_patientcount = X_train.groupby("window")['patient'].nunique().reset_index()
train_patientcount = train_patientcount.rename(columns={'patient': 'count'})

In [15]:
# Merge observations with the compute unique concept count per patient
train_observation_ancestor = X_train.merge(observed_concepts_per_patient, on=['patient', 'window'])
train_observation_ancestor = train_observation_ancestor.merge(relevant_ancestry_train, left_on='conceptId', right_on='descendent')\
    .drop(columns=['descendent', 'minimum separation', 'maximum separation'])

In [16]:
# Compute the weight of each concept by inverting the unique concept count
train_observation_ancestor['weight'] = 1.0 / train_observation_ancestor['conceptcount']
train_observation_ancestor = train_observation_ancestor.rename(columns={'conceptId': 'observed concept'})
train_observation_ancestor = train_observation_ancestor.drop(columns=['conceptcount'])

In [17]:
# Aggregate weighted concept values per patient
train_per_patient_weighted_concept = train_observation_ancestor.groupby(['patient', 'window', 'ancestor']).agg(weight=('weight', 'sum')).reset_index()
train_per_patient_weighted_concept = train_per_patient_weighted_concept.rename(columns={'ancestor': 'conceptId'})

In [18]:
# Merge weighted concept values with patient count per window
df_train_weighted_concept = train_per_patient_weighted_concept.merge(train_patientcount, on="window")


In [19]:
# Group by window and concept and aggregate the weight
train_weighted_concept = df_train_weighted_concept.groupby(["window", "conceptId"], as_index=False).agg(weight=("weight", "sum"))

In [20]:
# Normalize weights by dividing by patient count
train_weighted_concept["weight"] = train_weighted_concept["weight"] / df_train_weighted_concept.groupby(["window", "conceptId"])["count"].first().values


In [21]:
# Rename columns for clarity in parent child relationships
wcfp = train_weighted_concept.rename(columns={"conceptId": "parent", "weight": "parent weight"})
wcfc = train_weighted_concept.rename(columns={"conceptId": "child", "weight": "child weight"})

# Merge with parent child relationships
df_wcfp = relevant_parentage_train.merge(wcfp, on="parent")
df_wcfc = df_wcfp.merge(wcfc, on=["child", "window"])

In [22]:
# Assign a weight of zero to leaf nodes
leaf_df = leaf_concepts_train.merge(wcfp, left_on='id', right_on='parent').drop(columns=['parent'])
leaf_df["child"] = None
leaf_df["child weight"] = 0.0
leaf_df = leaf_df.rename(columns={"id": "parent"})

In [23]:
# Combine hierarchical relationships and leaf concepts
train_weighted_parentage = pd.concat([df_wcfc, leaf_df], ignore_index=True)


In [24]:
# Choose parent weight and child weight
parent_weight = 0.2
child_weight = 0.5

# Select concepts where parent weight is >= chosen parent weight and child weight < chosen child weight
concept_selection = train_weighted_parentage[(train_weighted_parentage['parent weight'] >= parent_weight) & (child_weight > train_weighted_parentage['child weight'])]


In [25]:
# Select only observed concepts
observed = X_train['conceptId']
# Select observed descendants
observed_descendants = relevant_ancestry_train[relevant_ancestry_train['descendent'].isin(observed)]
# Select concepts with observed descendants
concepts_with_observed_descendants = observed_descendants['ancestor']
# Selected concepts by knowledge dr
selected_concepts_list = concept_selection['child']
# Select only concepts that are selected with knowledge based dimension reduction technique
selected_concepts_df = X_train[X_train['conceptId'].isin(selected_concepts_list)]

In [26]:
patient = []
concept = []
covariateId = []
covariateValue = []
analysisId = []

for _, row in X_train.iterrows():

    if row['conceptId'] in set(selected_concepts_list) and row['conceptId'] in set(concepts_with_observed_descendants):
        patient.append(row['patient'])
        concept.append(row['conceptId'])
        covariateId.append(row['covariateId'])
        analysisId.append(row['analysisId'])
        covariateValue.append(1)
    else:
        patient.append(row['patient'])
        concept.append(row['conceptId'])
        covariateId.append(row['covariateId'])
        analysisId.append(row['analysisId'])
        covariateValue.append(0)
        # pass

df = pd.DataFrame()
df['conceptId'] = concept
df['analysisId'] = analysisId
df['covariateValue'] = covariateValue
df['patient'] = patient
df['covariateId'] = covariateId

X_train = df

In [27]:
def pivot_covariates(df):
    observation = df.pivot(index='patient', columns='covariateId', values='covariateValue')
    return observation.fillna(0)

X_train = pivot_covariates(X_train)
X_test = pivot_covariates(X_test)

print(len(X_train))
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

144317


In [28]:
def create_outcomes(indexes):
    outcomes = []
    for i in indexes:
        if i in target['rowId'].values:
            days_to_event = target.loc[target['rowId'] == i, 'daysToEvent'].values[0]
            outcomes.append(1 if days_to_event < risk_window_end else 0)
        else:
            outcomes.append(0)
    return outcomes

y_train = create_outcomes(X_train.index)
y_test = create_outcomes(X_test.index)

In [29]:
print(len(X_train))
print(len(y_train))

144317
144317


In [30]:
# Choose model, penalty, solver
lr_penalty = 'l1'
lr_solver = "liblinear"
model = LogisticRegression(penalty=lr_penalty, solver=lr_solver)

In [31]:
# Fit model on train data
model.fit(X_train, y_train)

# Get model coefficients
model_coefficients = model.coef_[0]
print(model.coef_)

number_model_features = len(set(model_coefficients))

print('Number of model features:', number_model_features)

# Get train probabilities
y_prob_train = model.predict_proba(X_train)

# Get ths train auc score
train_auc = roc_auc_score(y_train, y_prob_train[:, 1])
print(train_auc)

# Get test probabilities
y_prob_test = model.predict_proba(X_test)

# Get ths test auc score
test_auc = roc_auc_score(y_test, y_prob_test[:, 1])
print(test_auc)

# Calculate precision-recall curve
precision_test, recall_test, _ = precision_recall_curve(y_test, y_prob_test[:, 1])

# Calculate AUPRC
auprc_test_score = auc(recall_test, precision_test)

[[0. 0. 0. ... 0. 0. 0.]]
Number of model features: 1
0.5
0.5
