In [None]:
import pandas as pd
import numpy as np
import re
import csv
import random

## Open annotations file 

Open the 'annotations.txt' file and read in every line starting with 'text_'. This prefix indicates a line holding phenotypes annotated from a patient's text file of notes. From each such line we can extract the patient ID for the given phenotypes.

In [None]:
# Keep only the lines that begin with "text_" and slice that 5-character
# prefix off, so each stored row starts right after it.
with open("annotations.txt","r") as annotations_file:
    annotations = [row[5:] for row in annotations_file if row.startswith("text_")]


## Cleaning up the lines read in from annotations

In [None]:
# Split every annotation line on tabs and clean each field.
#
# NOTE: str.strip(chars) removes any of the given *characters* from both ends
# of a string, not a literal prefix/suffix, so strip('.txt') or
# strip('http://purl.obolibrary.org/obo/') can eat legitimate leading/trailing
# letters of a field. Exact suffix/prefix matching is used instead.
FILE_SUFFIX = '.txt'                            # extension left on the notes-file field
OBO_PREFIX = 'http://purl.obolibrary.org/obo/'  # URI prefix in front of the ontology ID
EXPECTED_FIELDS = 8                             # number of entries a complete line has


def _clean_field(field):
    """Return `field` without surrounding newlines, a trailing '.txt' or a leading OBO URI prefix."""
    cleaned = field.strip('\n')
    if cleaned.endswith(FILE_SUFFIX):
        cleaned = cleaned[:-len(FILE_SUFFIX)]
    if cleaned.startswith(OBO_PREFIX):
        cleaned = cleaned[len(OBO_PREFIX):]
    return cleaned


# nested list: one inner list of cleaned fields per line of the file
pheno = [
    [_clean_field(field) for field in re.split(r'\t+', annotation)]  # split lines by \t delimiter
    for annotation in annotations
]

for fields in pheno:
    if len(fields) != EXPECTED_FIELDS:
        # ensure all lines have the same number of entries; position 6
        # is variably filled so if not, add in 'NA'
        fields.insert(5, 'NA')
        

## Formatting 

In [None]:
annotations_df = pd.DataFrame(pheno) # create dataframe from pheno

# Keep the first 2 columns (patient ID and Human Phenotype Ontology (HP) ID).
# .copy() gives an independent frame so later column assignments do not touch
# annotations_df or trigger chained-assignment warnings.
phenotypes = annotations_df.iloc[:, 0:2].copy()

# Set column names for use later. Plain assignment is used instead of
# set_axis(..., inplace=False) because the `inplace` keyword no longer exists
# in pandas 2.x.
phenotypes.columns = ['Patient_ID', 'HP_ID']


In [None]:
# Preview the first rows of the two-column (Patient_ID, HP_ID) frame.
phenotypes.head()

In [None]:
# tidy up the Human Phenotype ontology (HP) identifiers to HP:0100806 ...

phenotypes['HP_ID'] = [x.replace('_',':') for x in phenotypes['HP_ID']]
phenotypes = phenotypes.set_index(phenotypes.loc[:,'Patient_ID'], drop=True)

# Collect the HP IDs of each patient in a single pass. A dict keeps insertion
# order, so patients appear in the order they are first seen in the file.
hp_ids_by_patient = {}
for patient_id, hp_id in zip(phenotypes.index, phenotypes['HP_ID']):
    hp_ids_by_patient.setdefault(patient_id, []).append(hp_id)

# Build the cleaned dataframe once: index = patient ID, one 'HP_IDs' column
# holding that patient's HP IDs as a ';' delimited string. (Growing a frame
# row by row is slow, and DataFrame.append was removed in pandas 2.0.)
phenos = pd.DataFrame(
    {'HP_IDs': [';'.join(hp_ids) for hp_ids in hp_ids_by_patient.values()]},
    index=pd.Index(list(hp_ids_by_patient), name='Patient_ID'),
)

In [None]:
# (rows, columns) of the per-patient table; there is one row per patient ID.
phenos.shape

# Note: there may be fewer than 1000 patients here, as some patients in the sample (although existing in NOTEEVENTS)
# do not actually have notes that can be annotated.

# If using updated code for your own sample, this should not be an issue.

In [None]:
# Preview the first patients and their ';' delimited HP IDs.
phenos.head()

### Save as tab delimited file

This should be in the right format for the Semantic Measures Library toolkit: https://www.semantic-measures-library.org/sml/index.php?q=toolkit 

In [None]:
# All patients in the sample: one tab-separated row per patient
# (patient ID from the index, then the HP_IDs string), written without a header row.

phenos.to_csv('annotated_phenotypes.tsv', sep='\t', header=False)

In [None]:
# Practice subset: only the first 10 patients, in the same tab-separated,
# header-less layout.

phenos.head(10).to_csv('ten_annotated_phenotypes.tsv', sep='\t', header=False)

## Generate patient pairs

We want to generate all possible patient pairs so that similarity comparisons can be made across all patients. 

In [None]:
import itertools

### For sample of 10

In [None]:
# First 10 rows of the per-patient table, used as a small practice sample.
ten_patients = phenos.head(10)

In [None]:
def pairs_from_dataframe(dataframe):
    """Return every unordered pair of index labels of `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose index labels identify the items to pair up. Only the
        index is read; the columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in `itertools.combinations` order. The first
        member of the pair is the index (named 0) and the second member is
        the single column 1. A frame with fewer than two rows yields an
        empty frame of the same layout instead of raising.
    """
    ids = dataframe.index.tolist()

    pairs = list(itertools.combinations(ids, 2))

    # Naming the columns explicitly keeps the frame two columns wide even
    # when there are no pairs at all (zero or one input row).
    pairs_df = pd.DataFrame(pairs, columns=[0, 1])

    # Move the first pair member into the index, leaving only column 1.
    return pairs_df.set_index(0)

In [None]:
# Every unordered pair of patient IDs within the ten-patient sample.
ten_pairs_df = pairs_from_dataframe(ten_patients)

In [None]:
# One tab-separated row per pair (first ID from the index, then the second ID), no header row.
ten_pairs_df.to_csv('ten_patient_pairs.tsv', sep='\t', header=False)

### For all patients

In [None]:
# Every unordered pair of patient IDs across the full table; for n patients
# this produces n * (n - 1) / 2 rows.
all_pairs_df = pairs_from_dataframe(phenos)

In [None]:
# (number of pairs, number of columns) of the full pairs table.
all_pairs_df.shape

In [None]:
# Preview the first few patient pairs.
all_pairs_df.head()

In [None]:
# One tab-separated row per pair (first ID from the index, then the second ID), no header row.
all_pairs_df.to_csv('all_patient_pairs.tsv', sep='\t', header=False)