In [1]:
import pandas as pd
import numpy as np
import re
import csv
import random

## Open annotations file 

Open the 'annotations.txt' file and read in any lines starting with 'text_' as this indicates a line with phenotypes for the patients' text file of notes. Then read character 6 onwards to get patient ID at the start of each line (cutting off 'text_').

In [2]:
random.seed(5)

In [3]:
with open("annotations.txt","r") as f:
    annotations = []
    for ln in f:
        if ln.startswith("text_"):
            annotations.append(ln[5:])


## Cleaning 

In [4]:
pheno = [] # will fill nested list with the information from each line in the file

for ln in range(len(annotations)):
        
    line = re.split(r'\t+', annotations[ln]) # split lines by \t delimiter
    
    line_strip = [] 
        
    for i in line:         
        # strip away unwanted characters that got retained when reading in the file         
        strip1 = i.strip('.txt')
        strip2 = strip1.strip('http://purl.obolibrary.org/obo/')
        strip = strip2.strip('\n')        
        line_strip.append(strip) 
        
    pheno.append(line_strip) # each list is one line from the file
    

for i in pheno:
    
    if len(i) != 8:
        
        i.insert(5, 'NA') 
        # ensure all lines have the same number of entries; position 6
        # is variably filled so if not, add in 'NA'
        

## Formatting 

In [5]:
annotations_df = pd.DataFrame(pheno) # create dataframe from pheno 

phenotypes = annotations_df.iloc[:,0:2] # get first 3 columns (patient ID and Human Phenotype Ontology (HP) ID)

phenotypes = phenotypes.set_axis(['Patient_ID', 'HP_ID'], axis=1, inplace=False) # set column names for use later


In [6]:
phenotypes.head()

Unnamed: 0,Patient_ID,HP_ID
0,183903,HP_0100806
1,183903,HP_0012531
2,183903,HP_0012420
3,183903,HP_0001945
4,183903,HP_0002098


In [7]:
for i in range(len(phenotypes.index)):
    
    replace = phenotypes.loc[i,'HP_ID'].replace('_', ':') # format to HP:0100806 ...
    
    phenotypes.loc[i,'HP_ID'] = replace

phenotypes = phenotypes.set_index(phenotypes.loc[:,'Patient_ID'], drop=True)

# iterate through the unique patient IDs in the index of phenotypes dataframe 
# saving each set of HP IDs for that patient and formatting to a ; delimited list in new dataframe 

phenos = pd.DataFrame(columns= ['Patient_ID', 'HP_IDs']) # new dataframe for cleaned IDs 

for i in set(phenotypes.index.tolist()):
    
    l = phenotypes.loc[i, 'HP_ID']
    
    if isinstance(l, str):
        s = str(l)
    else:
        s = str(l.tolist())
    
    s = s.replace(',', ';')
    s = s.replace('[', '')
    s = s.replace(']', '')
    s = s.replace("'", '')
    s = s.replace(' ', '')

    phenos.append({'Patient_ID': i}, ignore_index=True)
    phenos.loc[i, 'Patient_ID'] = i
    phenos.loc[i, 'HP_IDs'] = s
    
    

In [8]:
phenos.shape

# at this point there is only 985 patients in the samples, not 1000
# probably due to file reading at the start

(985, 2)

In [9]:
phenos.head()

Unnamed: 0,Patient_ID,HP_IDs
168392,168392,HP:0002090;HP:0000822;HP:0001041;HP:0000969;HP...
129887,129887,HP:0000819;HP:0002617;HP:0025143;HP:0002018;HP...
193341,193341,HP:0012115;HP:0030245;HP:0100806;HP:0001433;HP...
150333,150333,HP:0002027;HP:0002018;HP:0002013;HP:0001394;HP...
187245,187245,HP:0002094;HP:0012531;HP:0002094;HP:0001649;HP...


In [10]:
# tidy up dataframe; set index as patient IDs and drop this column

phenos.set_index(phenos.loc[:,'Patient_ID'], drop=True) 
phenos = phenos.drop('Patient_ID', axis=1)

### Save as tab delimited file

This should be in the right format for semantic measures library https://www.semantic-measures-library.org/sml/index.php?q=toolkit 

In [11]:
# all patients in sample

phenos.to_csv('annotated_phenotypes.tsv', sep='\t', header=False)

In [12]:
# first 10 for practice 

phenos.iloc[0:10,:].to_csv('ten_annotated_phenotypes.tsv', sep='\t', header=False)

## Generate patient pairs

In [13]:
import itertools

### For sample of 10

In [14]:
ten_patients = phenos.iloc[0:10,:]

In [15]:
def pairs_from_dataframe(dataframe):
    
    ids = dataframe.index.tolist()
    
    pairs = list(itertools.combinations(ids, 2))
    
    pairs_df = pd.DataFrame.from_records(pairs)
    
    pairs_df = pairs_df.set_index(pairs_df.iloc[:,0], drop=True)
    
    pairs_df = pairs_df.drop(0, axis=1)
    
    return pairs_df

In [16]:
ten_pairs_df = pairs_from_dataframe(ten_patients)

In [17]:
ten_pairs_df.to_csv('ten_pairs.tsv', sep='\t', header=False)

### For all patients

In [18]:
all_pairs_df = pairs_from_dataframe(phenos)

In [19]:
all_pairs_df.shape

(484620, 1)

In [20]:
all_pairs_df.to_csv('all_pairs.tsv', sep='\t', header=False)