################################################################################################################
  
  This notebook is used to convert the file format of the OHSUMED dataset to a JSONL format

################################################################################################################


In [1]:
import os
import sys
import json 

sys.path.append("./hugging/lib/python3.7/site-packages")

In [3]:
import pandas as pd
import numpy as np

In [4]:
trainpath = "./ohsu-trec/trec9-train/ohsumed.87"
testpath = "./ohsu-trec/trec9-test/ohsumed.88-91"

In [5]:
train_id = list()
with open(trainpath) as f:
    for line in f.readlines():
        line = line.strip()
        if line.startswith(".I"):
            line = line.replace(".I", "").strip()
            train_id.append(line)
print(f"Number of lines in train: {len(train_id)}")

test_id = list()
with open(testpath) as f:
    for line in f.readlines():
        line = line.strip()
        if line.startswith(".I"):
            line = line.replace(".I", "").strip()
            test_id.append(line)
print(f"Number of lines in test: {len(test_id)}")

Number of lines in train: 54710
Number of lines in test: 293856


In [6]:
counter =0
with open(testpath) as f:
    for line in f.readlines():
        line = line.strip()
        print(line)
        if line.startswith(".I"):
            counter+=1
            
            if counter > 20:
                break
            
print(f"Number of lines in test: {len(test_id)}")

.I 54711
.U
88000001
.S
Alcohol Alcohol 8801; 22(2):103-12
.M
Acetaldehyde/*ME; Buffers; Catalysis; HEPES/PD; Nuclear Magnetic Resonance; Phosphates/*PD; Protein Binding; Ribonuclease, Pancreatic/AI/*ME; Support, U.S. Gov't, Non-P.H.S.; Support, U.S. Gov't, P.H.S..
.T
The binding of acetaldehyde to the active site of ribonuclease: alterations in catalytic activity and effects of phosphate.
.P
JOURNAL ARTICLE.
.W
Ribonuclease A was reacted with [1-13C,1,2-14C]acetaldehyde and sodium cyanoborohydride in the presence or absence of 0.2 M phosphate. After several hours of incubation at 4 degrees C (pH 7.4) stable acetaldehyde-RNase adducts were formed, and the extent of their formation was similar regardless of the presence of phosphate. Although the total amount of covalent binding was comparable in the absence or presence of phosphate, this active site ligand prevented the inhibition of enzymatic activity seen in its absence. This protective action of phosphate diminished with progressive

Number of lines in test: 293856


 Run a loop through each of the train & test files & extract json

 The original field definiations

| Column market | Defination | key name | Notes | 

|----|----|----|----|----|

| .I | sequential identifier | seqId | important note: documents should be processed in this order |

| .U | MEDLINE identifier (UI) | medline_ui | (<DOCNO> used for relevance judgements) |
    
| .M | Human-assigned MeSH terms (MH) | mesh_terms
    
| .T | Title (TI) | title
    
| .P | Publication type (PT) | publication_type
    
| .W | Abstract (AB) | abstract
    
| .A | Author (AU) | author
    
| .S |Source (SO) | source

In [7]:
column_map = {
    ".I": "seq_id", ".U": "medline_ui", ".M": "mesh_terms", ".T": "title", 
    ".P": "publication_type", ".W": "abstract", ".A": "author",
    ".S": "source"
}

In [8]:
def ohsumed_dict():
    # Define a dict
    ohs_dict = {"seq_id": -1,
    "medline_ui": -1,
    "mesh_terms": "",
    "title": "",
    "publication_type": "",
    "abstract": "",
    "author": "",
    "source": ""}
    
    return ohs_dict

In [9]:
%%time
abstract = list()
tag = ""
print_tag = False

for filepath in [trainpath, testpath]:

    with open(filepath) as f:
        for line in f.readlines():

            line = line.strip()

            if tag and tag != ".I" and not line.startswith("."):
                key = column_map[tag]

                ohs_dict[key] = line
                if print_tag == True:
                    print(key, line)

            if line.startswith(".I"):
                tag = ".I"
                try:
                    if ohs_dict:
                        abstract.append(ohs_dict)
                except:
                    print("first run")
                    

                ohs_dict = ohsumed_dict()
                line = line.replace(".I", "").strip()
                ohs_dict['seq_id'] = line

            elif ".U" in line:
                tag = ".U"
            elif ".M" in line:
                tag = ".M"
            elif ".T" in line:
                tag = ".T"
            elif ".P" in line:
                tag = ".P"
            elif ".W" in line:
                tag = ".W"
            elif ".A" in line:
                tag = ".A"
            elif ".S" in line:
                tag = ".S"

        # Collect the last abstract
        abstract.append(ohs_dict) 
        print(f"Length of {filepath}: {len(abstract)}" )
        
        if 'train' in filepath:
            filename = "./ohsumed-preprocessed/data/trec9-train-ohsumed-87.jsonl"
        else:
            filename = "./ohsumed-preprocessed/data/trec9-test-ohsumed-88-91.jsonl"
        
        result = pd.DataFrame(abstract).to_json(orient="records")
        parsed = json.loads(result)
        with open(filename, 'w') as outfile:
            print(f"Writing to file {filename}...")
            json.dump(parsed, outfile)

first run
Length of ./ohsumed-preprocessed/ohsu-trec/trec9-train/ohsumed.87: 54710
Writing to file ./ohsumed-preprocessed/data/trec9-train-ohsumed-87.jsonl...
Length of ./ohsumed-preprocessed/ohsu-trec/trec9-test/ohsumed.88-91: 348567
Writing to file ./ohsumed-preprocessed/data/trec9-test-ohsumed-88-91.jsonl...
CPU times: user 20.2 s, sys: 2.11 s, total: 22.3 s
Wall time: 24.7 s


In [11]:
#break here

### Below cells are run for quality checks & trial runs

In [None]:
len(train)

In [None]:
assert len(train) == 54710 #52798 
assert len(test) == 283335 #293858

In [None]:
test_df = pd.DataFrame(train)
print(test_df.shape)
test_df.isnull().sum()

In [None]:
test_df_ids = list(test_df['seq_id'])

In [None]:
test_df_ids = list(set(test_df_ids))
len(test_df_ids)

In [None]:
test_id = list(set(test_id))
len(test_id)

In [None]:
diff = set(test_df_ids).difference(test_id) 
assert len(diff) == len(test_id) - len(test_df_ids)

In [None]:
diff

In [None]:
parsed = json.loads(result)

with open('train.jsonl', 'w') as outfile:
    json.dump(parsed, outfile)