# Preparing

In [30]:
import os
import re
from collections import Counter

# Extract X, Y from BIO files

In [33]:
def read_bio_file(file_path):
    """
    Extract X (tokens) and Y (labels) from a BIO file, splitting sentences by '.'

    Every non-blank line must hold exactly two whitespace-separated fields:
    the token followed by its label. Blank lines are ignored. A sentence is
    closed right after a token that is exactly '.', so a token that merely
    contains a period (e.g. '3.5') never ends a sentence.

    Args:
        file_path: Path of the BIO file; it is read as UTF-8.

    Returns:
        A ``(tokens, labels)`` tuple of two parallel lists, each holding one
        inner list per sentence. Tokens that follow the last '.' are returned
        as a final sentence.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
            The message names the file and the 1-based line number.
    """
    tokens = []
    labels = []
    current_tokens = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # split() with no arguments also discards surrounding whitespace,
            # so a blank line yields an empty list.
            fields = line.split()
            if not fields:
                continue

            if len(fields) != 2:
                # Fail with the location instead of an opaque unpacking error.
                raise ValueError(
                    f"{file_path}: line {line_number}: expected 'TOKEN LABEL' "
                    f"but found {len(fields)} field(s): {line.rstrip()!r}"
                )

            token, label = fields
            current_tokens.append(token)
            current_labels.append(label)

            # Only a stand-alone '.' token closes the sentence.
            if token == '.':
                tokens.append(current_tokens)
                labels.append(current_labels)
                current_tokens = []
                current_labels = []

    # Append the remaining tokens/labels if any
    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return tokens, labels

def read_multi_bio_files(directory_path):
    """
    Read all BIO files in a directory and extract X and Y, separating sentences by periods.

    Directory entries whose name does not end with '.bio' are skipped. Files
    are processed in sorted name order so that the position of each note in
    the returned lists is the same on every run and platform.

    Args:
        directory_path: Directory that contains the '.bio' files.

    Returns:
        A ``(all_X, all_Y)`` tuple of two parallel lists with one entry per
        processed file; each entry is the per-sentence list returned by
        ``read_bio_file`` for that file.
    """
    all_X = []  # List of all tokenized notes
    all_Y = []  # List of all labels

    # os.listdir() gives no ordering guarantee; sort for reproducible indices.
    for file_name in sorted(os.listdir(directory_path)):
        # Skip anything that is not a .bio file
        if not file_name.endswith('.bio'):
            continue

        file_path = os.path.join(directory_path, file_name)

        print(f"Processing file: {file_name}")
        file_X, file_Y = read_bio_file(file_path)
        all_X.append(file_X)
        all_Y.append(file_Y)

    return all_X, all_Y

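`X` and `Y` are three-dimensional (nested) lists with matching shapes:

- the first dimension indexes notes (one entry per `.bio` file)
- the second dimension indexes the sentences within a note
- the third dimension indexes the words of a sentence (tokens in `X`, their labels in `Y`)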

In [35]:
# Load every note stored under the BIO directory
directory_path = "bio/"
X, Y = read_multi_bio_files(directory_path)

# Summary: how many notes were read
print(f"Number of Notes: {len(X)}")

# Preview: the first two sentences of the first note and their labels
sample_tokens = X[0][:2]
print("Sample X (tokens):", sample_tokens)
sample_labels = Y[0][:2]
print("Sample Y (labels):", sample_labels)


Processing file: note_1.bio
Processing file: note_10.bio
Processing file: note_100.bio
Processing file: note_1000.bio
Processing file: note_10000.bio
Processing file: note_10001.bio
Processing file: note_10002.bio
Processing file: note_10003.bio
Processing file: note_10004.bio
Processing file: note_10005.bio
Processing file: note_10006.bio
Processing file: note_10007.bio
Processing file: note_10008.bio
Processing file: note_10009.bio
Processing file: note_1001.bio
Processing file: note_10010.bio
Processing file: note_10011.bio
Processing file: note_10012.bio
Processing file: note_10013.bio
Processing file: note_10014.bio
Processing file: note_10015.bio
Processing file: note_10016.bio
Processing file: note_10017.bio
Processing file: note_10018.bio
Processing file: note_10019.bio
Processing file: note_1002.bio
Processing file: note_10020.bio
Processing file: note_10021.bio
Processing file: note_10022.bio
Processing file: note_10023.bio
Processing file: note_10024.bio
Processing file: not

In [37]:
import pickle

# Persist the tokens (X) and then the labels (Y), each to its own pickle file
for pkl_name, payload in (("ner_x.pkl", X), ("ner_y.pkl", Y)):
    with open(pkl_name, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)

# Reloading later (only unpickle files you created yourself):
# with open("ner_y.pkl", "rb") as pkl_file:
#     loaded_Y = pickle.load(pkl_file)

## Counting unique label values

In [36]:
# Collapse notes -> sentences -> labels into a single flat list of labels
flattened_Y = []
for note_labels in Y:
    for sentence_labels in note_labels:
        flattened_Y.extend(sentence_labels)

# Tally how often each label occurs
unique_counts = Counter(flattened_Y)

# Show the tally
print(unique_counts)

Counter({'O': 3814913, 'B-SYMPTOM': 230094, 'I-SYMPTOM': 129286, 'B-BODY_MEASURE': 73736, 'I-BODY_MEASURE': 46278, 'B-UNIT': 44452, 'B-VISIT_MOTIVATION': 42588, 'B-VALUE': 42000, 'I-VISIT_MOTIVATION': 38868, 'B-AGE': 13494, 'B-GENDER': 13446})
