In [3]:
import os
import glob
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Fetch the NLTK stop-word corpus; this has to run before the corpus is
# loaded on the next line.
nltk.download('stopwords')
# Keep the English stop words in a set so that preprocess_text can do fast
# membership checks while filtering words.
stop_words = set(stopwords.words('english'))

def read_txt_file(file_path):
    """Return the full contents of the text file at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding (which is not UTF-8 on many
    Windows setups).

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_ann_file(file_path):
    """Return the lines of the annotation file at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        A list with one string per line; line endings are kept.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

def preprocess_text(text):
    """Normalize ``text`` for tokenization.

    Steps, in order: lower-case the text, delete every run of digits, delete
    every character that is neither a word character nor whitespace, then
    drop the words found in the module-level ``stop_words`` collection.

    Args:
        text: The string to normalize.

    Returns:
        The surviving words joined by single spaces.
    """
    lowered = text.lower()
    # Digits go first, then anything that is not a word character or whitespace.
    digits_removed = re.sub(r'\d+', '', lowered)
    punctuation_removed = re.sub(r'[^\w\s]', '', digits_removed)
    kept_words = (
        candidate
        for candidate in punctuation_removed.split()
        if candidate not in stop_words
    )
    return ' '.join(kept_words)

def get_labels_from_ann(ann_lines, text):
    """Build one BIO label per whitespace-delimited token of ``text``.

    Each annotation line is split on whitespace and read as
    ``<id> <tag> <start> <end> ...``, where ``start``/``end`` are character
    offsets into ``text`` (end exclusive). A discontinuous span written as
    ``<start> <end>;<start> <end>`` is also accepted. This layout is assumed
    to be brat standoff -- confirm against the data set documentation.

    Character offsets are translated to token indices: every token that
    overlaps an annotated span is labelled, the first one with ``B-<tag>``
    and the following ones with ``I-<tag>``. Lines that are too short, carry
    a tag outside the supported set, or have non-numeric offsets (for
    example relation or attribute lines) are skipped instead of raising.
    When annotations overlap, the one that appears later in ``ann_lines``
    wins for the shared tokens.

    Args:
        ann_lines: Iterable of annotation lines.
        text: The text the character offsets refer to.

    Returns:
        A list of labels with the same length and order as ``text.split()``.
    """
    # Local import keeps this function self-contained; bisect is stdlib.
    from bisect import bisect_left, bisect_right

    allowed_tags = {
        'Drug', 'Strength', 'Form', 'Dosage', 'Duration',
        'Frequency', 'Route', 'ADE', 'Reason',
    }

    # Character span [start, end) of every token, in text.split() order.
    token_starts = []
    token_ends = []
    for match in re.finditer(r'\S+', text):
        token_starts.append(match.start())
        token_ends.append(match.end())
    labels = ['O'] * len(token_starts)

    for line in ann_lines:
        parts = line.strip().split()
        if len(parts) < 4:
            continue  # blank or truncated line: no tag/offset triple
        tag = parts[1]
        if tag not in allowed_tags:
            continue

        # Parse "<start> <end>" possibly continued by ";<start> <end>" pieces.
        try:
            fragments = []
            start = int(parts[2])
            for field in parts[3:]:
                end_str, more, next_start = field.partition(';')
                fragments.append((start, int(end_str)))
                if not more:
                    break
                start = int(next_start)
        except ValueError:
            continue  # offsets are not integers: not a usable span line

        covered = []
        for start, end in fragments:
            if end <= start:
                continue
            # Tokens are sorted and disjoint, so the overlapping ones form a
            # contiguous index range: first token ending after `start` up to
            # (excluding) the first token starting at or after `end`.
            first = bisect_right(token_ends, start)
            stop = bisect_left(token_starts, end)
            covered.extend(range(first, stop))

        for position, token_index in enumerate(sorted(set(covered))):
            labels[token_index] = f"{'B' if position == 0 else 'I'}-{tag}"
    return labels

def create_dataset(data_dir):
    """Build token/label examples from the ``.txt``/``.ann`` pairs in a folder.

    Every ``*.txt`` file directly inside ``data_dir`` that has a sibling
    ``.ann`` file with the same base name yields one example; text files
    without an annotation file are skipped.

    Labels are computed on the raw file text and only afterwards are tokens
    normalized with ``preprocess_text``. Normalizing first would remove
    characters and words, so the character offsets stored in the annotation
    file (presumed to index the raw text -- confirm against the data set
    documentation) would no longer line up with the text being labelled.
    Tokens that normalize to nothing (stop words, pure digits/punctuation)
    are dropped together with their label, which yields the same token list
    as normalizing the whole text at once.

    Args:
        data_dir: Directory containing the ``.txt`` and ``.ann`` files.

    Returns:
        A list of dicts, each with ``'tokens'`` (list of normalized words)
        and ``'ner_tags'`` (list of labels of the same length).
    """
    dataset = []
    # Sorted so the example order does not depend on directory listing order.
    for txt_file in sorted(glob.glob(os.path.join(data_dir, '*.txt'))):
        ann_file = os.path.splitext(txt_file)[0] + '.ann'
        if not os.path.exists(ann_file):
            continue

        text = read_txt_file(txt_file)
        ann_lines = read_ann_file(ann_file)

        # get_labels_from_ann returns one label per whitespace token of the
        # text it is given, so raw tokens and raw labels pair up one-to-one.
        raw_tokens = text.split()
        raw_labels = get_labels_from_ann(ann_lines, text)

        tokens = []
        labels = []
        # Tag of an entity whose opening (B-) token was dropped and whose
        # first surviving token has not been seen yet.
        pending_tag = None
        for raw_token, raw_label in zip(raw_tokens, raw_labels):
            prefix, _, tag = raw_label.partition('-')
            if prefix == 'I' and tag == pending_tag:
                # The entity lost its first token; its next token opens it.
                label = f"B-{tag}"
            else:
                label = raw_label
                pending_tag = None

            cleaned = preprocess_text(raw_token)
            if cleaned:
                tokens.append(cleaned)
                labels.append(label)
                pending_tag = None
            elif label.startswith('B-'):
                pending_tag = label[2:]

        dataset.append({'tokens': tokens, 'ner_tags': labels})

    return dataset

# Folder holding the .txt/.ann pairs to load.
data_dir = 'n2c2/n2c2/part2'
dataset = create_dataset(data_dir)

# Split the dataset into training (80%) and validation (20%) sets.
# A fixed random_state makes the shuffle, and therefore the split,
# reproducible across runs.
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 