<a href="https://colab.research.google.com/github/SmartNurse/Year_Dream/blob/main/BaseLine_(2023_11_21).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 라이브러리 불러오기

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import nltk
import pandas as pd
import re




# 데이터 불러오기

In [None]:
# Open the ncp JSON file.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform default can mis-decode non-ASCII record text on other systems.
ncp_filename = '/content/drive/MyDrive/Colab Notebooks/data/Smart_Nurse2/nursingrecord_ncp.json'
with open(ncp_filename, 'r', encoding='utf-8') as file:
    ncp_data = json.load(file)

# Open the aws JSON file (same format, same explicit encoding).
aws_filename = '/content/drive/MyDrive/Colab Notebooks/data/Smart_Nurse2/nursingrecord_aws.json'
with open(aws_filename, 'r', encoding='utf-8') as file:
    aws_data = json.load(file)

# Patient information table.
info_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/Smart_Nurse2/patientinfo.csv")

In [None]:
def parse_json(json_str):
    """Decode a JSON string into a Python object, tolerating bad input.

    Args:
        json_str: JSON text to decode. A value that is already a ``dict`` is
            returned unchanged, so applying this function a second time to
            the same records (e.g. re-running the cell) is harmless.

    Returns:
        The decoded object, or an empty dict when ``json_str`` is not valid
        JSON or is not a str/bytes/bytearray at all (for example ``None``).
    """
    if isinstance(json_str, dict):
        # Already parsed; json.loads would raise TypeError on a dict.
        return json_str
    try:
        return json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # Malformed JSON or a non-string value: fall back to an empty dict.
        return {}

# Parse the 'content' field of every record, in both datasets
for dataset in (ncp_data, aws_data):
    for item in dataset:
        item['content'] = parse_json(item['content'])


# Build one DataFrame per data source
ncp_df, aws_df = pd.DataFrame(ncp_data), pd.DataFrame(aws_data)

# 전처리

In [None]:
# Content fields kept for each record type, in output column order.
NANDA_columns = [
    'domain',
    'class',
    'diagnosis',
    'collectingData',
    'goal',
    'plan',
    'interventions',
    'evaluation',
]
SOAPIE_columns = [
    'subjective',
    'objective',
    'assessment',
    'planning',
    'interventions',
    'evaluation',
]
FOCUS_DAR_columns = [
    'focus',
    'data',
    'action',
    'response',
]
NARRATIVE_NOTES_columns = [
    'narrativeNote',
]
NURSING_columns = [
    'assessment',
    'diagnosisRelate',
    'diagnosis',
    'goal',
    'plan',
    'interventions',
    'evaluation',
]

def pre_processing(ncp_df, aws_df, record_type, columns):
    """Collect the content fields of one record type from both sources.

    Rows of ``ncp_df`` and ``aws_df`` whose ``record_type`` code matches
    ``record_type`` are selected, their ``content`` dictionaries are
    flattened with ``pd.json_normalize``, restricted to ``columns``,
    stacked, and exact duplicate rows are dropped.

    Args:
        ncp_df: DataFrame with ``record_type`` and ``content`` columns.
        aws_df: DataFrame with the same two columns.
        record_type: One of ``'NANDA'``, ``'SOAPIE'``, ``'FOCUS_DAR'``,
            ``'NARRATIVE_NOTES'`` or ``'NURSING'``.
        columns: Content field names to keep, in output order.

    Returns:
        DataFrame with exactly ``columns``. A field that a source does not
        provide is filled with missing values instead of raising; when
        neither source has records of the requested type the result is an
        empty frame that still carries ``columns``.

    Raises:
        KeyError: If ``record_type`` is not one of the known names.
    """
    # Numeric code stored in the 'record_type' column for each record type
    record_type_dict = {'NANDA': 0, 'SOAPIE': 1, 'FOCUS_DAR': 2, 'NARRATIVE_NOTES': 3, 'NURSING': 4}
    type_code = record_type_dict[record_type]

    frames = []
    for source_df in (ncp_df, aws_df):
        selected = source_df[source_df['record_type'] == type_code]
        if selected.empty:
            # A source without this record type contributes nothing;
            # indexing its empty normalised frame by `columns` would fail.
            continue
        # Expand the 'content' dictionaries into one column per field
        content_df = pd.json_normalize(selected['content'])
        # reindex keeps the requested order and tolerates absent fields
        frames.append(content_df.reindex(columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Stack both sources and drop rows that are exact duplicates
    df = pd.concat(frames)
    df = df.drop_duplicates()
    return df

# Build the pre-processed DataFrame for each record type
nanda_df = pre_processing(
    ncp_df, aws_df, record_type='NANDA', columns=NANDA_columns)
soapie_df = pre_processing(
    ncp_df, aws_df, record_type='SOAPIE', columns=SOAPIE_columns)
focus_dar_df = pre_processing(
    ncp_df, aws_df, record_type='FOCUS_DAR', columns=FOCUS_DAR_columns)
narrative_notes_df = pre_processing(
    ncp_df, aws_df, record_type='NARRATIVE_NOTES', columns=NARRATIVE_NOTES_columns)
nursing_df = pre_processing(
    ncp_df, aws_df, record_type='NURSING', columns=NURSING_columns)

## 전처리 함수 모음

In [None]:
def is_valid_row(text):
    """Report whether ``text`` passes the filter.

    ``text`` is rejected (``False``) when the whole string consists solely
    of digits, solely of ASCII letters, solely of standalone Hangul
    consonant/vowel jamo, or solely of characters that are neither word
    characters nor whitespace. Anything else, including the empty string,
    is accepted (``True``).
    """
    rejected_patterns = (
        r'\d+',
        r'[a-zA-Z]+',
        r'[ㄱ-ㅎㅏ-ㅣ]+',
        r'[^\w\s]+',
    )
    return not any(re.fullmatch(pattern, text) for pattern in rejected_patterns)

def remove_invalid_words(text):
    """Drop every whitespace-separated word that ``is_valid_row`` rejects.

    ``text`` is converted with ``str()`` first. The surviving words are
    re-joined with single spaces; ``pd.NA`` (rather than an empty string)
    is returned when no word survives.
    """
    kept_words = list(filter(is_valid_row, str(text).split()))
    if not kept_words:
        return pd.NA
    return ' '.join(kept_words)

def remove_short_words(text, min_length=4):
    """Drop whitespace-separated words shorter than ``min_length``.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            A missing value (``None``, ``NaN``, ``pd.NA``) is reported as
            missing instead of being stringified into a literal word such
            as ``'None'`` or ``'<NA>'``.
        min_length: Minimum number of characters a word needs to be kept.
            The default of 4 keeps words longer than three characters.

    Returns:
        The kept words joined by single spaces, or ``pd.NA`` when the input
        is missing or no word is long enough.
    """
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return pd.NA
    # Convert to a string and split into words, keeping only long enough ones
    valid_words = [word for word in str(text).split() if len(word) >= min_length]
    # Re-join the kept words; signal "nothing left" with pd.NA
    return ' '.join(valid_words) if valid_words else pd.NA

def unify_terms(sentence):
    """Rewrite spelling variants of four section labels into canonical tags.

    Variants of the "subjective data", "objective data", "short-term goal"
    and "long-term goal" labels are replaced by '[주관적자료]', '[객관적자료]',
    '[단기목표]' and '[장기목표]' respectively.

    The patterns of each list are applied one after another with ``re.sub``,
    every substitution working on the output of the previous one. Later
    patterns can therefore match inside a tag that an earlier pattern has
    just inserted (for example the bare ``단기`` pattern matches inside
    '[단기목표]'), which produces nested or duplicated tags. The final group
    of substitutions collapses those artifacts, so the order of the lists
    and of the cleanup step matters.

    Args:
        sentence: Text to normalise; it is passed directly to ``re.sub``.

    Returns:
        The text after all substitutions.
    """
    # Variants of the "subjective (data)" label
    S_patterns_to_unify = [
        r'<주관적(?![\]자료])',
        r'\[주관적\]',
        r'주관적:',
        r'-주관적:',
        r'주관적자료:',
        r'\[주관적자료\]',
        r'주관적자료(?!\])',
        r'\<주관적자료\>',
        r'\[주관적',
        r'\*주관적',
        r'\(주관적\)',
        r'\(주관적',
        r'주관적-보호자',
        r'\(\[주관적자료\]\)',
        r'-\[주관적자료\]',
        r'\*\[주관적자료\]',
    ]

    # Variants of the "objective (data)" label (mirrors the list above)
    O_patterns_to_unify = [
        r'<객관적(?![\]자료])',
        r'\[객관적\]',
        r'객관적:',
        r'-객관적:',
        r'객관적자료:',
        r'\[객관적자료\]',
        r'객관적자료(?!\])',
        r'\<객관적자료\>',
        r'\[객관적',
        r'\*객관적',
        r'\(객관적\)',
        r'\(객관적',
        r'객관적-보호자',
        r'\(\[객관적자료\]\)',
        r'-\[객관적자료\]',
        r'\*\[객관적자료\]',
    ]
    # Variants of the "short-term (goal)" label; note the bare r'단기' entry,
    # which matches that substring anywhere in the text
    ST_patterns_to_unitfy = [
        r'단기목표:',
        r'\<단기목표\>',
        r'단기목표',
        r'-단기목표',
        r'단기',
        r'단기:',
        r'\[단기\]',
        r'\<단기\>',
        r'\*단기목표:',
        r'\*\[단기목표\]',
        r'\*단기:',
        r'\*\<단기목표\>',
        r'\*단기목표',
        r'\[단기목표\]\)',
        r'\(\[단기목표\]\)'
    ]
    # Variants of the "long-term (goal)" label (mirrors the list above)
    LT_patterns_to_unitfy = [
        r'장기목표:',
        r'\<장기목표\>',
        r'장기목표',
        r'-장기목표',
        r'장기',
        r'장기:',
        r'\[장기\]',
        r'\<장기\>',
        r'\*장기목표:',
        r'\*\[장기목표\]',
        r'\*장기:',
        r'\*\<장기목표\>',
        r'\*장기목표',
        r'\[장기목표\]\)',
        r'\(\[장기목표\]\)'

    ]


    # Apply each list in order; every substitution sees the previous result
    for pattern in S_patterns_to_unify:
      sentence = re.sub(pattern, '[주관적자료]', sentence)

    for pattern in O_patterns_to_unify:
      sentence = re.sub(pattern, '[객관적자료]', sentence)

    for pattern in ST_patterns_to_unitfy:
      sentence = re.sub(pattern, '[단기목표]', sentence)

    for pattern in LT_patterns_to_unitfy:
      sentence = re.sub(pattern, '[장기목표]', sentence)


    # Collapse artifacts such as '[주관적자료]자료]' back to '[주관적자료]',
    # and the nested-bracket / trailing-colon forms of the goal tags
    sentence = re.sub(r'\[주관적자료\](자료\])+', '[주관적자료]', sentence)
    sentence = re.sub(r'\[객관적자료\](자료\])+', '[객관적자료]', sentence)
    sentence = re.sub(r'\[\[\[단기목표\]목표\]\]', '[단기목표]', sentence)
    sentence = re.sub(r'\[\[단기목표\]목표\]', '[단기목표]', sentence)
    sentence = re.sub(r'\[단기목표\]:', '[단기목표]', sentence)
    sentence = re.sub(r'\[\[\[장기목표\]목표\]\]', '[장기목표]', sentence)
    sentence = re.sub(r'\[\[장기목표\]목표\]', '[장기목표]', sentence)
    sentence = re.sub(r'\[장기목표\]:', '[장기목표]', sentence)


    return sentence

def check_row_for_duplicates(row):
    """Return True when at least two cells of ``row`` hold the same value."""
    cells = row.tolist()
    distinct_cells = set(cells)
    # A set can only be shorter than the list when some value repeats
    return len(distinct_cells) < len(cells)

def function_apply(df, filter_column):
    """Run the text-cleaning steps over ``df`` and return the cleaned frame.

    Steps, in order:
      1. ``remove_invalid_words`` on every column.
      2. ``remove_short_words`` on the columns listed in ``filter_column``.
      3. Drop rows for which ``check_row_for_duplicates`` is true.
      4. Drop rows holding a missing value or the text ``'<NA>'``.
      5. Reset the index and apply ``unify_terms`` to every column.

    Args:
        df: DataFrame to clean. It is not modified; a copy is processed.
        filter_column: Names of the columns that also get the short-word
            filter.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # Work on a private copy: the column assignments below would otherwise
    # write into the caller's frame (or into a slice of another frame).
    df = df.copy()

    if df.empty:
        # Row-wise apply on an empty frame does not yield a boolean mask.
        return df.reset_index(drop=True)

    # Applied to every column
    for column in df.columns:
        df[column] = df[column].apply(remove_invalid_words)

    # Applied only to the filtered columns
    for column in filter_column:
        df[column] = df[column].apply(remove_short_words)

    # Applied row by row
    df = df[~df.apply(check_row_for_duplicates, axis=1)]

    # Drop rows with a missing cell. Missing values are tested directly as
    # well as through their '<NA>' text form, so the filter does not depend
    # on how astype(str) renders pd.NA.
    has_missing = df.isna().any(axis=1)
    has_na_text = df.apply(
        lambda x: x.astype(str).str.contains('<NA>', na=False)
    ).any(axis=1)
    df = df[~(has_missing | has_na_text)]
    df = df.reset_index(drop=True)

    # Normalise the label variants in every column
    for column in df.columns:
        df[column] = df[column].apply(unify_terms)

    return df

In [None]:
# Columns of each record type that additionally get the short-word filter
nanda_filter_column = [
    'collectingData', 'goal', 'plan', 'interventions', 'evaluation',
]
soapie_filter_column = [
    'subjective', 'objective', 'assessment', 'planning', 'interventions', 'evaluation',
]
focus_dar_filter_column = [
    'focus', 'data', 'action', 'response',
]
narrative_notes_filter_column = [
    'narrativeNote',
]
nursing_filter_column = [
    'assessment', 'diagnosisRelate', 'diagnosis', 'goal', 'plan', 'interventions', 'evaluation',
]

# Clean each record-type frame with its own filter-column list
nanda_df = function_apply(df=nanda_df, filter_column=nanda_filter_column)
soapie_df = function_apply(df=soapie_df, filter_column=soapie_filter_column)
focus_dar_df = function_apply(df=focus_dar_df, filter_column=focus_dar_filter_column)
narrative_notes_df = function_apply(df=narrative_notes_df, filter_column=narrative_notes_filter_column)
nursing_df = function_apply(df=nursing_df, filter_column=nursing_filter_column)


In [None]:
def _nanda_input(x):
    """Render one NANDA row as 'field: value' lines separated by a comma and newline."""
    return (
        f"domain: {x['domain']},\n"
        f"class: {x['class']},\n"
        f"diagnosis: {x['diagnosis']},\n"
        f"collectingData: {x['collectingData']},\n"
        f"goal: {x['goal']},\n"
        f"plan: {x['plan']},\n"
        f"interventions: {x['interventions']},\n"
        f"evaluation: {x['evaluation']}"  # last field: no trailing separator
    )


# One text block per row, stored in the new 'input' column
nanda_df['input'] = nanda_df.apply(_nanda_input, axis=1)

In [None]:
def _soapie_input(x):
    """Render one SOAPIE row as 'field: value' lines, each ended by a comma and newline."""
    return (
        f"subjective: {x['subjective']},\n"
        f"objective: {x['objective']},\n"
        f"assessment: {x['assessment']},\n"
        f"planning: {x['planning']},\n"
        f"interventions: {x['interventions']},\n"
        f"evaluation: {x['evaluation']},\n"
    )


# One text block per row, stored in the new 'input' column
soapie_df['input'] = soapie_df.apply(_soapie_input, axis=1)

In [None]:
def _focus_dar_input(x):
    """Render one FOCUS-DAR row as 'field: value' lines, each ended by a comma and newline."""
    return (
        f"focus: {x['focus']},\n"
        f"data: {x['data']},\n"
        f"action: {x['action']},\n"
        f"response: {x['response']},\n"
    )


# One text block per row, stored in the new 'input' column
focus_dar_df['input'] = focus_dar_df.apply(_focus_dar_input, axis=1)

In [None]:
def _nursing_input(x):
    """Render one NURSING row as 'field: value' lines, each ended by a comma and newline."""
    return (
        f"assessment: {x['assessment']},\n"
        f"diagnosisRelate: {x['diagnosisRelate']},\n"
        f"diagnosis: {x['diagnosis']},\n"
        f"goal: {x['goal']},\n"
        f"plan: {x['plan']},\n"
        f"interventions: {x['interventions']},\n"
        f"evaluation: {x['evaluation']},\n"
    )


# One text block per row, stored in the new 'input' column
nursing_df['input'] = nursing_df.apply(_nursing_input, axis=1)