In [48]:
import json
import re
import pandas as pd
import numpy as np

# Flatten and clean <u>expert.json</u>, then merge abnormalities per xray

In [49]:
def preprocess_json(path):
    """
    read json file path, preprocess, and return it as DataFrame

    every string value that is exactly "None" or "none" is replaced by a real null
    (None) before the data is flattened with pd.json_normalize

    :param path: str
    :return df: DataFrame
    """

    def replace_none_to_null(node):
        """
        recursively replace none texts ("None" / "none") with null (None)

        works on the decoded json structure rather than on its serialized text, so
        dictionary keys and strings that merely contain the word are left untouched

        :param node: decoded json value (dict, list or scalar)
        :return node: same structure with the none texts replaced by None
        """
        if isinstance(node, dict):
            # only values are rewritten; a key named "None" must stay a string
            return {key: replace_none_to_null(value) for key, value in node.items()}
        if isinstance(node, list):
            return [replace_none_to_null(item) for item in node]
        if isinstance(node, str) and node in ('None', 'none'):
            return None
        return node

    with open(path) as f:
        data = json.load(f)

    # replace none texts with null, then flatten nested records into columns
    json_data = replace_none_to_null(data)
    df = pd.json_normalize(json_data)

    return df


def merge_abnormalities_per_xray(df_):
    """
    group multiple rows of abnormality w/ the same id and merge them into one row

    the input is expected to have the columns '.External ID', '.Description',
    '.polygons', '.value', 'value' and 'answer.value'. ids appear in the result in
    order of first appearance.

    note on the level naming (taken from the column selection below): the output
    'level_one' is the object-level '.value', collected once per row whose 'value' is
    'level_four'; the classification named 'level_one' feeds output 'level_two',
    'level_two' feeds 'level_three', 'level_three' feeds 'level_four' and
    'level_four' feeds 'level_five'.

    :param df_: DataFrame
    :return merged_data: dict
    """

    def answers_of(group, level):
        """return the 'answer.value' entries of the rows whose 'value' equals level"""
        return group[group['value'] == level]['answer.value'].to_list()

    def unique_polygons(group):
        """
        return the distinct '.polygons' entries of group in order of first appearance

        entries are compared through their string form (lists are not hashable) but
        the original objects are returned, so nothing has to be parsed back w/ eval
        """
        distinct = {}
        for polygon in group['.polygons']:
            distinct.setdefault(str(polygon), polygon)
        return list(distinct.values())

    merged_data = {'External ID': [],
                   'Description': [],
                   'polygons': [],
                   'level_one': [],
                   'level_two': [],
                   'level_three': [],
                   'level_four': [],
                   'level_five': [],
                   }

    # sort=False keeps the ids in order of first appearance; one pass over the frame
    # instead of re-filtering the whole frame for every id
    for _, current_df in df_.groupby('.External ID', sort=False):
        first_item = current_df.iloc[0]

        merged_data['External ID'].append(first_item['.External ID'])
        merged_data['Description'].append(first_item['.Description'])
        merged_data['polygons'].append(unique_polygons(current_df))
        # one '.value' per 'level_four' row, see the level naming note in the docstring
        merged_data['level_one'].append(
            current_df[current_df['value'] == 'level_four']['.value'].to_list())
        merged_data['level_two'].append(answers_of(current_df, 'level_one'))
        merged_data['level_three'].append(answers_of(current_df, 'level_two'))
        merged_data['level_four'].append(answers_of(current_df, 'level_three'))
        merged_data['level_five'].append(answers_of(current_df, 'level_four'))

    return merged_data


def give_empty_column(x, col):
    """
    make sure dictionary 'x' has the key 'col', creating it w/ a None value if absent

    the dictionary is modified in place and also returned; a value that already
    exists under 'col' is kept as is

    :param x: dict
    :param col: str
    :return: dict
    """
    x.setdefault(col, None)
    return x


def remove_excess_and_keep_last(row):
    """
    trim every level list of a row down to its last 'len_polygons' entries

    a row whose 'level_five' entries are all distinct is returned untouched; otherwise
    the leading entries of each level list are removed in place until at most
    'len_polygons' entries remain

    :param row: Series
    :return row: Series
    """
    diagnoses = row['level_five']
    if len(diagnoses) == len(set(diagnoses)):
        # no repeated level_five entry -> nothing is treated as excess
        return row

    keep = row['len_polygons']

    for level in ('level_one', 'level_two', 'level_three', 'level_four', 'level_five'):
        entries = row[level]
        surplus = len(entries) - keep
        if surplus > 0:
            # drop from the front so that the last `keep` entries survive
            del entries[:surplus]

    return row

In [50]:
# Expert annotations: load expert.json, flatten it to one row per x-ray with the
# abnormality characteristics collected into lists, and save the result as csv.
df = preprocess_json(
    r"Tufts Dental Database/Expert/expert.json")

# flatten Label.objects, fix separated rows
# json_normalize spreads each 'Label.objects' list over positional columns (0-3 are
# used below); rows that have a 2nd/3rd/4th object are copied with that object moved
# into column 0 and appended, so every object ends up in column 0 of some row.
# NOTE(review): the axis=1 concat aligns on the index — assumes df still has the
# default 0..n-1 index at this point; confirm.
df = pd.concat([df.drop(columns='Label.objects'), pd.json_normalize(df['Label.objects'])], axis=1)
df_1 = df[df[1].notna()].drop(columns=[0, 2, 3]).rename(columns={1: 0})
df_2 = df[df[2].notna()].drop(columns=[0, 1, 3]).rename(columns={2: 0})
df_3 = df[df[3].notna()].drop(columns=[0, 1, 2]).rename(columns={3: 0})
df = pd.concat([df, df_1, df_2, df_3], axis=0)
df = df.drop(columns=[1, 2, 3, 'Label.classifications'])

# add id and description to each dictionary of json objects
# NOTE(review): `{**row[0], ...}` requires row[0] to be a dict on every row — TODO confirm
# that no x-ray has an empty 'Label.objects'.
df[0] = df.apply(lambda row: {**row[0], 'External ID': row['External ID'], 'Description': row['Description']}, axis=1)

# create with_ab_df and fix the abnormality characteristic levels before merging back
# one row per entry of 'classifications'; the listed meta fields of the enclosing
# object are carried along with a '.' prefix (e.g. '.External ID', '.polygons')
with_ab_df = pd.json_normalize(data=df[0], record_path='classifications',
                               meta=['External ID', 'Description', 'title', 'value', 'polygons'],
                               meta_prefix='.')
cols = ['.External ID', '.Description', '.polygons', '.value', 'value', 'answer.value', 'answers']
with_ab_df = with_ab_df[cols]
# every x-ray that produced at least one classification row is removed from df, which
# from here on only holds the remaining x-rays; column 0 is no longer needed
levels_df_id = with_ab_df['.External ID'].unique()
df = df[~df['External ID'].isin(levels_df_id)]
df = df.drop(columns=[0])

# flatten answers, fix separated answers (level three and four)
# missing 'answers' become empty lists ("" -> list("") == []); the lists are then
# spread over positional columns 0-2 and each populated position is turned into its
# own row whose 'answer.value' is the 'value' field of that answer
with_ab_df['answers'] = with_ab_df['answers'].fillna("").apply(list)
with_ab_df = pd.concat([with_ab_df.drop(columns='answers'), pd.json_normalize(with_ab_df['answers'])], axis=1)
not_nan_0 = with_ab_df[with_ab_df[0].notna()].index
not_nan_1 = with_ab_df[with_ab_df[1].notna()].index
not_nan_2 = with_ab_df[with_ab_df[2].notna()].index

levels_df_0 = with_ab_df.loc[not_nan_0].drop(columns=[1, 2, 'answer.value']).rename(columns={0: 'answer.value'})
levels_df_1 = with_ab_df.loc[not_nan_1].drop(columns=[0, 2, 'answer.value']).rename(columns={1: 'answer.value'})
levels_df_2 = with_ab_df.loc[not_nan_2].drop(columns=[0, 1, 'answer.value']).rename(columns={2: 'answer.value'})

levels_df_0['answer.value'] = levels_df_0['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])
levels_df_1['answer.value'] = levels_df_1['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])
levels_df_2['answer.value'] = levels_df_2['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])

# stack the single-answer rows and the per-position answer rows, then keep only the
# rows that actually carry an answer
with_ab_df = with_ab_df.drop(columns=[0, 1, 2])
with_ab_df = pd.concat([with_ab_df, levels_df_0, levels_df_1, levels_df_2], axis=0)
with_ab_df = with_ab_df.dropna(subset=['answer.value'])

# merge abnormalities per xray id
merged_ab_df = pd.DataFrame(merge_abnormalities_per_xray(with_ab_df))
# merge back all
# df (x-rays without classification rows) has no polygons/level_* columns, so those
# cells are missing for its rows after the concat
expert_merged_df = pd.concat([df, merged_ab_df], axis=0)

# cleanup
# remove duplicates
# the first row per 'External ID' is kept (drop_duplicates default)
expert_merged_df.drop_duplicates(subset='External ID', inplace=True)
expert_merged_df.set_index('External ID', inplace=True)
# remove excess per level characteristic
# the `x is not np.nan` identity test treats missing cells as length 0
# NOTE(review): relies on missing cells being the np.nan object itself — confirm
expert_merged_df['len_polygons'] = expert_merged_df['polygons'].apply(lambda x: len(x) if x is not np.nan else 0)
expert_merged_df['len_abnormalities'] = expert_merged_df['level_five'].apply(lambda x: len(x) if x is not np.nan else 0)
# rows with more level_five entries than polygons are handed to
# remove_excess_and_keep_last, which trims the level lists in place
with_excess = expert_merged_df[expert_merged_df['len_abnormalities'] > expert_merged_df['len_polygons']].index
expert_merged_df.loc[with_excess] = expert_merged_df.loc[with_excess].apply(lambda x: remove_excess_and_keep_last(x),
                                                                            axis=1)
# # drop unsure classes
# the length columns are recomputed because the lists may have been trimmed above;
# the two drop steps below are disabled (commented out)
expert_merged_df['len_polygons'] = expert_merged_df['polygons'].apply(lambda x: len(x) if x is not np.nan else 0)
expert_merged_df['len_abnormalities'] = expert_merged_df['level_five'].apply(lambda x: len(x) if x is not np.nan else 0)
# with_unsure = expert_merged_df[expert_merged_df['len_abnormalities'] > expert_merged_df['len_polygons']].index
# expert_merged_df.drop(index=with_unsure, inplace=True)
# # drop unclassified abnormalities
# with_unclassified = expert_merged_df[expert_merged_df['len_abnormalities'] < expert_merged_df['len_polygons']].index
# expert_merged_df.drop(index=with_unclassified, inplace=True)


# save to csv
expert_merged_df.to_csv('Tufts Dental Database/Expert/expert_merged.csv')

expert_merged_df.tail()

Unnamed: 0_level_0,Description,polygons,level_one,level_two,level_three,level_four,level_five,len_polygons,len_abnormalities
External ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1009.JPG,Periapical radiolucency associated with tooth ...,"[[[[595, 631], [594, 632], [591, 632], [590, 6...","[periapical, non-odontogenic, periapical]","[well_defined, well_defined, ill_defined]","[radiolucent, radiopaque, mixed-septae/calcifi...",[thinning],"[benign_cyst_neoplasia, developmental, inflamm...",3,3
1012.JPG,Pericoronal radiolucencies associated with too...,"[[[[1226, 499], [1224, 501], [1224, 502], [122...",[pericoronal],[well_defined],[radiolucent],[thinning],[benign_cyst_neoplasia],1,1
1015.JPG,Apical inflammatory changes associated with to...,"[[[[493, 562], [492, 563], [491, 563], [490, 5...",[periapical],[ill_defined],[mixed-septae/calcification],[],[inflammation],1,1
1047.JPG,There are linear calcific flecks noted in the ...,"[[[[1473, 701], [1473, 706], [1472, 707], [147...",[non-odontogenic],[well_defined],[radiopaque],[],[metabolic/systemic],1,1
1001.JPG,Multiple extractions sites in the maxilla and ...,"[[[[267, 496], [265, 498], [264, 498], [261, 5...","[periapical, periapical, periapical]",[well_defined],[radiolucent],[],"[benign_cyst_neoplasia, inflammation, trauma]",1,3


# Flatten and clean <u>student.json</u>, then merge abnormalities per xray

In [51]:
# Student annotations: same steps as the expert cell, applied to student.json.
# Differences visible below: only positional object columns 0 and 1 are handled, a
# 'classifications' key is added to objects that lack one, and json_normalize is
# called with errors='ignore'.
df = preprocess_json(r"Tufts Dental Database/Student/student.json")

# flatten Label.objects, fix separated rows
# rows that have a 2nd object are copied with that object moved into column 0 and
# appended, so every object ends up in column 0 of some row.
# NOTE(review): the axis=1 concat aligns on the index — assumes df still has the
# default 0..n-1 index at this point; confirm.
df = pd.concat([df.drop(columns='Label.objects'), pd.json_normalize(df['Label.objects'])], axis=1)
df_1 = df[df[1].notna()].drop(columns=[0]).rename(columns={1: 0})
# df_2 = df[df[2].notna()].drop(columns=[0, 1, 3]).rename(columns={2: 0})
# df_3 = df[df[3].notna()].drop(columns=[0, 1, 2]).rename(columns={3: 0})
# df = pd.concat([df, df_1, df_2, df_3], axis=0)
df = pd.concat([df, df_1], axis=0)
df = df.drop(columns=[1, 'Label.classifications'])

# add id and description to each dictionary of json objects
# NOTE(review): `{**row[0], ...}` requires row[0] to be a dict on every row — TODO confirm
df[0] = df.apply(lambda row: {**row[0], 'External ID': row['External ID'], 'Description': row['Description']}, axis=1)

# objects without a 'classifications' key get one set to None, so the key used as
# record_path below exists on every object
df[0] = df[0].apply(lambda x: give_empty_column(x, 'classifications'))

# create with_ab_df and fix the abnormality characteristic levels before merging back
# one row per entry of 'classifications'; the listed meta fields of the enclosing
# object are carried along with a '.' prefix, and errors='ignore' tolerates objects
# that miss one of the meta keys
with_ab_df = pd.json_normalize(data=df[0], record_path='classifications',
                               meta=['External ID', 'Description', 'title', 'value', 'polygons'],
                               meta_prefix='.', errors='ignore')
cols = ['.External ID', '.Description', '.polygons', '.value', 'value', 'answer.value', 'answers']
with_ab_df = with_ab_df[cols]
# every x-ray that produced at least one classification row is removed from df, which
# from here on only holds the remaining x-rays; column 0 is no longer needed
levels_df_id = with_ab_df['.External ID'].unique()
df = df[~df['External ID'].isin(levels_df_id)]
df = df.drop(columns=[0])

# flatten answers, fix separated answers (level three and four)
# missing 'answers' become empty lists ("" -> list("") == []); the lists are then
# spread over positional columns 0-2 and each populated position is turned into its
# own row whose 'answer.value' is the 'value' field of that answer
with_ab_df['answers'] = with_ab_df['answers'].fillna("").apply(list)
with_ab_df = pd.concat([with_ab_df.drop(columns='answers'), pd.json_normalize(with_ab_df['answers'])], axis=1)
not_nan_0 = with_ab_df[with_ab_df[0].notna()].index
not_nan_1 = with_ab_df[with_ab_df[1].notna()].index
not_nan_2 = with_ab_df[with_ab_df[2].notna()].index

levels_df_0 = with_ab_df.loc[not_nan_0].drop(columns=[1, 2, 'answer.value']).rename(columns={0: 'answer.value'})
levels_df_1 = with_ab_df.loc[not_nan_1].drop(columns=[0, 2, 'answer.value']).rename(columns={1: 'answer.value'})
levels_df_2 = with_ab_df.loc[not_nan_2].drop(columns=[0, 1, 'answer.value']).rename(columns={2: 'answer.value'})

levels_df_0['answer.value'] = levels_df_0['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])
levels_df_1['answer.value'] = levels_df_1['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])
levels_df_2['answer.value'] = levels_df_2['answer.value'].apply(lambda x: pd.json_normalize(x)['value'])

# stack the single-answer rows and the per-position answer rows, then keep only the
# rows that actually carry an answer
with_ab_df = with_ab_df.drop(columns=[0, 1, 2])
with_ab_df = pd.concat([with_ab_df, levels_df_0, levels_df_1, levels_df_2], axis=0)
with_ab_df = with_ab_df.dropna(subset=['answer.value'])

# merge abnormalities per xray id
merged_ab_df = pd.DataFrame(merge_abnormalities_per_xray(with_ab_df))
# merge back all
# df (x-rays without classification rows) has no polygons/level_* columns, so those
# cells are missing for its rows after the concat
student_merged_df = pd.concat([df, merged_ab_df], axis=0)

# cleanup
# remove duplicates
# the first row per 'External ID' is kept (drop_duplicates default)
student_merged_df.drop_duplicates(subset='External ID', inplace=True)
student_merged_df.set_index('External ID', inplace=True)
# remove excess per level characteristic
# the `x is not np.nan` identity test treats missing cells as length 0
# NOTE(review): relies on missing cells being the np.nan object itself — confirm
student_merged_df['len_polygons'] = student_merged_df['polygons'].apply(lambda x: len(x) if x is not np.nan else 0)
student_merged_df['len_abnormalities'] = student_merged_df['level_five'].apply(
    lambda x: len(x) if x is not np.nan else 0)
# rows with more level_five entries than polygons are handed to
# remove_excess_and_keep_last, which trims the level lists in place
with_excess = student_merged_df[student_merged_df['len_abnormalities'] > student_merged_df['len_polygons']].index
student_merged_df.loc[with_excess] = student_merged_df.loc[with_excess].apply(lambda x: remove_excess_and_keep_last(x),
                                                                              axis=1)
# # drop unsure classes
# the length columns are recomputed because the lists may have been trimmed above;
# the two drop steps below are disabled (commented out)
student_merged_df['len_polygons'] = student_merged_df['polygons'].apply(lambda x: len(x) if x is not np.nan else 0)
student_merged_df['len_abnormalities'] = student_merged_df['level_five'].apply(
    lambda x: len(x) if x is not np.nan else 0)
# with_unsure = student_merged_df[student_merged_df['len_abnormalities'] > student_merged_df['len_polygons']].index
# student_merged_df.drop(index=with_unsure, inplace=True)
# # drop unclassified abnormalities
# with_unclassified = student_merged_df[student_merged_df['len_abnormalities'] < student_merged_df['len_polygons']].index
# student_merged_df.drop(index=with_unclassified, inplace=True)

# save to csv
student_merged_df.to_csv('Tufts Dental Database/Student/student_merged.csv')
student_merged_df.tail()

Unnamed: 0_level_0,Description,polygons,level_one,level_two,level_three,level_four,level_five,len_polygons,len_abnormalities
External ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1010.JPG,There is a large periapical radiolucency on to...,"[[[[863, 587], [863, 590], [861, 592], [861, 5...",[periapical],[well_defined],[radiolucent],[osseous_expansion],[benign_cyst_neoplasia],1,1
709.JPG,There is a periapical radiolucency on tooth nu...,"[[[[510, 580], [509, 581], [508, 581], [507, 5...",[periapical],[well_defined],[radiolucent],[],[inflammation],1,1
741.JPG,There is a non-odontogenic well-defined radiop...,"[[[[140, 766], [141, 767], [141, 768], [142, 7...",[non-odontogenic],[well_defined],[radiopaque],[],[metabolic/systemic],1,1
840.JPG,On the left side by the bifurcation of the car...,"[[[[1516, 724], [1517, 725], [1515, 727], [151...",[non-odontogenic],[well_defined],[radiopaque],[],[metabolic/systemic],1,1
205.JPG,There is a periapical well-defined radiolucent...,"[[[[428, 577], [429, 576], [431, 578], [431, 5...",[periapical],[well_defined],[radiolucent],[],[inflammation],1,1


# ~~Separate abnormalities per xray~~  (DO NOT USE, improperly implemented)

In [52]:
def separate_abnormalities_per_xray(df_):
    """
    expand the merged per-xray rows into one entry per abnormality

    an xray with 'len_abnormalities' == 0 yields a single entry whose polygons and
    levels are NaN; otherwise it yields 'len_abnormalities' entries, where entry i
    takes element i of each list column, falls back to the last element when that
    list is shorter, and to NaN when the list is empty

    :param df_: DataFrame
    :return separated_data: dict
    """
    list_columns = ('polygons', 'level_one', 'level_two', 'level_three', 'level_four', 'level_five')
    separated_data = {key: [] for key in ('External ID', 'Description') + list_columns}

    def add_separated_data(record):
        """
        append the entries of one xray (a row of df_) to separated_data

        :param record: Series, named after the xray's index label
        :return:
        """
        external_id = record.name
        description = record['Description']
        count = record['len_abnormalities']

        if count == 0:
            # xray without abnormalities: one placeholder entry
            separated_data['External ID'].append(external_id)
            separated_data['Description'].append(description)
            for column in list_columns:
                separated_data[column].append(np.nan)
            return

        values = [record[column] for column in list_columns]
        sizes = [len(entries) for entries in values]

        for i in range(count):
            separated_data['External ID'].append(external_id)
            separated_data['Description'].append(description)
            for column, entries, size in zip(list_columns, values, sizes):
                if size > 0:
                    # positions past the end reuse the last element
                    picked = entries[i if i < size else -1]
                else:
                    picked = np.nan
                separated_data[column].append(picked)

    for id_ in df_.index.unique():
        add_separated_data(df_.loc[id_])

    return separated_data


# separate abnormalities per xray and index the result by external id
# (both frames are built before anything is written to disk)
expert_separated_df = pd.DataFrame(separate_abnormalities_per_xray(expert_merged_df)).set_index('External ID')
student_separated_df = pd.DataFrame(separate_abnormalities_per_xray(student_merged_df)).set_index('External ID')

# save to csv
for separated_df, csv_path in ((expert_separated_df, 'Tufts Dental Database/Expert/expert_separated.csv'),
                               (student_separated_df, 'Tufts Dental Database/Student/student_separated.csv')):
    separated_df.to_csv(csv_path)

# Check count

In [53]:
# Summary of how many x-rays are normal / abnormal per annotator.
# An x-ray is counted as abnormal when its 'level_five' cell is not null; normal is
# the remainder of the frame.
# count expert merged
expert_merged_total = len(expert_merged_df)
expert_merged_abnormal = len(expert_merged_df[expert_merged_df['level_five'].notna()])
expert_merged_normal = expert_merged_total - expert_merged_abnormal
# # count expert separated
# expert_separated_total = len(expert_separated_df)
# expert_separated_abnormal = len(expert_separated_df[expert_separated_df['level_five'].notna()])
# expert_separated_normal = expert_separated_total - expert_separated_abnormal
# count student merged
student_merged_total = len(student_merged_df)
student_merged_abnormal = len(student_merged_df[student_merged_df['level_five'].notna()])
student_merged_normal = student_merged_total - student_merged_abnormal
# # count student separated
# student_separated_total = len(student_separated_df)
# student_separated_abnormal = len(student_separated_df[student_separated_df['level_five'].notna()])
# student_separated_normal = student_separated_total - student_separated_abnormal


# Create the hierarchical index
# rows: every (annotator, type) combination; columns: normal / abnormal / total
index = pd.MultiIndex.from_product([['expert', 'student'], ['merged', 'separated']], names=['annotator', 'type'])
columns = pd.Index(['normal', 'abnormal', 'total'], name='count')
# Create the DataFrame with the index and columns
# note: this rebinds the name `df` that the earlier cells used for the raw annotations
df = pd.DataFrame(index=index, columns=columns)
# Add the data to the DataFrame
# only the 'merged' rows are filled; the 'separated' assignments are commented out,
# so those rows stay empty in the displayed table
df.loc[('expert', 'merged'), 'normal'] = expert_merged_normal
df.loc[('expert', 'merged'), 'abnormal'] = expert_merged_abnormal
df.loc[('expert', 'merged'), 'total'] = expert_merged_total
# df.loc[('expert', 'separated'), 'normal'] = expert_separated_normal
# df.loc[('expert', 'separated'), 'abnormal'] = expert_separated_abnormal
# df.loc[('expert', 'separated'), 'total'] = expert_separated_total
df.loc[('student', 'merged'), 'normal'] = student_merged_normal
df.loc[('student', 'merged'), 'abnormal'] = student_merged_abnormal
df.loc[('student', 'merged'), 'total'] = student_merged_total
# df.loc[('student', 'separated'), 'normal'] = student_separated_normal
# df.loc[('student', 'separated'), 'abnormal'] = student_separated_abnormal
# df.loc[('student', 'separated'), 'total'] = student_separated_total

df

Unnamed: 0_level_0,count,normal,abnormal,total
annotator,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
expert,merged,660.0,340.0,1000.0
expert,separated,,,
student,merged,799.0,201.0,1000.0
student,separated,,,


# Additional Notes

The annotations for the following images contain multiple abnormalities drawn over exactly the same area (identical mask).

- expert_df
    - ['935.JPG', '196.JPG', '906.JPG', '938.JPG', '521.JPG', '559.JPG',
       '378.JPG', '992.JPG', '995.JPG', '255.JPG', '288.JPG', '612.JPG',
       '551.JPG', '778.JPG', '810.JPG', '373.JPG', '566.JPG', '630.JPG',
       '315.JPG', '105.JPG', '336.JPG', '721.JPG', '371.JPG', '532.JPG',
       '1036.JPG', '1037.JPG', '1001.JPG']

- student_df
    - ['245.JPG', '761.JPG', '315.JPG', '925.JPG', '371.JPG', '357.JPG',
       '780.JPG', '224.JPG', '548.JPG', '355.JPG', '44.JPG', '108.JPG',
       '627.JPG', '951.JPG', '1011.JPG']