In [1]:
import pandas as pd
import matplotlib.pyplot
import json
from pathlib import Path

In [2]:
# Input file locations, all resolved from the current working directory.
# Lesion feature tables are read from ./resources; the patient and series
# metadata are read from data/metadata under the parent of the working directory.
path_to_features_train = Path.cwd().joinpath("resources", "lesions_features_train.csv")
path_to_features_test = Path.cwd().joinpath("resources", "lesions_features_test.csv")
path_to_patients = Path.cwd().parents[0].joinpath("data", "metadata", "patients.csv")
path_to_series = Path.cwd().parents[0].joinpath("data", "metadata", "series.json")

In [3]:
# Add patient info to series
# Result: `series_df` holds one row per series entry from series.json, extended
# with the columns of patients.csv matched on `patient_id`.
patients_df = pd.read_csv(path_to_patients)
# Decode the JSON explicitly as UTF-8 instead of relying on the platform's
# locale default, which differs between operating systems.
with open(path_to_series, 'r', encoding='utf-8') as file:
    series_df = pd.DataFrame(json.load(file))
# `patient_code` is removed from the series table before merging
# (presumably because patients.csv provides its own copy -- TODO confirm).
# validate='many_to_one' makes pandas raise a MergeError if `patient_id` is
# duplicated in patients_df, instead of silently duplicating series rows.
series_df = series_df.drop(columns='patient_code').merge(
    patients_df,
    on=['patient_id'],
    how='left',
    validate='many_to_one',
)

In [4]:
# Read dataframe
# Builds `df`: one row per lesion from the train and test feature files, tagged
# with its `subset`, extended with parsed label fields and series/patient metadata.
df = pd.read_csv(path_to_features_train)
df["subset"] = "train"
test_df = pd.read_csv(path_to_features_test)
test_df["subset"] = "test"
df = pd.concat([df, test_df], ignore_index=True)

# The uuid is the part of the filename before the first '.nii.gz'.
df["uuid"] = df["filename"].apply(lambda x: x.split('.nii.gz')[0])

# Split `label_description` on commas once: field 0 is the lesion type and
# field 1 the lesion location. The vectorised .str accessor yields NaN for a
# missing description or a description without a comma, where a per-row
# Python split would raise AttributeError / IndexError.
label_parts = df["label_description"].str.split(',')
df["lesion_type"] = label_parts.str[0]
df["lesion_location"] = label_parts.str[1]

# Metadata columns taken from the series table; `uuid` is the join key.
columns = [
    'patient_id',
    'uuid',
    'sex',
    'age',
    'diagnosis'
]
# validate='many_to_one' makes pandas raise a MergeError if a uuid occurs more
# than once in series_df, instead of silently duplicating lesion rows (which
# would inflate every count and volume computed from `df`).
df = df.merge(
    series_df[columns],
    on=['uuid'],
    how='left',
    validate='many_to_one',
)
display(df.describe())

Unnamed: 0,label_value,voxels_count,volume_ml,slices_count,mean_HU,std_HU,major_axis,minor_axis,major_axis_slice_idx,patient_id,age
count,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0,973.0
mean,56.27852,6334.654676,5.151262,9.502569,-50.183756,171.362119,18.288524,11.041996,135.846865,19.807811,53.355601
std,65.175416,34163.668939,30.934654,8.623065,121.551975,103.195755,18.354697,10.331633,62.48055,8.455345,12.40449
min,1.0,52.0,0.014906,2.0,-680.5344,10.485603,3.239035,0.0,11.0,1.0,37.0
25%,7.0,458.0,0.194881,4.0,-119.00098,30.51495,9.119366,6.082024,85.0,12.0,37.0
50%,31.0,1186.0,0.497052,8.0,-48.438553,215.59561,12.983194,8.237212,131.0,22.0,56.0
75%,81.0,3026.0,1.659989,11.0,60.97165,244.60356,21.007446,11.977829,178.0,28.0,61.0
max,278.0,799981.0,719.045727,88.0,127.34615,394.05157,198.395296,117.515563,335.0,44.0,75.0


In [5]:
# Lung and Liver tumors
# Keep only lesions located in the lung or liver, then show summary tables
# per (lesion_location, subset) and a final volume total per location.
subset_df = df[df["lesion_location"].isin(["lung", "liver"])].copy()
by_location_and_subset = subset_df.groupby(['lesion_location', 'subset'])

# Number of lesion rows in each group.
grouped = by_location_and_subset.size().reset_index(name="lesions_count")
display(grouped)

# Distinct filenames ("images") and distinct patients in each group.
for column, result_name in (("filename", "images"), ("patient_id", "patient_count")):
    grouped = by_location_and_subset[column].nunique().reset_index(name=result_name)
    display(grouped)

# Sum of `volume_ml` in each group.
grouped = by_location_and_subset["volume_ml"].sum().reset_index(name="annotated_volume_ml")
display(grouped)

# Sum of `volume_ml` per location, train and test combined.
grouped = subset_df.groupby(['lesion_location'])["volume_ml"].sum().reset_index(name="annotated_volume_ml")
display(grouped)

Unnamed: 0,lesion_location,subset,lesions_count
0,liver,test,78
1,liver,train,115
2,lung,test,254
3,lung,train,417


Unnamed: 0,lesion_location,subset,images
0,liver,test,2
1,liver,train,16
2,lung,test,4
3,lung,train,16


Unnamed: 0,lesion_location,subset,patient_count
0,liver,test,1
1,liver,train,7
2,lung,test,2
3,lung,train,8


Unnamed: 0,lesion_location,subset,annotated_volume_ml
0,liver,test,1199.236905
1,liver,train,1057.88736
2,lung,test,129.349177
3,lung,train,557.856009


Unnamed: 0,lesion_location,annotated_volume_ml
0,liver,2257.124265
1,lung,687.205185
