# Importing Packages

In [112]:
import pandas as pd
import numpy as np
import os
from PIL import Image

# Preprocessing
## Loading Train Data

In [127]:
# gets data from the directory and returns (image, cobb_angle)
def get_data(file_name, cobb_angle):
    data_dir = "../datasets/shriners_xrays_png/overlays"
    full_path = "./%s/%s" % (data_dir, file_name)

    if os.path.exists(full_path):
        img = Image.open(full_path)
        data = np.expand_dims(np.asarray(img), axis=2)
        instance = (img, cobb_angle)
        return instance
    else:
        print("image not found at: " + full_path)

## Loading In Truth Data

In [129]:
df = pd.read_csv("shriners_overlay_angles.csv")

In [130]:
df = df.replace("Nan", np.nan)
df.count()

Image                 160
Main Thoracic         124
Lumbar                 72
Proximal Thoracic      28
Additional Metrics    160
dtype: int64

In [131]:
# still need to determine how to deal with nan truth values
# for now, drop all nan values and use "main thoracic" because it has the most truth values
df = df[df['Main Thoracic'].notna()].reset_index(drop=True)

In [132]:
df.count()

Image                 124
Main Thoracic         124
Lumbar                 72
Proximal Thoracic      28
Additional Metrics    124
dtype: int64

In [133]:
df.head()

Unnamed: 0,Image,Main Thoracic,Lumbar,Proximal Thoracic,Additional Metrics
0,9_DICOM_EXP00000_EXP0000_overlay.png,48.8,,,False
1,99_DICOM_EXP00000_EXP0000_overlay.png,71.0,-44.4,,False
2,997_DICOM_EXP00001_EXP0000_overlay.png,83.8,,,False
3,995_DICOM_EXP00000_EXP0000_overlay.png,32.8,-64.7,,True
4,994_DICOM_EXP00000_EXP0000_overlay.png,-38.1,43.0,-14.3,True


In [134]:
df.tail()

Unnamed: 0,Image,Main Thoracic,Lumbar,Proximal Thoracic,Additional Metrics
119,106_DICOM_EXP00000_EXP0000_overlay.png,61.2,-44.3,-32.8,True
120,104_DICOM_EXP00000_EXP0000_overlay.png,-51.4,58.7,,False
121,101_DICOM_EXP00000_EXP0000_overlay.png,42.0,,,False
122,1004_DICOM_EXP00000_EXP0000_overlay.png,-49.6,,,False
123,1001_DICOM_EXP00000_EXP0000_overlay.png,57.8,-68.2,,False


# Train/Test/Validation Splits
- train/test/validation split -> 70/15/15

In [135]:
# given a string "x/y/z" x=train_split, y=test_split, z=validation_split
def get_splits(splits):
    temp = splits.split("/")
    return (int(temp[0])/100, (int(temp[0]) + int(temp[1]))/100)

In [136]:
# train/test/validation splits
splits = get_splits("70/15/15")

# shuffle entries in the df
# df_sample = df
df_sample = df.sample(frac=1, random_state=1)
display(df_sample)

Unnamed: 0,Image,Main Thoracic,Lumbar,Proximal Thoracic,Additional Metrics
48,30_DICOM_EXP00000_EXP0000_overlay.png,-78.5,58.0,,True
114,121_DICOM_EXP00000_EXP0000_overlay.png,62.2,-45.2,-26.6,False
73,191_DICOM_EXP00000_EXP0000_overlay.png,41.4,,,False
106,136_DICOM_EXP00000_EXP0000_overlay.png,51.3,,,False
45,37_DICOM_EXP00000_EXP0000_overlay.png,24.7,-34.9,,False
...,...,...,...,...,...
9,987_DICOM_EXP00000_EXP0000_overlay.png,41.1,-28.2,,False
72,192_DICOM_EXP00000_EXP0000_overlay.png,59.8,-37.3,,False
12,979_DICOM_EXP00000_EXP0000_overlay.png,56.3,,,False
107,134_DICOM_EXP00000_EXP0000_overlay.png,55.1,-41.0,-29.1,True


In [137]:
index = df_sample.index.tolist()
indexLen = len(index)

train_indices = index[0:int(indexLen * splits[0])]
test_indices = index[int(indexLen * splits[0]):int(indexLen * splits[1])]
validation_indices = index[int(indexLen * splits[1]):]
print(train_indices)

[48, 114, 73, 106, 45, 31, 67, 46, 98, 83, 115, 33, 100, 117, 102, 121, 2, 89, 110, 51, 65, 78, 80, 17, 85, 54, 35, 69, 88, 99, 77, 42, 105, 55, 53, 44, 62, 58, 10, 112, 32, 82, 38, 19, 123, 27, 36, 56, 39, 74, 91, 95, 40, 59, 66, 90, 23, 34, 116, 108, 4, 103, 15, 104, 41, 52, 26, 43, 24, 97, 118, 93, 49, 21, 70, 3, 111, 30, 120, 47, 92, 8, 81, 60, 0, 113]


In [138]:
train_instances = []
test_instances = []
validation_instances = []

for i in train_indices:
    row = df.iloc[i]
    train_instances.append(get_data(row["Image"], row["Main Thoracic"]))

for i in test_indices:
    row = df.iloc[i]
    test_instances.append(get_data(row["Image"], row["Main Thoracic"]))

for i in validation_indices:
    row = df.iloc[i]
    validation_instances.append(get_data(row["Image"], row["Main Thoracic"]))

print(train_instances)
print(test_instances)
print(validation_instances)

[(<PIL.PngImagePlugin.PngImageFile image mode=L size=1800x3511 at 0x7FD4F912DBE0>, '-78.5'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2431x3892 at 0x7FD4FBF1CE20>, '62.2'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=1567x3790 at 0x7FD4F912D670>, '41.4'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=1694x4237 at 0x7FD4F93C6670>, '51.3'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2425x5747 at 0x7FD4F912D2E0>, '24.7'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2414x5675 at 0x7FD4F912D6A0>, '49.2'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2425x3847 at 0x7FD4F912D520>, '52.9'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2431x3898 at 0x7FD4F9042DF0>, '-53.4'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2431x3897 at 0x7FD4F98DCE80>, '53.4'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2026x4922 at 0x7FD4F9042F40>, '63.0'), (<PIL.PngImagePlugin.PngImageFile image mode=L size=2416x5666 at 0x7FD4F9042C70>, '61.0

In [140]:
# exist_ok=True -> won't throw error if directories already exist
dir = "./../datasets_npz/shriners_xrays_png/overlays/"
os.makedirs(dir, exist_ok=True)
with open(dir + "scoliosis_train.npz", "wb") as file:
    np.savez(file, arr=train_instances)

with open(dir + "scoliosis_test.npz", "wb") as file:
    np.savez(file, arr=test_instances)

  val = np.asanyarray(val)
  val = np.asanyarray(val)
