# Importing Packages

In [64]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import scipy.io

# Preprocessing

In [82]:
# "train" set does not have csv files with cobb angle and landmark labels
# data_sets = ["test", "val"]
data_set = "val"

# directory holding the jpg images of the selected data set
jpg_dir = f"../datasets/aasce-original-pp/data/{data_set}/"

# directory holding the truth data of the selected data set (test, validation)
data_dir = f"../datasets/aasce-original-pp/labels/{data_set}/"

# directory the .npz format data is written to
npz_dir = "../datasets_npz/aasce-original-pp/data/"

# Getting File Names & Image Data (jpg)

In [83]:
# `names` supplies the single column label, so the first line of the csv is
# read as data; dtype=object keeps the entries as they appear in the file.
df = pd.read_csv(
    data_dir + "filenames.csv",
    names=["filenames"],
    dtype=object,
)

In [84]:
def get_image(file_name):
    """Load the image stored under ``jpg_dir`` for the given file name.

    Parameters
    ----------
    file_name : str
        name of the image file; it is appended to the module-level
        ``jpg_dir`` to form the path that is opened

    Returns
    -------
    data : numpy.ndarray or None
        the image's pixel data with an extra axis of length 1 inserted at
        position 2, or None (after printing a message) when no file exists
        at the resulting path
    """
    full_path = jpg_dir + file_name

    if not os.path.exists(full_path):
        print("image not found at: " + full_path)
        return None

    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle once the pixels have been
    # read, instead of leaking one handle per image.
    with Image.open(full_path) as img:
        # NOTE(review): axis=2 presumably assumes single-channel (H, W)
        # images so the result is (H, W, 1) -- confirm against the data.
        data = np.expand_dims(np.asarray(img), axis=2)

    return data

In [85]:
# Load the pixel data for every file name in the first column of df;
# entries are None for files that get_image could not find.
images = [get_image(name) for name in df.iloc[:, 0]]

df["image"] = images

# Truth Data
- We are provided with:
    - vertebral landmarks (68 landmarks per image: 4 landmarks for each of 17 vertebrae)
    - Cobb angle measurements (3 Cobb angles per image)

## Landmarks

In [86]:
# For every file name in the first column of df, read the matching .mat
# file from data_dir and keep its "p2" entry.
mat_instances = [
    scipy.io.loadmat(data_dir + name)["p2"]
    for name in df.iloc[:, 0]
]

df["landmark"] = mat_instances

## Cobb Angle

In [87]:
# Three columns per row, labelled angle0..angle2 and kept as raw objects,
# then attached to df by row index.
angles = pd.read_csv(
    data_dir + "angles.csv",
    names=[f"angle{idx}" for idx in range(3)],
    dtype=object,
)
df = df.join(angles)

In [88]:
# Write into the output directory configured in `npz_dir`. (No variable
# named `dir` is ever assigned, so using `dir` here would pass the builtin
# function and raise a TypeError.)
os.makedirs(npz_dir, exist_ok=True)

# One row per sample: (image, largest cobb angle, landmarks). The array is
# created with dtype=object up front because the three fields differ in
# shape, and NumPy does not build such ragged arrays implicitly any more.
instances = np.empty((len(df), 3), dtype=object)

for i in range(len(df)):
    row = df.iloc[i]
    # angles.csv is read with dtype=object, so convert to float before
    # comparing; otherwise max() would compare the values as text.
    largest_angle = max(
        float(row["angle0"]),
        float(row["angle1"]),
        float(row["angle2"]),
    )
    instances[i, 0] = row["image"]
    instances[i, 1] = largest_angle
    instances[i, 2] = row["landmark"]

# The object array is pickled inside the archive, so it has to be loaded
# with np.load(..., allow_pickle=True).
with open(npz_dir + "aasce_" + data_set + ".npz", "wb") as file:
    np.savez(file, arr=instances)