In [242]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # showing and rendering figures
# io related
from skimage.io import imread
import os
from glob import glob
import seaborn as sns 
%matplotlib inline 

In [258]:
# Load the bone-age labels and attach image path, availability and gender columns.
base_bone_dir = os.path.join('..', 'input', 'rsna-bone-age')
age_df = pd.read_csv(os.path.join(base_bone_dir, 'boneage-training-dataset.csv'))

# Expected PNG location for every id (the image folder name appears twice in the path).
image_dir = os.path.join(base_bone_dir, 'boneage-training-dataset', 'boneage-training-dataset')
age_df['path'] = age_df['id'].map(lambda x: os.path.join(image_dir, '{}.png'.format(x)))
age_df['exists'] = age_df['path'].map(os.path.exists)
print(age_df['exists'].sum(), 'images found of', age_df.shape[0], 'total')
age_df['gender'] = age_df['male'].map(lambda x: 'male' if x else 'female')

# Normalization is deliberately switched off for now: with mean 0 and divisor 1.0
# 'boneage_zscore' is simply 'boneage' as a float. Set the flag to True to centre
# on the mean and scale by two standard deviations instead.
normalize_boneage = False
if normalize_boneage:
    boneage_mean = age_df['boneage'].mean()
    boneage_div = 2 * age_df['boneage'].std()
else:
    boneage_mean = 0
    boneage_div = 1.0
age_df['boneage_zscore'] = (age_df['boneage'] - boneage_mean) / boneage_div

# Drop rows with any missing value; .copy() gives an independent frame so that
# columns added to age_df afterwards do not raise SettingWithCopyWarning.
age_df = age_df.dropna().copy()
age_df.sample(3)

# **Data Visualization**

In [259]:
# Histograms of the raw age, the male flag and the scaled age.
overview_columns = ['boneage', 'male', 'boneage_zscore']
age_df[overview_columns].hist(figsize=(10, 5))
# Cut the boneage range into 10 equal-width intervals.
age_df['boneage_category'] = pd.cut(age_df['boneage'], bins=10)

In [262]:
# Number of samples per gender label in the full data set.
age_df['gender'].hist(figsize=(10, 5))

# **Age Group**
Below we divide the data into 8 subgroups by bone age and show one example image for each gender in every subgroup to visualize the data.

In [245]:
age_groups = 8
# qcut places the bin edges at quantiles, so the age classes hold roughly equal numbers of rows
age_df['age_class'] = pd.qcut(age_df['boneage'], age_groups)

# Draw one example row per (age class, male) group. A single seeded generator is
# shared by all groups so the same examples come back on every run.
overview_rng = np.random.RandomState(2018)
age_overview_df = (
    age_df
    .groupby(['age_class', 'male'])
    .apply(lambda x: x.sample(1, random_state=overview_rng))
    .reset_index(drop=True)
)

In [246]:
# Grid of example images: one row per age class, two columns per row.
fig, m_axs = plt.subplots(age_groups, 2, figsize=(12, 6 * age_groups))

# Switch every axis off up front so a panel that receives no image stays blank
# instead of showing empty ticks.
for c_ax in m_axs.flatten():
    c_ax.axis('off')

sorted_overview_df = age_overview_df.sort_values(['age_class', 'male'])
for c_ax, (_, c_row) in zip(m_axs.flatten(), sorted_overview_df.iterrows()):
    c_ax.imshow(imread(c_row['path']), cmap='viridis')
    # Label with the readable gender string rather than the True/False 'male' flag.
    c_ax.set_title('{} months, {}'.format(c_row['boneage'], c_row['gender']))

# **Training and Validation Datasets**

The data is divided into an 85:15 training:validation split, resulting in 10,719 training images and 1,892 validation images. The reason for this division is to maximise the training set, and with it the number of examples the network can learn from.

In [247]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation, stratified on the boneage bins,
# with a fixed seed so the split is repeatable.
raw_train_df, valid_df = train_test_split(
    age_df,
    test_size=0.15,
    random_state=2018,
    stratify=age_df['boneage_category'],
)
print('train', len(raw_train_df), 'validation', len(valid_df))

In [248]:
# Resample the training rows so that every (boneage bin, male) group contributes
# the same number of rows, drawn with replacement.
samples_per_group = 500
# One seeded generator shared across groups keeps the resampled set reproducible
# without giving every group the identical draw sequence.
balance_rng = np.random.RandomState(2018)
train_df = (
    raw_train_df
    .groupby(['boneage_category', 'male'])
    .apply(lambda x: x.sample(samples_per_group, replace=True, random_state=balance_rng))
    .reset_index(drop=True)
)
print('New Data Size:', train_df.shape[0], 'Old Size:', raw_train_df.shape[0])
train_df['boneage'].hist(figsize=(10, 5))

In [249]:
# Number of rows per gender label in the resampled training set.
train_df['gender'].hist(figsize=(10, 5))