In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the training-set annotation table (CSV), given relative to the
# directory the notebook is executed from.
ANNOTATIONS_TRAIN_CSV = "../metadata/annotations_train.csv"

In [None]:
# Read the annotation table, then append an 'age_years' column obtained by
# dividing the 'age' column by 365.25.
# NOTE(review): the divisor suggests 'age' is stored in days -- confirm against
# the CSV's documentation.
df_annots = pd.read_csv(ANNOTATIONS_TRAIN_CSV)
df_annots = df_annots.assign(age_years=df_annots['age'] / 365.25)
df_annots

In [None]:
# Pull the two columns of interest out of the frame as plain NumPy arrays.
age, sex = (df_annots[column].to_numpy() for column in ('age_years', 'sex'))

# Split the ages by the recorded sex label ('M' / 'F') via boolean masks.
age_m, age_f = (age[sex == label] for label in ('M', 'F'))

In [None]:
# Histogram bin edges: the 16 integers 15, 16, ..., 30 (one-unit-wide bins),
# stored as int32. The same edges are reused by the histograms and by
# np.digitize further down.
bins = np.arange(15, 31, dtype=np.int32)

In [None]:
# Two-panel figure of the age distribution in the annotation table:
# all rows pooled on the left, female vs. male side by side on the right.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

ax[0].hist(age, bins=bins, lw=1.5, color='lightgray', edgecolor='k')
ax[1].hist(
    [age_f, age_m],
    bins=bins,
    lw=1.5,
    color=['#8fb39f', '#f9f1cb'],
    edgecolor='k',
    label=['female', 'male'],
)

# Both panels share identical axis cosmetics; only the title differs.
panel_titles = ('female and male combined', 'female and male side by side')
for panel, panel_title in zip(ax, panel_titles):
    panel.set_ylim(0, 360)
    panel.tick_params(labelsize=16, size=4)
    panel.set_xlabel('age / (years)', fontsize=16)
    panel.set_ylabel('count', fontsize=16)
    panel.set_title(panel_title, fontsize=18)

# Only the split panel has labelled series, so only it gets a legend.
ax[1].legend(fontsize=16)

# Write the figure to disk before showing it.
fig.savefig(
    '../results/plots/data_distribution_train_set.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=600,
)
plt.show()

In [None]:
# Label every row with the index of the age bin it falls into.
# With the default right=False, np.digitize returns i such that
# bins[i-1] <= x < bins[i]; values below the first edge map to 0 and values
# at or above the last edge map to len(bins).
age_bin_index = np.digitize(x=df_annots['age_years'].to_numpy(), bins=bins)
df_annots['age_bin'] = age_bin_index
df_annots

In [None]:
# Number of rows in each occupied age bin (np.unique returns the counts as
# its second element when return_counts=True).
counts = np.unique(df_annots['age_bin'].to_numpy(), return_counts=True)[1]

# Target sample size: half of the fullest bin's row count, truncated to an int.
n_samples_per_bin = int(counts.max() / 2.0)
n_samples_per_bin

In [None]:
# Build the flattened (resampled) table: draw n_samples_per_bin rows, with
# replacement, from every (age_bin, sex) group so that all groups end up with
# the same number of rows.
#
# GroupBy.sample is used instead of groupby(...).apply(lambda g: g.sample(...)):
#   * apply() operating on the grouping columns is deprecated since pandas 2.2,
#     and newer pandas excludes 'age_bin' / 'sex' from the applied frames, so
#     after reset_index(drop=True) those columns would be gone even though the
#     following cells still read df_flat['sex'];
#   * sample() keeps every column and accepts a seed directly.
# The fixed seed makes the resampled set identical after Restart & Run All.
RESAMPLING_SEED = 42
df_flat = (
    df_annots
    .groupby(['age_bin', 'sex'])
    .sample(n=n_samples_per_bin, replace=True, random_state=RESAMPLING_SEED)
    .reset_index(drop=True)
)
df_flat

In [None]:
# NumPy views of the resampled table's age and sex columns.
age_flat, sex_flat = (df_flat[column].to_numpy() for column in ('age_years', 'sex'))

# Ages of the resampled rows, separated by the recorded sex label.
age_flat_m, age_flat_f = (age_flat[sex_flat == label] for label in ('M', 'F'))

In [None]:
# Same two-panel layout as for the raw table, now drawn from the resampled
# ("flat") data: pooled ages on the left, female vs. male on the right.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

ax[0].hist(age_flat, bins=bins, lw=1.5, color='lightgray', edgecolor='k')
ax[1].hist(
    [age_flat_f, age_flat_m],
    bins=bins,
    lw=1.5,
    color=['#8fb39f', '#f9f1cb'],
    edgecolor='k',
    label=['female', 'male'],
)

# Apply the shared axis formatting to both panels; only the title varies.
panel_titles = ('female and male combined', 'female and male side by side')
for panel, panel_title in zip(ax, panel_titles):
    panel.set_ylim(0, 360)
    panel.tick_params(labelsize=16, size=4)
    panel.set_xlabel('age / (years)', fontsize=16)
    panel.set_ylabel('count', fontsize=16)
    panel.set_title(panel_title, fontsize=18)

# Legend only where the series carry labels.
ax[1].legend(fontsize=16)

# Write the figure to disk before showing it.
fig.savefig(
    '../results/plots/data_distribution_train_set_flat.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=600,
)
plt.show()