# xzt Train & Test Data Preparation

Here the data is prepared so that each training group contributes 6550 points (ordered with hits first for mixed groups), while the test groups are chosen at random and saved unsampled.

In [1]:
def print_df_stats(df_noise, df_mixed, size=None):
    """
    Print general stats on the mixed and noise groups

    df_noise : DataFrame of the noise-only groups; needs 'group' and
               'label' columns
    df_mixed : DataFrame of the mixed groups; needs 'group' and 'label'
               columns. 'label' must hold the integers 0 and 1 only,
               with at least one 1 present, because the np.bincount
               result is unpacked into exactly two counts
    size     : number of largest per-group counts kept for the hits and
               noise summaries; falls back to the module-level SIZE
               when omitted
    """
    if size is None:
        size = SIZE

    noise_stats = df_noise.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()
    mixed_stats = df_mixed.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()

    neg, pos = np.bincount(df_mixed['label'])
    total = neg + pos

    hits = df_mixed[df_mixed.label == 1].groupby(['group', 'label'])['label']
    noise = df_mixed[df_mixed.label == 0].groupby(['group', 'label'])['label']

    # Largest `size` per-group counts of each class
    hits_count = hits.count().sort_values(ascending=False).iloc[:size]
    noise_count = noise.count().sort_values(ascending=False).iloc[:size]

    # Dropping the 'label' level lets the division align on group id alone
    hits_to_noise = hits_count.droplevel(level='label')/noise_count.droplevel(level='label')

    class_imbalance = df_mixed.groupby(['group', 'label'])['label'].count().iloc[:20]

    print("NOISE STATS: \n{}\n".format(noise_stats))
    print("MIXED STATS: \n{}\n".format(mixed_stats))

    print("Mixed Groups: \n")
    print("Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(total,
                                                                             pos, 100 * pos / total))

    # The placeholder was missing here, so the sample was never printed
    print("Example of class imbalance in mixed groups: \n{}\n".format(class_imbalance.head()))

    print("Hits Only (Within mixed groups):\n")
    print("The largest hits for a group: {}\n".format(hits_count.max()))
    print("The smallest hits for a group: {}\n".format(hits_count.min()))
    print("The mean hits for a group: {}\n".format(hits_count.mean()))

    print("Noise Only (Within mixed groups):\n")
    print("The largest noise for a group: {}\n".format(noise_count.max()))
    print("The smallest noise for a group: {}\n".format(noise_count.min()))
    print("The mean noise for a group: {}\n".format(noise_count.mean()))
    print("Hits-Noise Ratio:\n")
    print("The highest ratio: {:.2f}% ({})\n".format(hits_to_noise.max()*100,
                                                    hits_to_noise.max()))
    print("The smallest ratio: {:.2f}% ({}) \n".format(hits_to_noise.min()*100,
                                                       hits_to_noise.min()))
    print("The mean ratio: {:.2f}% ({})\n".format(hits_to_noise.mean()*100,
                                                  hits_to_noise.mean()))




def plot(df_mixed):
    """
    Generate hexbin joint plots (pos_z vs time) of the full data in the
    mixed groups, one figure for hits and one for noise

    df_mixed : DataFrame with 'label', 'pos_z' and 'time' columns
    """
    # NOTE(review): `sns` and `plt` are not imported in the visible cells;
    # presumably seaborn and matplotlib.pyplot - confirm they are in scope.
    pos_df = df_mixed[df_mixed.label == 1]
    neg_df = df_mixed[df_mixed.label == 0]

    # x / y are passed by keyword: newer seaborn releases no longer accept
    # them positionally
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("Positive distribution (Hits) for pos_z vs time")
    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): savefig writes the current figure only, so presumably
    # just the noise plot ends up on disk - confirm this is intended.
    plt.savefig("../../assets/Distributon of points in Mixed Groups (posz-time)")
    

def sampled_plot(sampled_mixed):
    """
    Generate hexbin joint plots (pos_z vs time) of equally sampled points
    in the largest mixed groups

    sampled_mixed : DataFrame with 'group', 'label', 'pos_z' and 'time'
                    columns

    Reads the module-level `df_mixed` and `SIZE` to decide which groups
    are plotted.
    """
    # Ids of the SIZE largest groups. The group ids live in the index of
    # the count Series; passing the Series itself to isin() would compare
    # against the counts instead of the ids.
    top_group_ids = df_mixed.groupby('group')['label'].count()\
                            .sort_values(ascending=False)[:SIZE].index
    data_subset = sampled_mixed.loc[sampled_mixed['group'].isin(top_group_ids)]
    pos_df = data_subset[data_subset.label == 1]
    neg_df = data_subset[data_subset.label == 0]

    # x / y are passed by keyword: newer seaborn releases no longer accept
    # them positionally
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("EQUAL SAMPLING: Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("EQUAL SAMPLING: Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): savefig writes the current figure only, so presumably
    # just the noise plot ends up on disk - confirm this is intended.
    plt.savefig("../../assets/Distributon of Equally Sampled points in Mixed Groups (posz-time)")

In [2]:
import os
from random import shuffle

import matplotlib
import numpy as np
import pandas as pd

In [3]:
def read_csv(READ_PATH):
    """
    Load the CSV file located at READ_PATH and hand it back as a DataFrame
    """
    frame = pd.read_csv(READ_PATH)
    return frame


def identify_groups(df):
    """
    Split the groups by how many distinct labels they contain

    1. Groups with exactly one distinct label are returned first
       (treated by the callers as the noise-only groups)
    2. Groups with more than one distinct label are returned second
       (the mixed noise + hits groups)

    Both results have one row per group with the columns 'group' and
    'label', where 'label' holds that group's unique label values.
    """
    # One row per group, carrying the unique labels seen in that group
    labels_per_group = df.groupby('group')['label'].unique().reset_index()

    # Number of distinct labels for every group
    n_labels = np.array([len(found) for found in labels_per_group['label'].values])

    single_label = labels_per_group.loc[n_labels == 1]
    multi_label = labels_per_group.loc[n_labels > 1]

    return single_label, multi_label


def generate_dfs(df, df_noise, df_mixed):
    """
    Pull the full rows of `df` for the noise-only groups and for the
    mixed groups

    df_noise / df_mixed only need a 'group' column listing the group ids
    of each category; the returned pair holds the matching rows of `df`.
    """
    in_noise_groups = df['group'].isin(df_noise['group'])
    in_mixed_groups = df['group'].isin(df_mixed['group'])

    return df[in_noise_groups], df[in_mixed_groups]


def identify_top_groups(df, SIZE):
    """
    Return the ids of the SIZE groups with the most label entries,
    largest group first

    **Note: SIZE == 200**
    """
    label_counts = df.groupby('group')['label'].count()
    ranked = label_counts.sort_values(ascending=False)
    largest = ranked[:SIZE]

    return largest.index.tolist()


def randomize_files(files):
    """
    Shuffles files in a random order

    The list is shuffled in place and the same list object is returned,
    so the function works both for callers that ignore the return value
    and for callers that use it. (random.shuffle itself returns None,
    which is what this function used to hand back.)
    """
    shuffle(files)
    return files


def prepare_split(files):
    """
    Work out how many files go to each side of the split

    **Note**
    Current setting is for 80-20

    Returns the size of the 80% part, the size of the remainder and the
    files converted to a numpy array.
    """
    n_files = len(files)
    n_train = int(0.8 * n_files)
    n_test = n_files - n_train

    return n_train, n_test, np.array(files)


def generate_ids(eighty, twenty):
    """
    Build the tag array for the split: `eighty` ones followed by
    `twenty` zeros
    """
    train_tags = np.ones(eighty)
    test_tags = np.zeros(twenty)

    return np.concatenate([train_tags, test_tags])


def train_test_split(files, idx):
    """
    Split `files` by tag: entries tagged 1 form the training set,
    entries tagged 0 form the test set. Both sets are printed and
    returned as (train, test).
    """
    is_train = idx == 1
    is_test = idx == 0

    train, test = files[is_train], files[is_test]

    print("TRAIN SET: {0}".format(train))
    print("TEST SET: {0}".format(test))
    return train, test


def generate_train_test(df):
    """
    Pick the SIZE largest groups of `df`, shuffle their ids and split
    them 80-20 into train and test group ids

    Reads the module-level SIZE. Returns (train, test).
    """
    group_ids = identify_top_groups(df, SIZE)

    # The list is shuffled in place; the return value is not needed
    randomize_files(group_ids)

    n_train, n_test, files = prepare_split(group_ids)
    tags = generate_ids(n_train, n_test)

    return train_test_split(files, tags)


def sort_groups(df):
    """
    Within each mixed groups (hits + noise), we need to sort
    groups such that those with highest hits are selected first

    Reorders the rows of `df` group by group and, inside every group,
    by label in descending order (hits before noise). The index of the
    result is reset.

    DataFrame --> DataFrame
    """
    
    # Count the rows per (group, label) pair; the count column is named 'count'
    grouped_df = pd.DataFrame(df.groupby(['group', 'label'])['label'].count())
    grouped_df = grouped_df.rename(columns={'label':'count'})
    
    # Sort by count (ascending), then reorder on the label index level in
    # descending order so the label == 1 rows come first
    # NOTE(review): the within-label order that survives sort_index depends
    # on that sort preserving the previous count order - confirm the
    # resulting group order is the intended one
    grouped_sorted_df = grouped_df.sort_values(grouped_df.columns.tolist())\
                            .sort_index(level=1, ascending=False, sort_remaining=False)\
                            .reset_index()
    
    # Group ids in the order produced above
    sorted_groups_list = pd.DataFrame(grouped_sorted_df.group)

    # Keep only the first occurrence of every group id
    sorted_groups_list = sorted_groups_list.drop_duplicates()

    # Running count per group id (0 for every row once duplicates are dropped)
    sorted_groups_list['g'] = sorted_groups_list.groupby('group').cumcount()

    # NOTE: this binds a second name to the same frame, it is not a copy;
    # reset_index() below is what produces a new frame
    copy_df = df
    
    # Save original index positions as a column
    # assumes the index of df is unnamed so the new column is called 'index' - TODO confirm
    copy_df_indices = copy_df.reset_index()
    
    # Running position of every row inside its group
    copy_df_indices['group_count'] = copy_df_indices.groupby('group').cumcount() 
    
    # Merge the ordered group list with the full data. With no `on` given the
    # merge uses the columns both frames share ('group' here, assuming df has
    # no column named 'g'), so the rows follow the group order of the left
    # frame. The helper columns are dropped and the original index restored.
    copy_df = sorted_groups_list.merge(copy_df_indices)\
                                .set_index('index')\
                                .rename_axis(None)\
                                .drop(['group_count', 'g'], axis=1)
    
    # For each group (kept in order of first appearance), sort by label from 1 down to 0
    # NOTE(review): recent pandas versions warn about apply() operating on the
    # grouping column - verify 'group' is still present in the result on the
    # pandas version in use
    df = copy_df.groupby(['group'], sort=False)\
                 .apply(lambda x: x.sort_values(['label'], ascending=False))\
                 .reset_index(drop=True)
    
    return df    


def equal_sampling(df, points=6550):
    """
    Equally sample points from each timeslice group by keeping the
    first `points` rows of every group, in their current row order.

    df     : DataFrame with a 'group' column
    points : maximum number of rows kept per group. The default, 6550,
             is the min number of points in hits group

    The index of the result is reset.

    **Note**
    Sorting values on label beforehand ensures that max number of
    hits per group is taken.

    **For example**
    Group 1 has 6549 noise and 600 hits. Sorting first on hits will allow
    sample to have 600 hits and 5950 noise.
    Without any sorting, sample would have had 6549 noise and 1 hit
    """
    return df.groupby('group', sort=False)\
             .head(points)\
             .reset_index(drop=True)


def remove_groups(df, points=6550):
    """
    Remove groups that have less than `points` members per timeslice
    group

    df     : DataFrame with a 'group' column
    points : minimum number of rows a group needs to be kept
             (6550 by default)
    """
    g = df.groupby('group')

    return g.filter(lambda x: len(x) >= points)


def sample_noise_data(tag, df):
    """
    Sample noise based on specified tag

    tag : sampling strategy; only "equal" is supported
    df  : DataFrame of noise rows with a 'group' column

    For "equal", every group is capped by equal_sampling and groups
    that end up too small are then dropped by remove_groups.

    Raises ValueError for any other tag. Falling through used to return
    None, which only failed later when the result was used.
    """
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r}".format(tag))

    sampled_noise = equal_sampling(df)
    return remove_groups(sampled_noise)
    
    
def sample_mixed_data(tag, df):
    """
    Sample mixed data based on specified tag

    tag : sampling strategy; only "equal" is supported
    df  : DataFrame of mixed rows with 'group' and 'label' columns

    For "equal", the rows are first ordered by sort_groups and each
    group is then capped by equal_sampling.

    Raises ValueError for any other tag. Falling through used to return
    None, which only failed later when the result was used.
    """
    # Validate before sorting so an unsupported tag fails fast
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r}".format(tag))

    return equal_sampling(sort_groups(df))

In [4]:
def save_noise_test(df_noise, test, WRITE_NOISE_TEST):
    """
    Save unsampled noise test data, one "group_<id>.xyz" file per
    group id in `test`

    df_noise         : DataFrame with 'group', 'pos_x', 'pos_z' and
                       'time' columns
    test             : iterable of group ids to write
    WRITE_NOISE_TEST : output directory, with or without a trailing
                       separator; created when it does not exist yet

    Each file holds the pos_x, pos_z and time values of that group.
    """
    # An empty string means the current directory, which needs no creation
    if WRITE_NOISE_TEST:
        os.makedirs(WRITE_NOISE_TEST, exist_ok=True)

    for idx in test:
        file_name = "group_" + str(idx) + ".xyz"
        points = df_noise[df_noise.group == idx][['pos_x', 'pos_z', 'time']].values
        np.savetxt(os.path.join(WRITE_NOISE_TEST, file_name), points)

    print("All {0} files saved successfully in {1}!"
          .format(len(test), WRITE_NOISE_TEST))
    
def save_mixed_test(df_mixed, test, WRITE_MIXED_TEST):
    """
    Save unsampled mixed test data, one "group_<id>.xyz" file per
    group id in `test`

    df_mixed         : DataFrame with 'group', 'pos_x', 'pos_z' and
                       'time' columns
    test             : iterable of group ids to write
    WRITE_MIXED_TEST : output directory, with or without a trailing
                       separator; created when it does not exist yet

    Each file holds the pos_x, pos_z and time values of that group.
    """
    # An empty string means the current directory, which needs no creation
    if WRITE_MIXED_TEST:
        os.makedirs(WRITE_MIXED_TEST, exist_ok=True)

    for idx in test:
        file_name = "group_" + str(idx) + ".xyz"
        points = df_mixed[df_mixed.group == idx][['pos_x', 'pos_z', 'time']].values
        np.savetxt(os.path.join(WRITE_MIXED_TEST, file_name), points)

    print("All {0} files saved successfully in {1}!"
          .format(len(test), WRITE_MIXED_TEST))
    
    
def save_noise_train(sampled_df, train, WRITE_NOISE_TRAIN):
    """
    Save sampled noise train data, one "group_<id>.xyz" file per
    group id in `train`

    sampled_df        : DataFrame with 'group', 'pos_x', 'pos_z' and
                        'time' columns
    train             : iterable of group ids to write
    WRITE_NOISE_TRAIN : output directory, with or without a trailing
                        separator; created when it does not exist yet

    Each file holds the pos_x, pos_z and time values of that group. A
    group id with no rows in sampled_df still gets a file, written from
    an empty selection.
    """
    # An empty string means the current directory, which needs no creation
    if WRITE_NOISE_TRAIN:
        os.makedirs(WRITE_NOISE_TRAIN, exist_ok=True)

    for idx in train:
        file_name = "group_" + str(idx) + ".xyz"
        points = sampled_df[sampled_df.group == idx][['pos_x', 'pos_z', 'time']].values
        np.savetxt(os.path.join(WRITE_NOISE_TRAIN, file_name), points)

    print("All {0} files saved successfully in {1}!"
          .format(len(train), WRITE_NOISE_TRAIN))
    
    
def save_mixed_train(sampled_df, train, WRITE_MIXED_TRAIN):
    """
    Save sampled mixed train data, one "group_<id>.xyz" file per
    group id in `train`

    sampled_df        : DataFrame with 'group', 'pos_x', 'pos_z' and
                        'time' columns
    train             : iterable of group ids to write
    WRITE_MIXED_TRAIN : output directory, with or without a trailing
                        separator; created when it does not exist yet

    Each file holds the pos_x, pos_z and time values of that group.
    """
    # An empty string means the current directory, which needs no creation
    if WRITE_MIXED_TRAIN:
        os.makedirs(WRITE_MIXED_TRAIN, exist_ok=True)

    for idx in train:
        file_name = "group_" + str(idx) + ".xyz"
        points = sampled_df[sampled_df.group == idx][['pos_x', 'pos_z', 'time']].values
        np.savetxt(os.path.join(WRITE_MIXED_TRAIN, file_name), points)

    print("All {0} files saved successfully in {1}!"
          .format(len(train), WRITE_MIXED_TRAIN))

In [5]:
# Input CSV and the output directories for the per-group .xyz files.
# The output paths end with a separator because the save functions build
# file paths from them.
READ_PATH = "../../data/simplified_data.csv"
WRITE_NOISE_TRAIN = "../../data/ensemble/xzt/points/noise/train/"
WRITE_NOISE_TEST = "../../data/ensemble/xzt/points/noise/test/"

WRITE_MIXED_TRAIN =  "../../data/ensemble/xzt/points/mixed/train/"
WRITE_MIXED_TEST = "../../data/ensemble/xzt/points/mixed/test/"

# Number of largest groups kept per category before the train/test split
SIZE = 200

# Load the data, tag every group as single-label or mixed, then replace the
# per-group tag frames with the full rows belonging to each category
df = read_csv(READ_PATH)
df_noise, df_mixed = identify_groups(df)
df_noise, df_mixed = generate_dfs(df, df_noise, df_mixed)

In [6]:
####NOISE####
# Take the SIZE largest noise-only groups, shuffle their ids and split them
# 80-20 into train and test group ids
train, test = generate_train_test(df_noise)
# Keep the rows of the training groups only, then apply "equal" sampling
df_train_noise = df_noise[df_noise.group.isin(train)]
df_train_noise = sample_noise_data("equal", df_train_noise)

# Test groups are written from the unsampled frame, training groups from the
# sampled one
# NOTE(review): `train` still lists any group that the sampling step removed,
# so the train saver is called for it with an empty selection - confirm no
# such group exists or that this is acceptable
save_noise_test(df_noise, test, WRITE_NOISE_TEST)
save_noise_train(df_train_noise, train, WRITE_NOISE_TRAIN)

TRAIN SET: [5605 1405 1101 5182 1567  532  762  669 1346 1955 5103 5678 2869 5009
 6540 4874 5840  374 3219 1304 1639 4274  208 4989 3704    0 2272  788
 2957 4430 5097 6117 2751 3487 1202 5511 4775 4477  474 3180 6432 1016
 2658 1531  648 5006 1204 3403 4391 1620 5941 4640 3985 1129 4019 4671
 3123  134 6396 1413 1481 2093 5718 3974 5977 4199 4615 4426 3355  758
 4267 4081 5648  210 5905 6630 5491 4590 3253 1586 1681 2533 6427 3941
  576 5038 2932  999 2956 5982 4571 4328 5149  437 6048 2587 5434 3416
 3222 4799 4897 2872 3168 4969 3440 4870 1108 3815 5308 1151 3492 4060
 3042 2128 1374 2385 6560 5701 6132 3763  914 3609  432 6433 3943  601
 3925 5337 3152 4695 4900 1846 1189 6241 3968 3594  583 5510 6589 2414
  416 4591 1325 1055 4848 5991 4796 6139 5412 3569 5578 2834 4483 1583
 3215 3758 3352 6555  903 5922]
TEST SET: [2527 4444 2875 1667 4179 3189  630 5067 3340  449 1729 5222 6177 3060
 2719 2934 3507 3960 4463 5190 3792 2557 1246 1333 2621 3799  686 1391
 6181  990  301  909 403

In [7]:
####MIXED####
# Take the SIZE largest mixed groups, shuffle their ids and split them 80-20
# into train and test group ids (rebinds the `train` / `test` names used for
# the noise groups)
train, test = generate_train_test(df_mixed)
# Keep the rows of the training groups only, then apply "equal" sampling,
# which sorts the groups before capping them
df_train_mixed = df_mixed[df_mixed.group.isin(train)]
df_train_mixed = sample_mixed_data("equal", df_train_mixed)

# Test groups are written from the unsampled frame, training groups from the
# sampled one
save_mixed_test(df_mixed, test, WRITE_MIXED_TEST)
save_mixed_train(df_train_mixed, train, WRITE_MIXED_TRAIN)

TRAIN SET: [2517 5883 6426   36 6084 1214 1273 5866 2330 3310 3727  231 4282 5281
 1038 2338 4565 6542 5272  411 1657 2323 5295  797 4215 5215 1496 3611
   50 1454 6553 4793 2774 2099 2231 1653 4926 2161 2567 1813 1488 1344
 4227 5133 2404 4562  941 2654 5886 6530  810 6221 4791  584 5621 5280
 2642 4080 4735 3398 3094 3490 3315 3287 4854 1163 1732 5116 1637 3953
 2795 3170 3670 3072  530  979 6038 2759 4243 1836 5422 2362 6590  845
 4327 4417 1194 4034 5234 1172 4532 1825 3967 2374 3025 3924 6134  997
 5114 1937 6314   55 6495 1209 5847 2821 6567 1516  809 3483 6534 6203
 3923 3001  281 5795 1446  103 5032 1869 1276 1926 2890 5211 3252  394
  477 1820 4943 2024  355 2098 4740 3628 1918 4347 1154 4646 4301  491
 3700  716 4515 1599 1671 3845 1275 2021 5270 4679 6588 4428 4075 5725
  102 5857 1184 1484 6337 4653]
TEST SET: [1357 1658 4460  615 5812 5659 6378 2367 6493  166 4132 5747  375  249
 4569  554 5823  831 6109 3063 1973 5219 2211 5312 1650 5010 2389 1232
 5799  857 3514 6006 471