# xyt Train & Test Data Preparation

Here the data is prepared so that 6550 points per group, ordered with hits first, can be taken for training, and random points can be taken for testing.

Point Cloud NN requires data in the following scheme:

dataset

--- class1  

    --- train  
        * file.off
        * file.off
    
    --- test 
        * file.off
        * file.off
    

--- class2  

    --- train 
        * file.off
        * file.off
    
    --- test  
        * file.off
        * file.off

In [None]:
def print_df_stats(df_noise, df_mixed):
    """
    Print general stats on the noise-only and mixed groups.

    df_noise: DataFrame of noise-only groups ('group' and 'label' columns)
    df_mixed: DataFrame of mixed groups ('group' and 'label' columns, where
              label 1 is treated as a hit and label 0 as noise)

    The per-group hit and noise counts are sorted in descending order and
    truncated to the first SIZE entries (module-level constant).
    Nothing is returned; every result is printed.
    """
    noise_stats = df_noise.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()
    mixed_stats = df_mixed.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()

    # minlength=2 keeps the two-way unpacking valid when one class is absent
    neg, pos = np.bincount(df_mixed['label'], minlength=2)
    total = neg + pos

    hits = df_mixed[df_mixed.label == 1].groupby(['group', 'label'])['label']
    noise = df_mixed[df_mixed.label == 0].groupby(['group', 'label'])['label']

    hits_count = hits.count().sort_values(ascending=False)[:SIZE]
    noise_count = noise.count().sort_values(ascending=False)[:SIZE]

    # Drop the 'label' level so both series align on 'group' for the division
    hits_to_noise = hits_count.droplevel(level='label')/noise_count.droplevel(level='label')

    class_imbalance = df_mixed.groupby(['group', 'label'])['label'].count()[:20]

    print("NOISE STATS: \n{}\n".format(noise_stats))
    print("MIXED STATS: \n{}\n".format(mixed_stats))

    print("Mixed Groups: \n")
    print("Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(total,
                                                                             pos, 100 * pos / total))

    # The original format string had no placeholder, so the example was
    # silently dropped from the output.
    print("Example of class imbalance in mixed groups: \n {}".format(class_imbalance.head()))

    print("Hits Only (Within mixed groups):\n")
    print("The largest hits for a group: {}\n".format(hits_count.max()))
    print("The smallest hits for a group: {}\n".format(hits_count.min()))
    print("The mean hits for a group: {}\n".format(hits_count.mean()))

    print("Noise Only (Within mixed groups):\n")
    print("The largest noise for a group: {}\n".format(noise_count.max()))
    print("The smallest noise for a group: {}\n".format(noise_count.min()))
    print("The mean noise for a group: {}\n".format(noise_count.mean()))
    print("Hits-Noise Ratio:\n")
    print("The highest ratio: {:.2f}% ({})\n".format(hits_to_noise.max()*100,
                                                    hits_to_noise.max()))
    print("The smallest ratio: {:.2f}% ({}) \n".format(hits_to_noise.min()*100,
                                                       hits_to_noise.min()))
    print("The mean ratio: {:.2f}% ({})\n".format(hits_to_noise.mean()*100,
                                                  hits_to_noise.mean()))




def plot(df_mixed):
    """
    Generate plots of full data in mixed groups

    Draws one hexbin joint plot of pos_z vs time for rows with label 1
    (hits) and one for rows with label 0 (noise), then saves the current
    figure under ../../assets/.

    df_mixed: DataFrame with 'label', 'pos_z' and 'time' columns
    """
    pos_df = df_mixed[df_mixed.label == 1]
    neg_df = df_mixed[df_mixed.label == 0]

    # x/y are passed by keyword: newer seaborn releases reject positional
    # data arguments to jointplot.
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("Positive distribution (Hits) for pos_z vs time")
    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): plt.savefig writes only the current figure, i.e. the
    # second (noise) joint plot; the hits plot is not saved. Confirm intent.
    plt.savefig("../../assets/Distributon of points in Mixed Groups (posz-time)")
    

def sampled_plot(sampled_mixed):
    """
    Generate plots of equally sampled points in mixed groups

    Restricts sampled_mixed to the SIZE largest groups of the module-level
    df_mixed, then draws one hexbin joint plot of pos_z vs time for hits
    (label 1) and one for noise (label 0) and saves the current figure.

    sampled_mixed: DataFrame with 'group', 'label', 'pos_z', 'time' columns
    """
    top_group_counts = df_mixed.groupby('group')['label'].count().sort_values(ascending=False)[:SIZE]
    # The group ids live in the index; the values are row counts. Passing the
    # Series itself to isin() would compare group ids against the counts.
    data_subset = sampled_mixed.loc[sampled_mixed['group'].isin(top_group_counts.index)]
    pos_df = data_subset[data_subset.label == 1]
    neg_df = data_subset[data_subset.label == 0]

    # x/y are passed by keyword: newer seaborn releases reject positional
    # data arguments to jointplot.
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("EQUAL SAMPLING: Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("EQUAL SAMPLING: Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): only the current (noise) figure is written here.
    plt.savefig("../../assets/Distributon of Equally Sampled points in Mixed Groups (posz-time)")

In [1]:
import matplotlib
import pandas as pd
import numpy as np
from random import shuffle

In [2]:
def read_csv(READ_PATH):
    """
    Load the CSV file found at READ_PATH and hand it back as a DataFrame
    """
    frame = pd.read_csv(READ_PATH)
    return frame


def identify_groups(df):
    """
    Split the groups by how many distinct labels they contain.

    1. Groups with a single distinct label (the noise-only groups, [0])
    2. Groups with more than one distinct label (noise and hits, [0,1])

    Returns two frames (single-label, multi-label), each with one row per
    group and the columns 'group' and 'label' (array of distinct labels).
    """
    # One row per group, holding the array of distinct labels seen in it
    labels_per_group = df.groupby('group')['label'].unique().to_frame().reset_index()

    # Number of distinct labels for every group
    n_labels = labels_per_group['label'].map(len)

    single_label = labels_per_group[n_labels == 1]
    multi_label = labels_per_group[n_labels > 1]

    return single_label, multi_label


def generate_dfs(df, df_noise, df_mixed):
    """
    Pull the full rows of df that belong to the identified groups.

    df_noise / df_mixed only need a 'group' column listing the group ids;
    the returned pair holds the rows of df whose group appears in each.
    """
    in_noise_group = df['group'].isin(df_noise['group'])
    in_mixed_group = df['group'].isin(df_mixed['group'])

    return df[in_noise_group], df[in_mixed_group]


def identify_top_groups(df, SIZE):
    """
    Return the ids of the largest groups as a list.

    Groups are ranked by their number of (non-null) labels in descending
    order and the first SIZE of them are kept.

    **Note: SIZE == 200**
    """
    rows_per_group = df.groupby('group')['label'].count()
    ranked = rows_per_group.sort_values(ascending=False)
    largest = ranked[:SIZE]

    return list(largest.index)


def randomize_files(files):
    """
    Shuffles files in a random order

    The list is shuffled in place (callers that ignore the return value keep
    working) and the same list object is returned. The original returned the
    result of random.shuffle, which is always None.
    """
    shuffle(files)
    return files


def prepare_split(files):
    """
    Work out how many files go to each side of the split.

    **Note**
    Current setting is for 80-20

    Returns (number for the 80% part, number for the remainder,
    files converted to a numpy array).
    """
    n_files = len(files)
    n_major = int(0.8 * n_files)
    n_minor = int(n_files - n_major)

    return n_major, n_minor, np.array(files)


def generate_ids(eighty, twenty):
    """
    Build the split markers: `eighty` ones followed by `twenty` zeros
    """
    train_marks = np.ones(eighty)
    test_marks = np.zeros(twenty)
    return np.concatenate([train_marks, test_marks])


def train_test_split(files, idx):
    """
    Partition files using the markers in idx:
    entries marked 1 become training files,
    entries marked 0 become test files.
    Both sets are printed before being returned.
    """
    is_train = idx == 1
    is_test = idx == 0
    train, test = files[is_train], files[is_test]

    print("TRAIN SET: {0}".format(train))
    print("TEST SET: {0}".format(test))
    return train, test


def generate_train_test(df):
    """
    Pick the SIZE largest groups of df, shuffle them and split them 80-20
    into train and test group ids.
    """
    candidates = identify_top_groups(df, SIZE)
    # randomize_files shuffles the list in place, so its result is not needed
    randomize_files(candidates)
    n_train, n_test, files = prepare_split(candidates)
    markers = generate_ids(n_train, n_test)

    return train_test_split(files, markers)


def sort_groups(df):
    """
    Within each mixed groups (hits + noise), we need to sort
    groups such that those with highest hits are selected first

    Reorders the rows of df group by group (group order derived from the
    per-(group, label) counts) and, inside every group, puts the rows with
    the larger label first (label 1 before label 0).

    DataFrame --> DataFrame (same columns, fresh 0..n-1 index)

    Assumes df has 'group' and 'label' columns and an unnamed index
    (the code relies on reset_index() producing a column called 'index').
    """
    
    # Count rows per (group, label) pair; the count column is named 'label'
    # by pandas, so rename it to 'count' to avoid clashing with the index level
    grouped_df = pd.DataFrame(df.groupby(['group', 'label'])['label'].count())
    grouped_df = grouped_df.rename(columns={'label':'count'})
    
    # First sort by count (ascending), then by the 'label' index level in
    # descending order so that label-1 (hit) rows come before label-0 rows.
    # NOTE(review): whether the count ordering survives the second sort, and
    # whether it yields "highest hits first" as the docstring intends, should
    # be verified - the count sort is ascending.
    grouped_sorted_df = grouped_df.sort_values(grouped_df.columns.tolist())\
                            .sort_index(level=1, ascending=False, sort_remaining=False)\
                            .reset_index()
    
    # Keep only the 'group' column, in the order established above
    sorted_groups_list = pd.DataFrame(grouped_sorted_df.group)

    # Drop duplicate groups (a mixed group shows up once per label)
    sorted_groups_list = sorted_groups_list.drop_duplicates()

    # Helper column; after de-duplication every group occurs once, so this is
    # 0 everywhere. It is dropped again after the merge below.
    sorted_groups_list['g'] = sorted_groups_list.groupby('group').cumcount()

    # Bind a second name to the input; no copy is made here
    copy_df = df
    
    # reset_index() returns a new frame with the original index stored in an
    # 'index' column, so the input frame itself is left untouched
    copy_df_indices = copy_df.reset_index()
    
    # Running position of each row within its group (helper column, dropped
    # after the merge)
    copy_df_indices['group_count'] = copy_df_indices.groupby('group').cumcount() 
    
    # Merge on the columns both frames share ('group'), which pulls in the
    # full rows for the ordered groups; then restore the original index
    # values and remove the two helper columns
    copy_df = sorted_groups_list.merge(copy_df_indices)\
                                .set_index('index')\
                                .rename_axis(None)\
                                .drop(['group_count', 'g'], axis=1)
    
    # For each group (kept in its current order via sort=False), sort rows by
    # label from 1 down to 0, then discard the index in favour of 0..n-1.
    # NOTE(review): this relies on groupby.apply passing the 'group' column
    # through to the result - confirm against the pandas version in use.
    df = copy_df.groupby(['group'], sort=False)\
                 .apply(lambda x: x.sort_values(['label'], ascending=False))\
                 .reset_index(drop=True)
    
    return df    


def equal_sampling(df, points=6550):
    """
    Equally sample points from each timeslice group.
    Use 6550 - min number of points in hits group

    df:     DataFrame with a 'group' column
    points: maximum number of rows kept per group (default 6550)

    Returns the first `points` rows of every group, in the original row
    order, with a fresh 0..n-1 index. Groups with fewer rows are kept whole.

    **Note**
    Sorting values on label ensures that max number of
    hits per group is taken.

    **For example**
    Group 1 has 6549 noise and 600 hits. Sorting first on hits will allow
    sample to have 600 hits and 5950 noise.
    Without any sorting, sample would have had 6549 noise and 1 hit
    """
    return df.groupby('group', sort=False)\
             .head(points)\
             .reset_index(drop=True)


def remove_groups(df, points=6550):
    """
    Remove groups that have less than `points` members per timeslice
    group

    df:     DataFrame with a 'group' column
    points: minimum number of rows a group needs to be kept (default 6550)

    Returns the rows of df belonging to groups with at least `points` rows.
    """
    grouped = df.groupby('group')

    return grouped.filter(lambda members: len(members) >= points)


def sample_noise_data(tag, df):
    """
    Sample noise based on specified tag

    tag: sampling strategy; only "equal" is supported
    df:  DataFrame of noise groups

    For "equal", every group is cut to the sampling size and groups that
    end up smaller than that size are removed.

    Raises ValueError for any other tag. Previously an unknown tag fell
    through and returned None, which only failed later in the save step.
    """
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r} (expected 'equal')".format(tag))

    sampled_noise = equal_sampling(df)
    return remove_groups(sampled_noise)
    
    
def sample_mixed_data(tag, df):
    """
    Sample mixed data based on specified tag

    tag: sampling strategy; only "equal" is supported
    df:  DataFrame of mixed (hits + noise) groups

    For "equal", the rows are first ordered with sort_groups and then
    every group is cut to the sampling size.

    Raises ValueError for any other tag. Previously an unknown tag returned
    None after running the sort; the tag is now validated up front.
    """
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r} (expected 'equal')".format(tag))

    return equal_sampling(sort_groups(df))

In [3]:
def _save_groups(df, group_ids, write_dir):
    """
    Write one "group_<id>.xyz" file per id in group_ids.

    Each file holds the pos_x, pos_y and time values of the rows of df whose
    'group' equals that id. write_dir is prepended to the file name as-is,
    so it is expected to end with a path separator.

    NOTE(review): an id with no matching rows still produces an (empty)
    file and is counted in the success message - confirm this is intended.
    """
    for group_id in group_ids:
        file_name = "group_" + str(group_id) + ".xyz"
        points = df[df.group == group_id][['pos_x', 'pos_y', 'time']].values
        np.savetxt(write_dir + file_name, points)

    print("All {0} files saved successfully in {1}!"\
                  .format(len(group_ids), write_dir))


def save_noise_test(df_noise, test, WRITE_NOISE_TEST):
    """
    Save unsampled noise test data
    """
    _save_groups(df_noise, test, WRITE_NOISE_TEST)


def save_mixed_test(df_mixed, test, WRITE_MIXED_TEST):
    """
    Save unsampled mixed test data
    """
    _save_groups(df_mixed, test, WRITE_MIXED_TEST)


def save_noise_train(sampled_df, train, WRITE_NOISE_TRAIN):
    """
    Save sampled noise train data
    """
    _save_groups(sampled_df, train, WRITE_NOISE_TRAIN)


def save_mixed_train(sampled_df, train, WRITE_MIXED_TRAIN):
    """
    Save sampled mixed train data
    """
    _save_groups(sampled_df, train, WRITE_MIXED_TRAIN)

In [4]:
# Input CSV and the four output directories (noise/mixed x train/test).
# The save_* helpers prepend these strings to the file name as-is, so the
# trailing slash on each directory is required.
READ_PATH = "../../data/simplified_data.csv"
WRITE_NOISE_TRAIN = "../../data/ensemble/points/noise/train/"
WRITE_NOISE_TEST = "../../data/ensemble/points/noise/test/"

WRITE_MIXED_TRAIN =  "../../data/ensemble/points/mixed/train/"
WRITE_MIXED_TEST = "../../data/ensemble/points/mixed/test/"

# Number of largest groups considered per class before the 80-20 split
SIZE = 200

df = read_csv(READ_PATH)
# First pass: one row per group, tagged single-label (noise) vs multi-label (mixed)
df_noise, df_mixed = identify_groups(df)
# Second pass: the same names are rebound to the full rows of those groups
df_noise, df_mixed = generate_dfs(df, df_noise, df_mixed)

In [5]:
####NOISE####
# Split the largest noise groups into train/test ids, equally sample the
# train groups, then write test groups unsampled and train groups sampled.
train, test = generate_train_test(df_noise)
df_train_noise = df_noise[df_noise.group.isin(train)]
df_train_noise = sample_noise_data("equal", df_train_noise)

# NOTE(review): sample_noise_data drops groups below the sampling size, but
# save_noise_train still iterates over every id in `train`; a dropped group
# would be written as an empty file. Confirm this cannot happen for the data.
save_noise_test(df_noise, test, WRITE_NOISE_TEST)
save_noise_train(df_train_noise, train, WRITE_NOISE_TRAIN)

TRAIN SET: [2385  990  432  914 1567 1189 1481 3060 3974 5511 3355 5067 2621 2128
  686 5308    0 5905 5434 3941 6555 3042 1202  788 6132 4647 4477 2093
 6433  416 5605 4590 6589 4391 3492 3609  347 3222 3189 1016  474  648
 4019 1325 3416  437 5097 2834 2414 6241 1846 1204 5009  134 6630 5149
  208 5922 2932 1681 6333 5038  301 4848 6117 2533 6436 1304 3507 5648
 1639 4199 3219 5678 5222 4615  601 4032 2272 5718 5412 2957 5510 5550
  762 3123 2197 4969 6177 3594 6540  449  909 4870 5840 1333 1729 4775
 6432 5982 4267 2956 3704 2875 4796 4900 6048 2527 5977 6139 2872 3152
 2719 6560 5182  630 1413 6396 4081  190 1620 6181 5941 4591 2934 5991
 1586 3440  999  758 4897  374 1246 3487 1955 2869 1055 4671 4426 1346
 3968 4060 4444 3215 5103 3799 3925 3815 3352 3403  669 1129 1531 3168
 3758  210 1374 4571 3569 1667]
TEST SET: [4430 3985 3943 5190 1391 3792 4483 4179 1405 4695 2751 3253 2658 4328
 3340 4274 3180 3960 4640 4799 1108  576 3763 5578 4989 4463 1151 1583
 5337 2557 5701 6427 549

In [6]:
####MIXED####
# Split the largest mixed groups into train/test ids, order and equally
# sample the train groups, then write test groups unsampled and train
# groups sampled. `train` and `test` from the noise cell are rebound here.
train, test = generate_train_test(df_mixed)
df_train_mixed = df_mixed[df_mixed.group.isin(train)]
df_train_mixed = sample_mixed_data("equal", df_train_mixed)

save_mixed_test(df_mixed, test, WRITE_MIXED_TEST)
save_mixed_train(df_train_mixed, train, WRITE_MIXED_TRAIN)

TRAIN SET: [6567 2161 4034 2795 6221 6038 6493 5234 1637 1836 3094 3670  411  231
 5295 3845 5032 5133 5280 5114 4562  797 1937 5886 3287  281  988 2890
 2099 4646 4327  554 1154 4132 1926 2711 6134 4515 4075 4532  809  979
  166 6109  249 6084 4080 6426 6588 1653 6542 2362 2338 1657 2098 5847
 1194 3490 2231 4735  845 1184  941 2821 5219  831 5272 5747 1273 3063
 5312 1446 1454   50 5010 4227 5857 3514 1584  584 1275  530 4717 6378
 6534 4791 4926 4653 2389 5823 5270 4958 2517 2774 3924   32 3628 6314
  355 3072 3953 3483 6553   36 5422 2374 1599  491 5659 3398 1650 3025
 1820 2367 4569 1357 4793 5866 5725 1484 6495 2330 3700 6006 1671 5281
 3727 2323 4943 2211 4347 5799 4215 1813 2759 1209 3923  103  997 4679
  394 2024 4460 1516 4428 5116 2567 5883 4740  102  716 5795 4565 2021
 4243 3310 1276 1918 2642 1973]
TEST SET: [1344 2404 1825 4282 1658 2185 1893 4301 1163 1496 2654 6337 1172 5621
 1232 5812 1732 3315 3252  810 3170 4417  375  857 5215 3611 6203 6590
 1869 6530 1214 1038 148