# xyt Train & Test Data Preparation

Here the data is prepared so that, for training, 6550 points can be taken from each group with the points ordered hits-first, while random points can be taken for testing.

In [1]:
def print_df_stats(df_noise, df_mixed):
    """
    Print general stats on the noise-only and the mixed (hits + noise) groups

    df_noise: DataFrame with 'group' and 'label' columns (noise-only groups)
    df_mixed: DataFrame with 'group' and 'label' columns (mixed groups);
              the label column must contain exactly the values 0 and 1,
              otherwise unpacking the bincount below fails

    The per-group hit / noise counts are limited to the SIZE largest
    entries, where SIZE is read from the module-level constant.
    Nothing is returned; all results are printed.
    """
    # Summary statistics of the number of rows per group
    noise_stats = df_noise.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()
    mixed_stats = df_mixed.groupby('group')['label']\
                    .count()\
                    .sort_values(ascending=False)\
                    .describe()

    # Number of rows labelled 0 (neg) and 1 (pos) over all mixed groups
    neg, pos = np.bincount(df_mixed['label'])
    total = neg + pos

    hits = df_mixed[df_mixed.label == 1].groupby(['group', 'label'])['label']
    noise = df_mixed[df_mixed.label == 0].groupby(['group', 'label'])['label']

    # Largest SIZE per-group counts of hits and of noise
    hits_count = hits.count().sort_values(ascending=False)[:SIZE]
    noise_count = noise.count().sort_values(ascending=False)[:SIZE]

    # Dropping the 'label' level lets the division align on 'group' alone
    hits_to_noise = hits_count.droplevel(level='label')/noise_count.droplevel(level='label')

    class_imbalance = df_mixed.groupby(['group', 'label'])['label'].count()[:20]

    print("NOISE STATS: \n{}\n".format(noise_stats))
    print("MIXED STATS: \n{}\n".format(mixed_stats))

    print("Mixed Groups: \n")
    print("Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(total,
                                                                             pos, 100 * pos / total))

    # The format string needs a placeholder; without one str.format silently
    # discards its argument and the example table is never shown
    print("Example of class imbalance in mixed groups: \n {}".format(class_imbalance.head()))

    print("Hits Only (Within mixed groups):\n")
    print("The largest hits for a group: {}\n".format(hits_count.max()))
    print("The smallest hits for a group: {}\n".format(hits_count.min()))
    print("The mean hits for a group: {}\n".format(hits_count.mean()))

    print("Noise Only (Within mixed groups):\n")
    print("The largest noise for a group: {}\n".format(noise_count.max()))
    print("The smallest noise for a group: {}\n".format(noise_count.min()))
    print("The mean noise for a group: {}\n".format(noise_count.mean()))
    print("Hits-Noise Ratio:\n")
    print("The highest ratio: {:.2f}% ({})\n".format(hits_to_noise.max()*100,
                                                    hits_to_noise.max()))
    print("The smallest ratio: {:.2f}% ({}) \n".format(hits_to_noise.min()*100,
                                                       hits_to_noise.min()))
    print("The mean ratio: {:.2f}% ({})\n".format(hits_to_noise.mean()*100,
                                                  hits_to_noise.mean()))




def plot(df_mixed):
    """
    Generate hexbin joint plots (pos_z vs time) of the full data in mixed groups

    df_mixed: DataFrame with 'label', 'pos_z' and 'time' columns

    One joint plot is drawn for the rows labelled 1 (hits) and one for
    the rows labelled 0 (noise); afterwards plt.savefig is called once.
    """
    pos_df = df_mixed[df_mixed.label == 1]
    neg_df = df_mixed[df_mixed.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # the data vectors positionally, while older ones accept both forms
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("Positive distribution (Hits) for pos_z vs time")
    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): plt.savefig writes the current figure only, so presumably
    # just the noise plot drawn last ends up in this file - confirm
    plt.savefig("../../assets/Distributon of points in Mixed Groups (posz-time)")
    

def sampled_plot(sampled_mixed):
    """
    Generate hexbin joint plots (pos_z vs time) of equally sampled points
    in mixed groups

    sampled_mixed: DataFrame with 'group', 'label', 'pos_z' and 'time' columns

    Only rows belonging to the SIZE largest groups of the module-level
    df_mixed frame are plotted (SIZE is a module-level constant as well).
    """
    top_group_counts = df_mixed.groupby('group')['label'].count().sort_values(ascending=False)[:SIZE]

    # The group ids live in the index of the counts Series. Passing the
    # Series itself to isin() would compare group ids against the row
    # COUNTS (its values) and select the wrong rows.
    top_group_ids = top_group_counts.index
    data_subset = sampled_mixed.loc[sampled_mixed['group'].isin(top_group_ids)]
    pos_df = data_subset[data_subset.label == 1]
    neg_df = data_subset[data_subset.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # the data vectors positionally, while older ones accept both forms
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("EQUAL SAMPLING: Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("EQUAL SAMPLING: Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): plt.savefig writes the current figure only, so presumably
    # just the noise plot drawn last ends up in this file - confirm
    plt.savefig("../../assets/Distributon of Equally Sampled points in Mixed Groups (posz-time)")

In [2]:
import matplotlib
import pandas as pd
import numpy as np
from random import shuffle

In [57]:
def read_csv(READ_PATH):
    """
    Load the CSV located at READ_PATH and hand it back as a DataFrame
    """
    frame = pd.read_csv(READ_PATH)
    return frame


def identify_groups(df):
    """
    Split the groups by the number of distinct labels they contain

    1. Groups with exactly one distinct label are returned first
       (treated as the noise-only groups)
    2. Groups with more than one distinct label are returned second
       (the mixed noise + hits groups)

    Both results hold one row per group: the 'group' id and, in 'label',
    the array of distinct labels found in that group
    """
    # One row per group together with the distinct labels seen in it
    labels_per_group = df.groupby('group')['label'].unique().to_frame().reset_index()

    # How many distinct labels every group has
    n_labels = np.array([len(found) for found in labels_per_group.label.values])

    single_label_groups = labels_per_group.loc[n_labels == 1]
    multi_label_groups = labels_per_group.loc[n_labels > 1]

    return single_label_groups, multi_label_groups


def generate_dfs(df, df_noise, df_mixed):
    """
    Select the full rows of df for the identified groups

    Returns the rows of df whose group is listed in df_noise.group and
    the rows of df whose group is listed in df_mixed.group
    """
    in_noise_groups = df['group'].isin(df_noise['group'])
    in_mixed_groups = df['group'].isin(df_mixed['group'])

    return df[in_noise_groups], df[in_mixed_groups]


def identify_top_groups(df, SIZE):
    """
    List the ids of the SIZE groups holding the most rows,
    largest group first

    **Note: SIZE == 200**
    """
    rows_per_group = df.groupby('group')['label'].count()
    largest_first = rows_per_group.sort_values(ascending=False)

    return largest_first[:SIZE].index.tolist()


def randomize_files(files):
    """
    Shuffles files in a random order

    The list is shuffled in place, so callers that ignore the return
    value still see the new order. The same (now shuffled) list object
    is also returned; random.shuffle itself always returns None.
    """
    shuffle(files)
    return files


def prepare_split(files, train_fraction=0.8):
    """
    Identifies the ratios for files to be split into

    files: sequence of items to be split
    train_fraction: share of the items assigned to the first part,
                    must lie within [0, 1]

    Returns (number of items in the first part,
             number of remaining items,
             files converted to a numpy array)

    **Note**
    Default setting is for 80-20
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    # Truncating keeps the first part from exceeding its share
    eighty = int(train_fraction * len(files))
    twenty = int(len(files) - eighty)
    files = np.array(files)

    return eighty, twenty, files


def generate_ids(eighty, twenty):
    """
    Build the tag array: `eighty` ones followed by `twenty` zeros
    """
    train_tags = np.ones(eighty)
    test_tags = np.zeros(twenty)
    return np.concatenate([train_tags, test_tags])


def train_test_split(files, idx):
    """
    Split files by their tag in idx

    Entries tagged 1 form the training files,
    entries tagged 0 form the test files.
    Both sets are printed and then returned as (train, test)
    """
    is_train = idx == 1
    is_test = idx == 0

    train, test = files[is_train], files[is_test]

    print("TRAIN SET: " + str(train))
    print("TEST SET: " + str(test))
    return train, test


def generate_train_test(df, tag):
    """
    Pick SIZE groups from df and split them into train and test group ids

    For tag "noise" the groups come from identify_top_groups; for any
    other tag the first SIZE unique groups of sort_groups(df) are used.
    The chosen groups are shuffled in place by randomize_files before
    the 80-20 split. SIZE is read from the module-level constant.
    """
    if tag != "noise":
        candidates = list(sort_groups(df).group.unique()[:SIZE])
    else:
        candidates = identify_top_groups(df, SIZE)

    # Return value is not used; the list is reordered in place
    randomize_files(candidates)

    n_train, n_test, group_ids = prepare_split(candidates)
    split_tags = generate_ids(n_train, n_test)

    return train_test_split(group_ids, split_tags)


def sort_groups(df):
    """
    Within each mixed groups (hits + noise), we need to sort
    groups such that those with highest hits are selected first
    
    DataFrame --> DataFrame

    Takes a frame with 'group' and 'label' columns and returns a frame
    with a fresh 0..n-1 index in which the rows of every group are
    contiguous, the groups follow the order derived from the per-label
    counts below, and within each group the rows are sorted by label
    in descending order (label 1 before label 0).
    """
    
    # group by groups and labels and associated counts for each label
    grouped_df = pd.DataFrame(df.groupby(['group', 'label'])['label'].count())
    grouped_df = grouped_df.rename(columns={'label':'count'})
    
    # Order the (group, label) rows: first by count (ascending), then by
    # the label index level in descending order so label-1 rows come first.
    # NOTE(review): the relative order of groups with the same label relies
    # on the second sort keeping the count order, and that count order is
    # ascending - confirm this really puts the groups with most hits first
    grouped_sorted_df = grouped_df.sort_values(grouped_df.columns.tolist())\
                            .sort_index(level=1, ascending=False, sort_remaining=False)\
                            .reset_index()
    
    # Keep only the group column, in the sorted order obtained above
    sorted_groups_list = pd.DataFrame(grouped_sorted_df.group)

    # Drop duplicate groups
    sorted_groups_list = sorted_groups_list.drop_duplicates()

    # Count the occurances of groups (should only occur once)
    sorted_groups_list['g'] = sorted_groups_list.groupby('group').cumcount()

    # Second name for the input frame (this is an alias, not a copy)
    copy_df = df
    
    # Save original index positions as a column
    copy_df_indices = copy_df.reset_index()
    
    # Running row number within each group (helper column, dropped again below)
    copy_df_indices['group_count'] = copy_df_indices.groupby('group').cumcount() 
    
    # Merge the ordered group list with the full rows so that the rows follow
    # the group order; no key is given, so the merge uses the column names
    # both frames share (presumably only 'group' - verify against the input
    # schema). The original index is restored and the helper columns removed.
    copy_df = sorted_groups_list.merge(copy_df_indices)\
                                .set_index('index')\
                                .rename_axis(None)\
                                .drop(['group_count', 'g'], axis=1)
    
    # For each group, sort by labels within each group starting from 1 till 0
    # (sort=False keeps the groups in their current order of appearance)
    # NOTE(review): newer pandas versions may change whether the grouping
    # column is passed to / returned from apply - confirm 'group' survives
    df = copy_df.groupby(['group'], sort=False)\
                 .apply(lambda x: x.sort_values(['label'], ascending=False))\
                 .reset_index(drop=True)
    
    return df    


def equal_sampling(df, points=6550):
    """
    Equally sample points from each timeslice group by keeping at most
    `points` rows per group, taken in the order the rows currently have.

    df: DataFrame with a 'group' column
    points: maximum number of rows kept per group
            (default 6550 - min number of points in hits group)

    Returns a new DataFrame with a fresh 0..n-1 index; groups holding
    fewer than `points` rows are kept unchanged.

    **Note**
    Sorting values on label beforehand ensures that max number of
    hits per group is taken.

    **For example**
    Group 1 has 6549 noise and 600 hits. Sorting first on hits will allow
    sample to have 600 hits and 5950 noise.
    Without any sorting, sample would have had 6549 noise and 1 hit
    """
    # sort=False: no need to order the group keys, head() keeps row order
    return df.groupby('group', sort=False)\
             .head(points)\
             .reset_index(drop=True)


def remove_groups(df, points=6550):
    """
    Remove groups that have less than `points` members per timeslice
    group

    df: DataFrame with a 'group' column
    points: minimum number of rows a group needs to be kept (default 6550)

    Returns the rows of df that belong to groups with at least
    `points` rows.
    """
    g = df.groupby('group')

    return g.filter(lambda x: len(x) >= points)


def sample_noise_data(tag, df):
    """
    Sample the noise data according to tag

    Only the "equal" tag is handled: every group is capped through
    equal_sampling and the result is passed through remove_groups.
    Any other tag yields None.
    """
    if tag != "equal":
        return None

    capped_noise = equal_sampling(df)
    return remove_groups(capped_noise)
    
    
def sample_mixed_data(tag, df):
    """
    Sample the mixed data according to tag

    The frame is always ordered with sort_groups first. For the "equal"
    tag the ordered frame is then capped per group via equal_sampling;
    any other tag yields None.
    """
    ordered = sort_groups(df)

    return equal_sampling(ordered) if tag == "equal" else None

In [4]:
def save_noise_test(df_noise, test, WRITE_NOISE_TEST):
    """
    Write the unsampled noise test data, one file per group

    For every group id in test, the pos_x / pos_y / time values of that
    group are stored with np.savetxt as WRITE_NOISE_TEST + "group_<id>.xyz"
    """
    xyt_columns = ['pos_x', 'pos_y', 'time']

    for group_id in test:
        target = WRITE_NOISE_TEST + "group_" + str(group_id) + ".xyz"
        group_rows = df_noise.loc[df_noise.group == group_id, xyt_columns]
        np.savetxt(target, group_rows.values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(test), WRITE_NOISE_TEST))
    
def save_mixed_test(df_mixed, test, WRITE_MIXED_TEST):
    """
    Write the unsampled mixed test data, one file per group

    For every group id in test, the pos_x / pos_y / time values of that
    group are stored with np.savetxt as WRITE_MIXED_TEST + "group_<id>.xyz"
    """
    xyt_columns = ['pos_x', 'pos_y', 'time']

    for group_id in test:
        target = WRITE_MIXED_TEST + "group_" + str(group_id) + ".xyz"
        group_rows = df_mixed.loc[df_mixed.group == group_id, xyt_columns]
        np.savetxt(target, group_rows.values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(test), WRITE_MIXED_TEST))
    
    
def save_noise_train(sampled_df, train, WRITE_NOISE_TRAIN):
    """
    Write the sampled noise train data, one file per group

    For every group id in train, the pos_x / pos_y / time values of that
    group are stored with np.savetxt as WRITE_NOISE_TRAIN + "group_<id>.xyz"
    """
    xyt_columns = ['pos_x', 'pos_y', 'time']

    for group_id in train:
        target = WRITE_NOISE_TRAIN + "group_" + str(group_id) + ".xyz"
        group_rows = sampled_df.loc[sampled_df.group == group_id, xyt_columns]
        np.savetxt(target, group_rows.values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(train), WRITE_NOISE_TRAIN))
    
    
def save_mixed_train(sampled_df, train, WRITE_MIXED_TRAIN):
    """
    Write the sampled mixed train data, one file per group

    For every group id in train, the pos_x / pos_y / time values of that
    group are stored with np.savetxt as WRITE_MIXED_TRAIN + "group_<id>.xyz"
    """
    xyt_columns = ['pos_x', 'pos_y', 'time']

    for group_id in train:
        target = WRITE_MIXED_TRAIN + "group_" + str(group_id) + ".xyz"
        group_rows = sampled_df.loc[sampled_df.group == group_id, xyt_columns]
        np.savetxt(target, group_rows.values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(train), WRITE_MIXED_TRAIN))

In [44]:
# Input CSV and output directories. The save_* helpers build file names by
# plain string concatenation, so every directory must keep its trailing "/".
READ_PATH = "../../../data/simplified_data.csv"
WRITE_NOISE_TRAIN = "../../../data/sampled/xyt/points/noise/train/"
WRITE_NOISE_TEST = "../../../data/sampled/xyt/points/noise/test/"

WRITE_MIXED_TRAIN =  "../../../data/sampled/xyt/points/mixed/train/"
WRITE_MIXED_TEST = "../../../data/sampled/xyt/points/mixed/test/"

# Number of groups selected per category before the train / test split
SIZE = 200

# Load the data, then separate it into the rows of single-label (noise)
# groups and the rows of mixed (noise + hits) groups. The first pair of
# frames only lists the groups; generate_dfs replaces them with full rows.
df = read_csv(READ_PATH)
df_noise, df_mixed = identify_groups(df)
df_noise, df_mixed = generate_dfs(df, df_noise, df_mixed)

In [55]:
####NOISE####
# Choose the SIZE noise groups with the most rows, shuffle them and split
# their ids 80-20 into train and test
train, test = generate_train_test(df_noise, "noise")
# Training rows: capped per group, then groups below the cap are dropped
df_train_noise = df_noise[df_noise.group.isin(train)]
df_train_noise = sample_noise_data("equal", df_train_noise)

# Test groups are written unsampled, train groups from the sampled frame.
# NOTE(review): train still lists the ids of groups that remove_groups may
# have dropped; for those the selection would be empty - confirm whether
# empty train files are intended
save_noise_test(df_noise, test, WRITE_NOISE_TEST)
save_noise_train(df_train_noise, train, WRITE_NOISE_TRAIN)

RAND:  None
80/20: 
  [4590 5222 1325 5103 1481 5922  686 3974 6333 4571 3168 3985 2385 3253
 3943 6436 3219 2751 3215  474 3968 4060  903 4799 1055  788 3569 3222
 4870 3594 1639 5337 1667 6589 1405 6560 2869 6181 1955 4426 3355 3440
 5941  190 3487 5067 1346  762  648 3340 4274  347 4444  416 1204  758
 2621 4848 3925 2128 5511 3815 2956  301  601  576 5905  374 1202 4430
 4267  449 5412 1681  999 5190 5605  990  630  134  669 5038 3492 1189
 3792 4695 5648 3799 6630 5434 5009 4391 3507 3042  437 4796 6177 3960
 4647 2093  583 3704  432 4615 4640 3416 6432  532  208 5510 6396 4477
 2719 6132 1620 5578  909 2587 1246 4775 5678  210 5701 2658 6555 1586
 5491 5097 4081 5182 5550 1129 1151 1374 2557 3189 1729 4199 6427 5840
 3403 2533 1531 5308 6433 4900 3758 1846 4969 3123 6117 3763 1413 3060
 4671 2934 4463 6241 4179 4989 2872 6139 5991 4032 1333 1101 5977  914
 5149 5006 4328 3152 4483 3352 5718 4897 1016 2834 1583 4591 1391 1108
 3941 2414 2932 4874 2957 3609 4019 5982 6048 3180 2272

In [56]:
####MIXED####
# Choose the first SIZE groups in sort_groups order, shuffle them and split
# their ids 80-20 into train and test
train, test = generate_train_test(df_mixed, "mixed")
# Training rows: re-ordered by sort_groups (label 1 before label 0 within
# each group), then capped per group; no group is dropped here
df_train_mixed = df_mixed[df_mixed.group.isin(train)]
df_train_mixed = sample_mixed_data("equal", df_train_mixed)

# Test groups are written unsampled, train groups from the sampled frame
save_mixed_test(df_mixed, test, WRITE_MIXED_TEST)
save_mixed_train(df_train_mixed, train, WRITE_MIXED_TRAIN)

TOP GROUPS:  [615, 1637, 5866, 5857, 1232, 2021, 554, 4301, 1273, 6495, 3170, 6588, 4958, 3398, 5747, 2211, 4565, 1599, 716, 3483, 4515, 6109, 6530, 5010, 5886, 3727, 5823, 375, 2323, 4735, 477, 1209, 2338, 2759, 1732, 4347, 1276, 3490, 1869, 2330, 3025, 2231, 979, 6378, 6038, 1214, 5234, 5312, 6534, 1584, 4282, 1671, 1344, 5799, 797, 5215, 1275, 1488, 5295, 1658, 6221, 4080, 4327, 1653, 1973, 4717, 2711, 355, 2654, 4243, 3923, 3001, 5114, 4679, 5621, 1918, 6553, 6314, 6542, 4532, 5422, 1194, 1038, 1484, 3310, 2362, 1813, 4428, 5725, 5116, 1516, 4926, 2024, 4417, 6567, 1496, 6084, 1163, 281, 3953, 4227, 5281, 584, 3670, 2367, 5270, 530, 1650, 3845, 5883, 4943, 3700, 2098, 4793, 32, 1357, 4646, 3287, 5133, 857, 941, 809, 1446, 1454, 50, 4215, 6006, 1836, 5032, 4034, 787, 3611, 4562, 2821, 1184, 4023, 5151, 2731, 3252, 1141, 1887, 3063, 6470, 1825, 2795, 2185, 3924, 2374, 3094, 4783, 4620, 5211, 5931, 440, 55, 249, 2389, 2517, 6054, 5812, 5815, 2567, 4854, 6493, 103, 231, 810, 4336, 691,