# xyt Train & Test Data Preparation

This notebook prepares the data so that each training group contributes 6550 points, ordered with hits first, while the test groups are chosen at random and saved unsampled.

In [1]:
def print_df_stats(df_noise, df_mixed, size=None):
    """
    Print general stats on the noise-only and mixed groups.

    Parameters
    ----------
    df_noise : DataFrame with 'group' and 'label' columns (noise-only groups)
    df_mixed : DataFrame with 'group' and 'label' columns; 'label' must hold
        non-negative integers because it is fed to np.bincount
    size : number of largest per-group hit / noise counts that are kept for
        the hit, noise and ratio summaries. Defaults to the module-level SIZE.

    Nothing is returned; every statistic is written to stdout.
    """
    if size is None:
        size = SIZE

    # describe() does not depend on row order, so the counts are not sorted first
    noise_stats = df_noise.groupby('group')['label'].count().describe()
    mixed_stats = df_mixed.groupby('group')['label'].count().describe()

    # minlength=2 keeps the two-way unpacking valid when no row is labelled 1
    neg, pos = np.bincount(df_mixed['label'], minlength=2)
    total = neg + pos

    hits = df_mixed[df_mixed.label == 1].groupby(['group', 'label'])['label']
    noise = df_mixed[df_mixed.label == 0].groupby(['group', 'label'])['label']

    hits_count = hits.count().sort_values(ascending=False).iloc[:size]
    noise_count = noise.count().sort_values(ascending=False).iloc[:size]

    # Division aligns on the group id; a group present in only one of the two
    # rankings yields NaN, which max/min/mean below skip.
    hits_to_noise = (hits_count.droplevel(level='label')
                     / noise_count.droplevel(level='label'))

    class_imbalance = df_mixed.groupby(['group', 'label'])['label'].count().iloc[:20]

    print("NOISE STATS: \n{}\n".format(noise_stats))
    print("MIXED STATS: \n{}\n".format(mixed_stats))

    print("Mixed Groups: \n")
    print("Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(
        total, pos, 100 * pos / total))

    # The placeholder was missing, so the example was never actually printed
    print("Example of class imbalance in mixed groups: \n{}\n".format(
        class_imbalance.head()))

    print("Hits Only (Within mixed groups):\n")
    print("The largest hits for a group: {}\n".format(hits_count.max()))
    print("The smallest hits for a group: {}\n".format(hits_count.min()))
    print("The mean hits for a group: {}\n".format(hits_count.mean()))

    print("Noise Only (Within mixed groups):\n")
    print("The largest noise for a group: {}\n".format(noise_count.max()))
    print("The smallest noise for a group: {}\n".format(noise_count.min()))
    print("The mean noise for a group: {}\n".format(noise_count.mean()))

    print("Hits-Noise Ratio:\n")
    print("The highest ratio: {:.2f}% ({})\n".format(hits_to_noise.max() * 100,
                                                    hits_to_noise.max()))
    print("The smallest ratio: {:.2f}% ({}) \n".format(hits_to_noise.min() * 100,
                                                       hits_to_noise.min()))
    print("The mean ratio: {:.2f}% ({})\n".format(hits_to_noise.mean() * 100,
                                                  hits_to_noise.mean()))




def plot(df_mixed):
    """
    Generate hexbin joint plots of pos_z against time for the full data
    in mixed groups: one figure for hits (label == 1) and one for noise
    (label == 0).

    Relies on `sns` and `plt` being available at module level (presumably
    seaborn and matplotlib.pyplot -- confirm they are imported before use).

    NOTE(review): savefig acts on the current figure only, so the saved file
    appears to hold just the noise plot -- confirm that this is intended.
    """
    pos_df = df_mixed[df_mixed.label == 1]
    neg_df = df_mixed[df_mixed.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while older releases accept the keyword form too.
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'], kind='hex')
    plt.suptitle("Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'], kind='hex')
    plt.suptitle("Negative distribution (Noise) for pos_z vs time")

    plt.savefig("../../assets/Distributon of points in Mixed Groups (posz-time)")
    

def sampled_plot(sampled_mixed):
    """
    Generate hexbin joint plots of pos_z against time for the equally
    sampled points in mixed groups.

    Only rows of `sampled_mixed` that belong to the SIZE largest groups of
    the module-level `df_mixed` are plotted. Relies on `sns` and `plt` being
    available at module level (presumably seaborn and matplotlib.pyplot --
    confirm they are imported before use).

    NOTE(review): savefig acts on the current figure only, so the saved file
    appears to hold just the noise plot -- confirm that this is intended.
    """
    group_sizes = df_mixed.groupby('group')['label'].count()\
                          .sort_values(ascending=False)[:SIZE]

    # The group ids live in the index; isin() on the Series itself would
    # compare group ids against the row counts instead.
    top_group_ids = group_sizes.index
    data_subset = sampled_mixed.loc[sampled_mixed['group'].isin(top_group_ids)]

    pos_df = data_subset[data_subset.label == 1]
    neg_df = data_subset[data_subset.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while older releases accept the keyword form too.
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'], kind='hex')
    plt.suptitle("EQUAL SAMPLING: Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'], kind='hex')
    plt.suptitle("EQUAL SAMPLING: Negative distribution (Noise) for pos_z vs time")

    plt.savefig("../../assets/Distributon of Equally Sampled points in Mixed Groups (posz-time)")

In [2]:
import matplotlib
import pandas as pd
import numpy as np
from random import shuffle

In [3]:
def read_csv(READ_PATH):
    """
    Load the CSV file found at READ_PATH and hand it back as a DataFrame
    """
    frame = pd.read_csv(READ_PATH)
    return frame


def identify_groups(df):
    """
    1. Tag groups that have only noise [0]
    2. Tag groups that have both noise and hits [0,1]
    3. Separate noise groups and hits+noise groups

    Returns (df_noise, df_mixed): two frames with a 'group' column and a
    'label' column holding the array of unique labels of that group.
    A group whose only label is not 0 (e.g. hits only) is placed in
    neither frame, so it cannot leak into the noise set.
    """
    # One row per group with the array of distinct labels seen in it
    labels_per_group = df.groupby('group')['label'].unique().reset_index()

    n_labels = labels_per_group['label'].map(len)
    # unique() never yields an empty array, so element 0 always exists
    first_label = labels_per_group['label'].map(lambda labels: labels[0])

    # Obtain groups with only noise: a single distinct label, and it is 0
    df_noise = labels_per_group.loc[(n_labels == 1) & (first_label == 0)]

    # Obtain groups with noise && hits: more than one distinct label
    df_mixed = labels_per_group.loc[n_labels > 1]

    return df_noise, df_mixed


def generate_dfs(df, df_noise, df_mixed):
    """
    Select from the full dataframe the rows whose group is listed in
    the noise-only summary and, separately, the rows whose group is
    listed in the mixed summary
    """
    in_noise_groups = df['group'].isin(df_noise['group'])
    in_mixed_groups = df['group'].isin(df_mixed['group'])

    return df[in_noise_groups], df[in_mixed_groups]


def identify_top_groups(df, SIZE):
    """
    Rank the groups by their number of labelled rows, largest first,
    and return the ids of the first SIZE groups as a list

    **Note: SIZE == 200**
    """
    rows_per_group = df.groupby('group')['label'].count()
    ranked = rows_per_group.sort_values(ascending=False)

    return ranked.iloc[:SIZE].index.tolist()


def randomize_files(files):
    """
    Shuffles files in a random order

    The list is shuffled in place and the same list object is returned,
    so callers may either use the return value or keep using `files`.
    """
    # random.shuffle works in place and returns None, so its result
    # must not be returned directly.
    shuffle(files)
    return files


def prepare_split(files):
    """
    Work out how many files go to each side of the split and
    convert the files to a numpy array

    **Note**
    Current setting is for 80-20
    """
    n_files = len(files)
    eighty = int(n_files * 0.8)
    # whatever is not in the 80% share goes to the remaining share
    twenty = n_files - eighty

    return eighty, twenty, np.array(files)


def generate_ids(eighty, twenty):
    """
    Build a flag array: `eighty` ones followed by `twenty` zeros
    (1 marks the 80% share of the files, 0 the 20% share)
    """
    train_flags = np.ones(eighty)
    test_flags = np.zeros(twenty)

    return np.concatenate([train_flags, test_flags])


def train_test_split(files, idx):
    """
    Split files by their flag in idx: entries flagged 1 form the
    training set, entries flagged 0 form the test set.
    Both sets are printed before being returned.
    """
    is_train = idx == 1
    is_test = idx == 0

    train, test = files[is_train], files[is_test]

    for message, selection in (("TRAIN SET: {0}", train),
                               ("TEST SET: {0}", test)):
        print(message.format(selection))

    return train, test


def generate_train_test(df):
    """
    Take the SIZE largest groups of df, shuffle them and split the
    shuffled group ids into a train part and a test part
    """
    groups = identify_top_groups(df, SIZE)

    # random.shuffle inside randomize_files reorders the list in place
    randomize_files(groups)

    n_train, n_test, group_array = prepare_split(groups)
    flags = generate_ids(n_train, n_test)

    return train_test_split(group_array, flags)


def sort_groups(df):
    """
    Reorder the rows of a mixed dataframe (hits + noise) group by group.

    The groups are first put in an order derived from their per-label row
    counts; then, inside every group, rows are sorted by label in
    descending order so that rows labelled 1 come before rows labelled 0.

    DataFrame --> DataFrame (fresh 0..n-1 index)
    """
    
    # Count the rows for every (group, label) pair. The count column is
    # renamed to 'count'; it would otherwise share its name with the 'label'
    # index level when reset_index is called below.
    grouped_df = pd.DataFrame(df.groupby(['group', 'label'])['label'].count())
    grouped_df = grouped_df.rename(columns={'label':'count'})
    
    # Sort by count (ascending, the sort_values default), then order by index
    # level 1 (the label) descending without sorting the remaining level.
    # NOTE(review): the ascending count sort does not obviously match the
    # "highest hits first" intent, and whether the count order survives the
    # sort_index step depends on that sort being stable -- confirm.
    grouped_sorted_df = grouped_df.sort_values(grouped_df.columns.tolist())\
                            .sort_index(level=1, ascending=False, sort_remaining=False)\
                            .reset_index()
    
    # Keep only the group column, in the order produced above
    sorted_groups_list = pd.DataFrame(grouped_sorted_df.group)

    # Drop duplicate groups (drop_duplicates keeps the first occurrence)
    sorted_groups_list = sorted_groups_list.drop_duplicates()

    # Running count per group; every group occurs once here, so 'g' is 0
    # everywhere. The column is dropped again after the merge.
    sorted_groups_list['g'] = sorted_groups_list.groupby('group').cumcount()

    # NOTE: this binds a second name to the same dataframe, it is not a copy
    copy_df = df
    
    # Save original index positions as a column
    # (assumes the input index is unnamed so the new column is called 'index')
    copy_df_indices = copy_df.reset_index()
    
    # Running row number inside each group; dropped again after the merge
    copy_df_indices['group_count'] = copy_df_indices.groupby('group').cumcount() 
    
    # Merge the ordered group list with the full rows. No `on` is given, so
    # the merge uses the columns the two frames share -- presumably only
    # 'group', provided df has no column named 'g'. The original index is
    # then restored and the two helper columns are removed.
    copy_df = sorted_groups_list.merge(copy_df_indices)\
                                .set_index('index')\
                                .rename_axis(None)\
                                .drop(['group_count', 'g'], axis=1)
    
    # For each group (kept in order of first appearance, sort=False), sort by
    # label descending, then discard the index
    df = copy_df.groupby(['group'], sort=False)\
                 .apply(lambda x: x.sort_values(['label'], ascending=False))\
                 .reset_index(drop=True)
    
    return df    


def equal_sampling(df, points=6550):
    """
    Equally sample points from each timeslice group.
    Use 6550 - min number of points in hits group

    Parameters
    ----------
    df : DataFrame with a 'group' column
    points : maximum number of rows kept per group (default 6550)

    Returns a DataFrame holding the first `points` rows of every group,
    in their original row order, with a fresh 0..n-1 index. Groups with
    fewer rows are kept whole.

    **Note**
    Sorting values on label ensures that max number of
    hits per group is taken.

    **For example**
    Group 1 has 6549 noise and 600 hits. Sorting first on hits will allow
    sample to have 600 hits and 5950 noise.
    Without any sorting, sample would have had 6549 noise and 1 hit
    """
    return df.groupby('group', sort=False)\
             .head(points)\
             .reset_index(drop=True)


def remove_groups(df, points=6550):
    """
    Remove groups that have less than `points` members per timeslice
    group

    Parameters
    ----------
    df : DataFrame with a 'group' column
    points : minimum number of rows a group needs to be kept (default 6550)

    Returns the rows of df that belong to groups with at least `points` rows.
    """
    g = df.groupby('group')

    return g.filter(lambda x: len(x) >= points)


def sample_noise_data(tag, df):
    """
    Sample noise based on specified tag

    Parameters
    ----------
    tag : sampling strategy; only "equal" is supported
    df : noise DataFrame with a 'group' column

    Returns the equally sampled noise data with undersized groups removed.

    Raises
    ------
    ValueError : if tag is not a supported strategy (previously the
        function fell through and returned None without any warning)
    """
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r}".format(tag))

    sampled_noise = equal_sampling(df)
    # Groups that could not supply the full number of points are dropped
    equal_sampled_noise = remove_groups(sampled_noise)

    return equal_sampled_noise
    
    
def sample_mixed_data(tag, df):
    """
    Sample mixed data based on specified tag

    Parameters
    ----------
    tag : sampling strategy; only "equal" is supported
    df : mixed DataFrame with 'group' and 'label' columns

    Returns the equally sampled data, taken after sort_groups has
    reordered the rows.

    Raises
    ------
    ValueError : if tag is not a supported strategy (previously the
        function fell through and returned None without any warning)
    """
    # Validate first so an unsupported tag fails before the costly sort
    if tag != "equal":
        raise ValueError("Unsupported sampling tag: {!r}".format(tag))

    df = sort_groups(df)

    return equal_sampling(df)

In [4]:
def save_noise_test(df_noise, test, WRITE_NOISE_TEST):
    """
    Write one .xyz text file per test group containing the
    pos_x, pos_y and time values of that group's unsampled noise rows
    """
    columns = ['pos_x', 'pos_y', 'time']

    for group_id in test:
        in_group = df_noise.group == group_id
        target = WRITE_NOISE_TEST + "group_" + str(group_id) + ".xyz"
        np.savetxt(target, df_noise[in_group][columns].values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(test), WRITE_NOISE_TEST))
    
def save_mixed_test(df_mixed, test, WRITE_MIXED_TEST):
    """
    Write one .xyz text file per test group containing the
    pos_x, pos_y and time values of that group's unsampled mixed rows
    """
    columns = ['pos_x', 'pos_y', 'time']

    for group_id in test:
        in_group = df_mixed.group == group_id
        target = WRITE_MIXED_TEST + "group_" + str(group_id) + ".xyz"
        np.savetxt(target, df_mixed[in_group][columns].values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(test), WRITE_MIXED_TEST))
    
    
def save_noise_train(sampled_df, train, WRITE_NOISE_TRAIN):
    """
    Write one .xyz text file per training group containing the
    pos_x, pos_y and time values of that group's sampled noise rows
    """
    columns = ['pos_x', 'pos_y', 'time']

    for group_id in train:
        in_group = sampled_df.group == group_id
        target = WRITE_NOISE_TRAIN + "group_" + str(group_id) + ".xyz"
        np.savetxt(target, sampled_df[in_group][columns].values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(train), WRITE_NOISE_TRAIN))
    
    
def save_mixed_train(sampled_df, train, WRITE_MIXED_TRAIN):
    """
    Write one .xyz text file per training group containing the
    pos_x, pos_y and time values of that group's sampled mixed rows
    """
    columns = ['pos_x', 'pos_y', 'time']

    for group_id in train:
        in_group = sampled_df.group == group_id
        target = WRITE_MIXED_TRAIN + "group_" + str(group_id) + ".xyz"
        np.savetxt(target, sampled_df[in_group][columns].values)

    summary = "All {0} files saved successfully in {1}!"
    print(summary.format(len(train), WRITE_MIXED_TRAIN))

In [5]:
# Number of largest groups kept for the noise and for the mixed category
SIZE = 200

# Input CSV
READ_PATH = "../../data/simplified_data.csv"

# Output directories, one per (category, split) pair
WRITE_NOISE_TRAIN = "../../data/ensemble/xyt/points/noise/train/"
WRITE_NOISE_TEST = "../../data/ensemble/xyt/points/noise/test/"
WRITE_MIXED_TRAIN = "../../data/ensemble/xyt/points/mixed/train/"
WRITE_MIXED_TEST = "../../data/ensemble/xyt/points/mixed/test/"

# Load everything, tag each group as noise-only or mixed, then pull the
# full rows of both categories out of the complete dataframe
df = read_csv(READ_PATH)
noise_group_ids, mixed_group_ids = identify_groups(df)
df_noise, df_mixed = generate_dfs(df, noise_group_ids, mixed_group_ids)

In [6]:
####NOISE####
# Split the largest noise-only groups into train and test group ids
train, test = generate_train_test(df_noise)

# Only the training groups are sampled; test groups are written unsampled
noise_train_rows = df_noise[df_noise.group.isin(train)]
df_train_noise = sample_noise_data("equal", noise_train_rows)

save_noise_test(df_noise, test, WRITE_NOISE_TEST)
save_noise_train(df_train_noise, train, WRITE_NOISE_TRAIN)

TRAIN SET: [2957  374 3925 3152 1101 4799 3985  788 1189 2719 4591 5511  669 5006
 5922 6436 6139 2875  210  474 5337 4328 1325 5067 3974 4796 5648 5149
  601 4274 1204 6132 3941  909 4477 1016 5905 5840 4081 4870  583 4060
 2272  648 3340 4848 4615 6241 1846 1567 3704 4032 5190 5434 1202 2621
 1391 5701 3758 6177 5718 6630 2934 1129 2557 1955 6181 4391 2956 1681
 1531 5991 1304 5103 4989 4199 2527    0  301 3352 2834 5510 4969  437
 6560 3943 1639 1346 1108 2093  576 5038 1405 3960 4483 6432 4019 5941
  449  208  758 3815 6333 3215 5977 5982 5605  630 3180 4640 6048  903
 1481 5009 4426 3219 3189 3763  914 2197 4897 2751 2872  686 4444 3123
 4571 3060 2533 3507 6427 3355 5182  416 2587 2128 3594 4695 3492 4671
 6433 5222 2869 5578 3440  999  532 4590 4874 2658 1374  432 2932 5308
 5412  990 1246 1583 6117 2414]
TEST SET: [3168 3968 3403  347 1729 5491 1586 4647 5097 3042 6589 3416 3253  762
 4775 4267 3487 4430  190 3222 3792 3799  134 3569 1333 2385 6396 5678
 6555 1620 1151 4463 105

In [7]:
####MIXED####
# Split the largest mixed groups into train and test group ids
train, test = generate_train_test(df_mixed)

# Only the training groups are sorted and sampled; test groups are
# written unsampled
mixed_train_rows = df_mixed[df_mixed.group.isin(train)]
df_train_mixed = sample_mixed_data("equal", mixed_train_rows)

save_mixed_test(df_mixed, test, WRITE_MIXED_TEST)
save_mixed_train(df_train_mixed, train, WRITE_MIXED_TRAIN)

TRAIN SET: [  50  103 3845  166 6038 1275 4227 1599 2367 2517 5857 4282 1038 2098
  530  831 5010  979 2795 6084 1973 5032 3727 1893 4791 4515 5847 4428
 2890 5799 3628 5659 3923 3967 1194 2374 2211  554 5422 1516 5272 1836
 1172 2567 4080 1163 4679 4132 4943 3483 2323  281 4717  857 4301 4327
 4075 6534 1650  716  491   32 5795 5211  845  809 3490 5747  231  997
 5295 6378 1209 2024 3001   55 1488  249 6134 6109 1918 6493 5219 3063
 1584 2021 5270  615 6553 6314  411 6495 2711 2389 2642 3611 1232 1671
 4532 3924 2231 6203 6426 4417 1344 4215 4243 2404 1869 6530 1484 1820
 4562 1937 2185 3398  355 2099 3700 4569 1273 1637 4735 3514 5280 2338
 1496  375  797  941 4926 1813 2821 5133 4347 6590 3315 2654 5116 5866
 4646 1357 5281  394  102 3287 5234 4565 6337 5215 5883 6006 5725 3953
 2759 3094 2362 4653  584 1926]
TEST SET: [4793 6567 4854 1276 1446 4034 2330 1154 1454 3025 6542 3310 1184 6588
 1653 2774 4740 5812  810 3670 3252 5312 1732 1214  988 5823 1658 2161
 1825 5886 6221 3170 165

In [9]:
# Non-null value counts per label over all mixed groups, before any sampling
df_mixed.groupby('label').count()

Unnamed: 0_level_0,pos_x,pos_y,pos_z,time,group
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,26996090,26996090,26996090,26996090,26996090
1,489906,489906,489906,489906,489906


In [10]:
# Non-null value counts per label in the sampled mixed training set
df_train_mixed.groupby('label').count()

Unnamed: 0_level_0,group,pos_x,pos_y,pos_z,time
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,935417,935417,935417,935417,935417
1,112583,112583,112583,112583,112583
