# yzt Train & Test Data Preparation

Here the data is prepared so that, for training, 6550 points per group can be taken (ordered with hits first), while randomly selected groups are kept, unsampled, for testing.

In [1]:
def print_df_stats(df_noise, df_mixed, size=None):
    """
    Print general stats on the noise-only and the mixed groups.

    Parameters
    ----------
    df_noise : DataFrame
        Rows of the noise-only groups; needs 'group' and 'label' columns.
    df_mixed : DataFrame
        Rows of the mixed groups; needs 'group' and 'label' columns, with
        integer labels 0 (noise) and 1 (hit).
    size : int, optional
        How many of the largest per-group hit / noise counts are kept for
        the min / max / mean and ratio figures. When omitted, the
        module-level ``SIZE`` constant is used.

    Returns
    -------
    None. Everything is written to stdout.
    """
    if size is None:
        size = SIZE

    # describe() does not depend on row order, so no sorting is needed here
    noise_stats = df_noise.groupby('group')['label'].count().describe()
    mixed_stats = df_mixed.groupby('group')['label'].count().describe()

    # minlength=2 keeps the unpacking valid when one of the classes is absent
    neg, pos = np.bincount(df_mixed['label'], minlength=2)
    total = neg + pos

    hits = df_mixed[df_mixed.label == 1].groupby(['group', 'label'])['label']
    noise = df_mixed[df_mixed.label == 0].groupby(['group', 'label'])['label']

    # Largest `size` per-group counts (explicit positional slicing)
    hits_count = hits.count().sort_values(ascending=False).iloc[:size]
    noise_count = noise.count().sort_values(ascending=False).iloc[:size]

    # Dropping the 'label' level lets the division align on 'group' alone
    hits_to_noise = (hits_count.droplevel(level='label')
                     / noise_count.droplevel(level='label'))

    class_imbalance = df_mixed.groupby(['group', 'label'])['label'].count().iloc[:20]

    print("NOISE STATS: \n{}\n".format(noise_stats))
    print("MIXED STATS: \n{}\n".format(mixed_stats))

    print("Mixed Groups: \n")
    print("Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n".format(
        total, pos, 100 * pos / total))

    # The original format string had no placeholder, so the example was
    # silently dropped from the output
    print("Example of class imbalance in mixed groups: \n{}\n".format(
        class_imbalance.head()))

    print("Hits Only (Within mixed groups):\n")
    print("The largest hits for a group: {}\n".format(hits_count.max()))
    print("The smallest hits for a group: {}\n".format(hits_count.min()))
    print("The mean hits for a group: {}\n".format(hits_count.mean()))

    print("Noise Only (Within mixed groups):\n")
    print("The largest noise for a group: {}\n".format(noise_count.max()))
    print("The smallest noise for a group: {}\n".format(noise_count.min()))
    print("The mean noise for a group: {}\n".format(noise_count.mean()))

    print("Hits-Noise Ratio:\n")
    print("The highest ratio: {:.2f}% ({})\n".format(hits_to_noise.max() * 100,
                                                    hits_to_noise.max()))
    print("The smallest ratio: {:.2f}% ({}) \n".format(hits_to_noise.min() * 100,
                                                       hits_to_noise.min()))
    print("The mean ratio: {:.2f}% ({})\n".format(hits_to_noise.mean() * 100,
                                                  hits_to_noise.mean()))




def plot(df_mixed):
    """
    Generate plots of full data in mixed groups

    Draws one hexbin joint plot of pos_z vs time for the rows labelled 1
    (hits) and one for the rows labelled 0 (noise), then saves the current
    figure under ../../assets/.

    Parameters
    ----------
    df_mixed : DataFrame
        Needs 'label', 'pos_z' and 'time' columns.
    """
    pos_df = df_mixed[df_mixed.label == 1]
    neg_df = df_mixed[df_mixed.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, and the keyword form works on older releases too
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): plt.savefig writes only the current figure, which is
    # presumably the negative (noise) plot drawn last; confirm whether the
    # positive plot should be saved too
    plt.savefig("../../assets/Distributon of points in Mixed Groups (posz-time)")
    

def sampled_plot(sampled_mixed):
    """
    Generate plots of equally sampled points in mixed groups

    The SIZE largest groups are looked up in the module-level ``df_mixed``
    frame, and only rows of ``sampled_mixed`` belonging to those groups are
    plotted (pos_z vs time, hits and noise separately).

    Parameters
    ----------
    sampled_mixed : DataFrame
        Needs 'group', 'label', 'pos_z' and 'time' columns.
    """
    group_sizes = df_mixed.groupby('group')['label'].count()\
                          .sort_values(ascending=False)\
                          .iloc[:SIZE]
    # The group ids live in the index; the Series values are the row counts.
    # Passing the Series itself to isin() would match ids against counts.
    top_group_ids = group_sizes.index
    data_subset = sampled_mixed.loc[sampled_mixed['group'].isin(top_group_ids)]
    pos_df = data_subset[data_subset.label == 1]
    neg_df = data_subset[data_subset.label == 0]

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, and the keyword form works on older releases too
    sns.jointplot(x=pos_df['pos_z'], y=pos_df['time'],
                  kind='hex')
    plt.suptitle("EQUAL SAMPLING: Positive distribution (Hits) for pos_z vs time")

    sns.jointplot(x=neg_df['pos_z'], y=neg_df['time'],
                  kind='hex')
    _ = plt.suptitle("EQUAL SAMPLING: Negative distribution (Noise) for pos_z vs time")

    # NOTE(review): only the current (last drawn) figure is written here
    plt.savefig("../../assets/Distributon of Equally Sampled points in Mixed Groups (posz-time)")

In [1]:
import matplotlib
import pandas as pd
import numpy as np
from random import shuffle

In [2]:
def read_csv(READ_PATH):
    """
    Load the CSV found at READ_PATH and hand back the resulting DataFrame
    """
    frame = pd.read_csv(READ_PATH)
    return frame


def identify_groups(df):
    """
    Split the groups by how many distinct labels they contain.

    Returns two frames with one row per group ('group' column plus a
    'label' column holding that group's distinct labels):
    1. groups with exactly one distinct label (treated as noise-only)
    2. groups with more than one distinct label (noise and hits)
    """
    # One row per group, listing the distinct labels seen in that group
    labels_per_group = df.groupby('group')['label'].unique().to_frame().reset_index()

    # Number of distinct labels for every group
    n_distinct = np.array([len(found) for found in labels_per_group.label.values])

    # Single-label groups
    single_label = labels_per_group.loc[n_distinct == 1]

    # Groups holding more than one label
    multi_label = labels_per_group.loc[n_distinct > 1]

    return single_label, multi_label


def generate_dfs(df, df_noise, df_mixed):
    """
    Pull the full rows of df for the groups listed in df_noise and for
    the groups listed in df_mixed, returned as (noise rows, mixed rows)
    """
    in_noise = df['group'].isin(df_noise['group'])
    in_mixed = df['group'].isin(df_mixed['group'])

    return df.loc[in_noise], df.loc[in_mixed]


def identify_top_groups(df, SIZE):
    """
    Return the ids of the SIZE groups with the most labelled rows,
    largest first

    **Note: SIZE == 200**
    """
    rows_per_group = df.groupby('group')['label'].count()
    largest_first = rows_per_group.sort_values(ascending=False)
    selected = largest_first[:SIZE]

    return list(selected.index)


def randomize_files(files):
    """
    Shuffles files in a random order

    The list is shuffled in place (existing callers rely on that) and the
    same list object is returned. random.shuffle itself returns None, so
    returning its result handed callers None instead of the files.
    """
    shuffle(files)
    return files


def prepare_split(files):
    """
    Work out how many files go to each side of the split and
    convert the files to a numpy array

    **Note**
    Current setting is for 80-20
    """
    n_files = len(files)
    eighty = int(0.8 * n_files)
    twenty = int(n_files - eighty)

    return eighty, twenty, np.array(files)


def generate_ids(eighty, twenty):
    """
    Build a flat tag array: `eighty` ones followed by `twenty` zeros
    """
    train_tags = np.ones(eighty)
    test_tags = np.zeros(twenty)
    return np.concatenate([train_tags, test_tags])


def train_test_split(files, idx):
    """
    Entries of files whose tag in idx equals 1 become the training set,
    entries whose tag equals 0 become the test set; both sets are printed
    and returned as (train, test)
    """
    is_train = idx == 1
    is_test = idx == 0
    train, test = files[is_train], files[is_test]

    print("TRAIN SET: {0}".format(train))
    print("TEST SET: {0}".format(test))
    return train, test


def generate_train_test(df):
    """
    Pick the top SIZE groups of df, shuffle them and split the group ids
    80-20 into (train, test)
    """
    candidates = identify_top_groups(df, SIZE)
    # Shuffles `candidates` in place; the return value is not used
    randomize_files(candidates)
    n_train, n_test, group_ids = prepare_split(candidates)
    tags = generate_ids(n_train, n_test)

    return train_test_split(group_ids, tags)


def sort_groups(df):
    """
    Within each mixed groups (hits + noise), we need to sort
    groups such that those with highest hits are selected first

    The rows of df are re-emitted group by group, in the group order
    derived from the per-(group, label) counts below. Inside every group
    the rows are sorted by label in descending order, so rows labelled 1
    come before rows labelled 0. The result carries a fresh 0..n-1 index.

    DataFrame --> DataFrame
    """
    
    # group by groups and labels and associated counts for each label
    grouped_df = pd.DataFrame(df.groupby(['group', 'label'])['label'].count())
    grouped_df = grouped_df.rename(columns={'label':'count'})
    
    # sort groups based on highest occurance of hits
    # Step 1 orders the (group, label) rows by 'count'; step 2 reorders them
    # by the label index level, descending, so label-1 rows come first.
    # NOTE(review): sort_values() is ascending by default and sort_index() is
    # not called with a stable sort kind, so the order of groups within one
    # label may not be "highest hits first" -- confirm against real data.
    grouped_sorted_df = grouped_df.sort_values(grouped_df.columns.tolist())\
                            .sort_index(level=1, ascending=False, sort_remaining=False)\
                            .reset_index()
    
    # Obtain list of groups with highest hits based on sorted order
    sorted_groups_list = pd.DataFrame(grouped_sorted_df.group)

    # Drop duplicate groups
    # (each group can appear once per label; the first appearance is kept)
    sorted_groups_list = sorted_groups_list.drop_duplicates()

    # Count the occurances of groups (should only occur once)
    sorted_groups_list['g'] = sorted_groups_list.groupby('group').cumcount()

    # Make copy of dataframe
    # NOTE: this only binds a second name to the same object; no copy is made.
    # reset_index() below is what produces a new frame.
    copy_df = df
    
    # Save original index positions as a column
    copy_df_indices = copy_df.reset_index()
    
    # Make a count of occurances for each group
    copy_df_indices['group_count'] = copy_df_indices.groupby('group').cumcount() 
    
    # Merge the list of groups with the partial df to obtain corresponding full dataframe
    # merge() is given no `on`, so it joins on the columns both frames share
    # (presumably only 'group' -- verify df has no column named 'g').
    # The original index is then restored and the helper columns are dropped.
    copy_df = sorted_groups_list.merge(copy_df_indices)\
                                .set_index('index')\
                                .rename_axis(None)\
                                .drop(['group_count', 'g'], axis=1)
    
    # For each group, sort by labels within each group starting from 1 till 0
    # sort=False keeps the groups in the order in which they appear in copy_df
    df = copy_df.groupby(['group'], sort=False)\
                 .apply(lambda x: x.sort_values(['label'], ascending=False))\
                 .reset_index(drop=True)
    
    return df    


def equal_sampling(df, points=6550):
    """
    Equally sample points from each timeslice group.
    Use 6550 - min number of points in hits group

    Parameters
    ----------
    df : DataFrame
        Needs a 'group' column. Rows must already be in the order in which
        they should be taken.
    points : int, optional
        Maximum number of rows kept per group (default 6550).

    Returns
    -------
    DataFrame with the first `points` rows of every group, in the original
    row order, with a fresh 0..n-1 index. Groups with fewer rows are kept
    whole.

    **Note**
    Sorting values on label ensures that max number of
    hits per group is taken.

    **For example**
    Group 1 has 6549 noise and 600 hits. Sorting first on hits will allow
    sample to have 600 hits and 5950 noise.
    Without any sorting, sample would have had 6549 noise and 1 hit
    """
    return df.groupby('group', sort=False)\
             .head(points)\
             .reset_index(drop=True)


def remove_groups(df, points=6550):
    """
    Remove groups that have less than `points` members per timeslice
    group

    Parameters
    ----------
    df : DataFrame
        Needs a 'group' column.
    points : int, optional
        Minimum number of rows a group must have to be kept (default 6550).

    Returns
    -------
    DataFrame holding only the rows of groups with at least `points` rows.
    """
    grouped = df.groupby('group')

    return grouped.filter(lambda members: len(members) >= points)


def sample_noise_data(tag, df):
    """
    Sample the noise frame according to tag.

    Only the "equal" tag is handled: rows are equally sampled per group and
    groups that are too small are then removed. Any other tag yields None.
    """
    if tag != "equal":
        return None

    return remove_groups(equal_sampling(df))
    
    
def sample_mixed_data(tag, df):
    """
    Sample the mixed frame according to tag.

    The frame is always run through sort_groups first. Only the "equal"
    tag is handled (equal sampling per group); any other tag yields None.
    """
    ordered = sort_groups(df)

    if tag != "equal":
        return None

    return equal_sampling(ordered)

In [4]:
def save_noise_test(df_noise, test, WRITE_NOISE_TEST):
    """
    Write one "group_<id>.xyz" text file per test group id, holding the
    pos_y / pos_z / time values of that group's rows in df_noise
    """
    columns = ['pos_y', 'pos_z', 'time']
    for group_id in test:
        target = WRITE_NOISE_TEST + ("group_" + str(group_id) + ".xyz")
        rows = df_noise.loc[df_noise.group == group_id, columns]
        np.savetxt(target, rows.values)

    print("All {0} files saved successfully in {1}!"\
                  .format(len(test), WRITE_NOISE_TEST))
    
def save_mixed_test(df_mixed, test, WRITE_MIXED_TEST):
    """
    Write one "group_<id>.xyz" text file per test group id, holding the
    pos_y / pos_z / time values of that group's rows in df_mixed
    """
    columns = ['pos_y', 'pos_z', 'time']
    for group_id in test:
        target = WRITE_MIXED_TEST + ("group_" + str(group_id) + ".xyz")
        rows = df_mixed.loc[df_mixed.group == group_id, columns]
        np.savetxt(target, rows.values)

    print("All {0} files saved successfully in {1}!"\
                  .format(len(test), WRITE_MIXED_TEST))
    
    
def save_noise_train(sampled_df, train, WRITE_NOISE_TRAIN):
    """
    Write one "group_<id>.xyz" text file per train group id, holding the
    pos_y / pos_z / time values of that group's rows in sampled_df
    """
    columns = ['pos_y', 'pos_z', 'time']
    for group_id in train:
        target = WRITE_NOISE_TRAIN + ("group_" + str(group_id) + ".xyz")
        rows = sampled_df.loc[sampled_df.group == group_id, columns]
        np.savetxt(target, rows.values)

    print("All {0} files saved successfully in {1}!"\
                .format(len(train), WRITE_NOISE_TRAIN))
    
    
def save_mixed_train(sampled_df, train, WRITE_MIXED_TRAIN):
    """
    Write one "group_<id>.xyz" text file per train group id, holding the
    pos_y / pos_z / time values of that group's rows in sampled_df
    """
    columns = ['pos_y', 'pos_z', 'time']
    for group_id in train:
        target = WRITE_MIXED_TRAIN + ("group_" + str(group_id) + ".xyz")
        rows = sampled_df.loc[sampled_df.group == group_id, columns]
        np.savetxt(target, rows.values)

    print("All {0} files saved successfully in {1}!"\
                .format(len(train), WRITE_MIXED_TRAIN))

In [3]:
# CSV file that is read into `df` below
READ_PATH = "../../data/simplified_data.csv"
# Output directories for the noise-only groups (train / test)
WRITE_NOISE_TRAIN = "../../data/ensemble/yzt/points/noise/train/"
WRITE_NOISE_TEST = "../../data/ensemble/yzt/points/noise/test/"

# Output directories for the mixed (hits + noise) groups (train / test)
WRITE_MIXED_TRAIN =  "../../data/ensemble/yzt/points/mixed/train/"
WRITE_MIXED_TEST = "../../data/ensemble/yzt/points/mixed/test/"

# Number of largest groups selected before the train / test split
SIZE = 200

df = read_csv(READ_PATH)
# First: one row per group, tagged by how many distinct labels it has.
# Then: the same two names are rebound to the full rows of those groups.
df_noise, df_mixed = identify_groups(df)
df_noise, df_mixed = generate_dfs(df, df_noise, df_mixed)

In [6]:
####NOISE####
# Split the top-SIZE noise-only groups into train / test group ids
train, test = generate_train_test(df_noise)
# Keep only the rows of the training groups, then sample them ("equal" tag)
df_train_noise = df_noise[df_noise.group.isin(train)]
df_train_noise = sample_noise_data("equal", df_train_noise)

# Test groups are written from the full, unsampled noise frame
save_noise_test(df_noise, test, WRITE_NOISE_TEST)
# NOTE(review): sample_noise_data drops groups with fewer than 6550 rows,
# but `train` still lists their ids, so a file is presumably still written
# for them with no rows -- confirm this is intended
save_noise_train(df_train_noise, train, WRITE_NOISE_TRAIN)

TRAIN SET: [2957 6432 3440 3925 4848 3403 6427 5991 1405 2557 1955 4969 6048 1586
 5701 3941 3189  669 4483  648 6139 1583 2587 1531  762 6630 5678 5905
 3219 3222 1325 1729 5941 3340 5605  576 5511 1374 3974  208 3799 4989
 6433 1567 3123 4328  449 6181 5434 3168 2875 4870 6177 2533 1151 4444
 6241 4274 2751 6333 1391 4199 2932 2956 1055 4590  134 1346 1639 4463
  758 1101 5006  903 5578  437 4897  210 4647 4799 4671 5982 6132 5840
 2128 1413 5182 3152 2869 5337  583  374 4060 2934  999 3943 3060 3042
 1304 4267 2197 3492 5308  532 2414 4615 1620 3968 6540 4900 5491 1246
 5977 3985 1667  416 3355 4874 3594 3758 5222 2658 4430 4081 5097 3253
 3487 2385 2719 1016 3180 3792  990 4640 3352 4477 2872 5038 1202 1189
 1129 4571 3215 3704 2834 3609 5922 3507 4032  686 4426 2093 3815 1846
  347  914 4695 2272  788 2621]
TEST SET: [4019 1333  301 5412 5009 1108 4179 5103 4796    0 5067 5190 6396 6555
 3763 3569 5149  432 5648 3960 6589  190 4775 1681 6560  630  909  601
 4591 1204 5510 1481 439

In [7]:
####MIXED####
# Split the top-SIZE mixed groups into train / test group ids
train, test = generate_train_test(df_mixed)
# Keep only the rows of the training groups, then sort and sample them
# ("equal" tag)
df_train_mixed = df_mixed[df_mixed.group.isin(train)]
df_train_mixed = sample_mixed_data("equal", df_train_mixed)

# Test groups are written from the full, unsampled mixed frame
save_mixed_test(df_mixed, test, WRITE_MIXED_TEST)
save_mixed_train(df_train_mixed, train, WRITE_MIXED_TRAIN)

TRAIN SET: [4428 5215   32 2362 6337 3845 3287 4132 4854 1820 4653 5659 6084 4717
 6134 5799 2211 1214 5866 3953 5280 2231  810 1516  997 4646 1869 4515
 4227 5857 1163 5295 1194  615   36 6567 6314  281 4075  103 5725 2404
  831  809 6534 3063 2759 6203 2654 6378 3094 4347 1599 2323 1813  979
  716 4958 6006 6493 5133 5219 3923 4327 3514 1344 2367 2774 4417 3611
 6590 1209 4243 3001 2099 1836 6495 2890  530 4034  411 5886 5847 1825
 3310  857 4282 3072 4215 6530 5281  491 4926  102 6109 5234 5116 4460
 5312 4943 1275 5270   55 5795 2389 5812 3170 2021 6221 2374 3025 4080
 2517 3252 2821 1671 4740 1637 2567 3670  797 1172 3490 1454 6553 2711
 5211 6542 1038 6588 5010 1488 1918 1650 5032 1937  554  941 4569 5272
 3628  375 1232 4793 1357 5114 4562 1893 4301 1496 2024 1584 1446 5883
 3398 1732 3483 4791 5621 3315]
TEST SET: [3924 5823  477 6038 3967 1653 5422 3727  355 4532 1184 1657 1273 4735
 4679   50 2338 2185 2330 1926 1973  166  394 1658 1154 3700 2642  845
  231 2161 1484  988 574

In [4]:
# Non-null value count of every column, per label, in the noise-only frame
df_noise.groupby('label').count()

Unnamed: 0_level_0,pos_x,pos_y,pos_z,time,group
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,18334220,18334220,18334220,18334220,18334220


In [6]:
# Non-null value count of every column, per label, in the mixed frame
df_mixed.groupby('label').count()

Unnamed: 0_level_0,pos_x,pos_y,pos_z,time,group
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,26996090,26996090,26996090,26996090,26996090
1,489906,489906,489906,489906,489906


In [9]:
# 27485996 = 26996090 + 489906 (all rows in mixed groups, from the counts
# above); subtracting the 18334220 noise-only rows gives the difference
27485996 - 18334220

9151776

In [7]:
# Check whether the noise-only row count equals the total mixed row count
18334220 == 26996090 + 489906

False

In [10]:
# Division binds tighter than addition, so this is (26996090 / 26996090) + 489906
# NOTE(review): if a fraction of the total was intended, the sum needs
# parentheses, e.g. x / (26996090 + 489906) -- confirm the intent
26996090 /26996090 + 489906

489907.0