In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../data/time_data.csv")

In [3]:
data.head()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label,group
0,321.0,5.0,-17.661,32.245,65.231,-0.46,-0.266,-0.847,26.0,0.0,0,0
1,1653.0,23.0,11.595,85.465,65.459,-0.955,-0.0,0.296,27.0,0.0,0,0
2,275.0,9.0,-36.464,67.166,160.189,0.415,0.72,-0.556,26.0,0.0,0,0
3,1660.0,23.0,61.66,101.635,169.059,-0.955,-0.0,0.296,26.0,0.0,0,0
4,966.0,16.0,-54.51,-78.323,94.341,-0.827,0.478,-0.296,24.0,0.0,0,0


In [4]:
len(data)

45820216

The data is a set of x, y, z points with a label indicating whether it is a hit or noise with associated metadata. They have been grouped into timeslices of 15000ns each (as discussed earlier). 

To use point cloud neural network, I would require "point clouds" of objects. For example. a chair would be represented by (N x P x 3) points with N point clouds, P points per point cloud and 3 since each point has x, y, z coordinates. 

With the training data, we already know the neutrino events and the hits associated with it. So for this network, we could group the hits by the events that caused it. These events would be the point cloud objects. 

Need:
1. Timeslices with only noise
2. Timeslices with noise + hits

### It would be good to know how many groups have only noise and noise + hits

In [22]:
df_count_label_type = data.groupby('group')['label'].unique()
df_count_label_type = pd.DataFrame(df_count_label_type)
df_count_label_type = df_count_label_type.reset_index()

Now we separate the groups by those that have only one label and those that have two lables (irrespective of lables)

In [23]:
df_count_label_type[:10]

Unnamed: 0,group,label
0,0,[0]
1,1,"[0, 1]"
2,2,"[0, 1]"
3,3,[0]
4,4,[0]
5,5,"[0, 1]"
6,6,"[0, 1]"
7,7,"[0, 1]"
8,8,[0]
9,9,"[0, 1]"


In [38]:
df_noise_hits = df_count_label_type.loc[np.array(
    list(map(len, df_count_label_type.label.values))) > 1]

In [39]:
df_noise_hits.head()

Unnamed: 0,group,label
1,1,"[0, 1]"
2,2,"[0, 1]"
5,5,"[0, 1]"
6,6,"[0, 1]"
7,7,"[0, 1]"


In [40]:
df_noise = df_count_label_type.loc[np.array(
    list(map(len, df_count_label_type.label.values))) == 1]

df_noise.head()

Unnamed: 0,group,label
0,0,[0]
3,3,[0]
4,4,[0]
8,8,[0]
10,10,[0]


Check to see if there are any groups that have just hits.

In [69]:
df_noise.label = df_noise.label.astype(int)
df_noise.label.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


array([0])

Now we have a list of groups that have only noise and groups that have both noise and hits. We need to obtain the corresponding full rows for each

# Timeslices with Only Noise

Obtain the df groups that only contain noise and arrange by highest to lowest members.

In [82]:
df_noise.group
data_noise = data[data.group.isin(df_noise.group)]

In [88]:
data_noise.head()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label,group
0,321.0,5.0,-17.661,32.245,65.231,-0.46,-0.266,-0.847,26.0,0.0,0,0
1,1653.0,23.0,11.595,85.465,65.459,-0.955,-0.0,0.296,27.0,0.0,0,0
2,275.0,9.0,-36.464,67.166,160.189,0.415,0.72,-0.556,26.0,0.0,0,0
3,1660.0,23.0,61.66,101.635,169.059,-0.955,-0.0,0.296,26.0,0.0,0,0
4,966.0,16.0,-54.51,-78.323,94.341,-0.827,0.478,-0.296,24.0,0.0,0,0


We can see that the mean size of groups with just noise is 6587 and the lowest have just 1 member. 

In [92]:
data_noise.groupby('group')['label'].count().sort_values(ascending=False).describe()

count     2783.000000
mean      6587.933884
std       1127.594126
min          1.000000
25%       6729.000000
50%       6784.000000
75%       6840.000000
max      12454.000000
Name: label, dtype: float64

In [99]:
data_noise.groupby('group')['label'].count().sort_values(ascending=False)[:10]

group
0       12454
4848     7081
5550     7052
3594     7050
3609     7050
1189     7042
1304     7033
5337     7020
3042     7017
3704     7017
Name: label, dtype: int64

Lets take `group 0` and `group 3704` as a sample.

In [106]:
group_0 = data_noise[data_noise.group == 0][['pos_x', 'pos_y', 'pos_z']]
group_0.to_csv("../data/group_0.csv", index=False)

In [112]:
group_4848 = data_noise[data_noise.group == 4848][['pos_x', 'pos_y', 'pos_z']]
group_4848.to_csv("../data/group_4848.csv", index=False)

# Timeslices with Noise and HIts
Obtain the df groups that only contain noise and hits and arrange by highest to lowest members.

In [113]:
data_noise_hits = data[data.group.isin(df_noise_hits.group)]

In [114]:
data_noise_hits.head()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,label,group
12454,485.0,17.0,-73.568,30.247,47.241,-0.827,-0.478,-0.296,27.0,15001.0,0,1
12455,618.0,31.0,12.056,13.517,151.011,0.719,-0.415,0.558,28.0,15003.0,0,1
12456,1011.0,4.0,-26.602,-94.884,178.231,-0.46,0.266,-0.847,28.0,15009.0,0,1
12457,1720.0,19.0,2.695,104.221,112.441,0.827,-0.478,-0.296,25.0,15011.0,0,1
12458,1442.0,31.0,88.235,50.983,187.211,0.719,-0.415,0.558,28.0,15017.0,0,1


We can see that the mean size of groups with just noise is 6587 and the lowest have just 1 member. 

In [115]:
data_noise_hits.groupby('group')['label'].count().sort_values(ascending=False).describe()

count    3976.000000
mean     6912.976861
std       200.590617
min      6550.000000
25%      6788.000000
50%      6869.000000
75%      6975.000000
max      8542.000000
Name: label, dtype: float64

In [116]:
data_noise_hits.groupby('group')['label'].count().sort_values(ascending=False)[:10]

group
1637    8542
615     8483
5866    8450
5857    8313
554     8196
1273    8157
3170    8084
2021    8064
4301    8048
6495    8003
Name: label, dtype: int64

Lets take `group 1637` and `group 615` as a sample.

In [119]:
group_1637 = data_noise_hits[data_noise_hits.group == 1637][['pos_x', 'pos_y', 'pos_z']]
group_1637.to_csv("../data/group_1637.csv", index=False)

In [120]:
group_615 = data_noise_hits[data_noise_hits.group == 615][['pos_x', 'pos_y', 'pos_z']]
group_615.to_csv("../data/group_615.csv", index=False)