# Tests for maps

# Author JJGC

## Illustration of tests for the maps functions

In [6]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
import warnings
import datetime 
print("Last updated on ", time.asctime())

Last updated on  Mon Feb 11 09:46:18 2019


### Notebook configuration

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt

# Default look of every figure in the notebook: font size and figure size.
plt.rcParams["font.size"] = 14
plt.rcParams["figure.figsize"] = (10, 8)

In [8]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib
import tables            as tb
import random
import glob
import warnings
sns.set()

In [9]:
from krcal.core.core_functions      import timeit

In [10]:
from krcal.core.kr_types                 import KrEvent
from krcal.core.analysis_functions       import kr_event
from typing      import List, Tuple, Sequence, Iterable, Dict
from   pandas.core.frame import DataFrame

In [11]:
import logging
log = logging.getLogger()

In [25]:
from krcal.core.io_functions                 import filenames_from_list
from krcal.core.kr_types                     import Number
from krcal.core.kr_types                     import KrFileName
from  invisible_cities.io.dst_io             import load_dsts
from krcal.core.core_functions               import time_delta_from_time
from krcal.core.analysis_functions           import kr_ranges_and_bins
from krcal.core.analysis_functions           import select_xy_sectors
from krcal.core.analysis_functions           import select_xy_sectors_df
from krcal.core.analysis_functions           import event_map
from   invisible_cities.core.core_functions  import in_range
from krcal.core.core_functions               import time_delta_from_time

In [19]:
import cProfile

In [20]:
import logging
log = logging.getLogger()

## Get time series

In [21]:
def get_time_series(time_bins    : Number,
                    time_range   : Tuple[float, float],
                    dst          : DataFrame)->Tuple[np.array, List[np.array]]:
    """

    Returns a time series (ts) and a list of masks which are used to divide
    the events in time slices.

        Parameters
        ----------
            time_bins
                Number of time bins.
            time_range
                Time range. Its first and last elements are used as limits,
                both truncated to int.
            dst
                Data frame whose DT column holds the time of each event.

        Returns
        -------
            A Tuple with:
            np.array       : The ts vector (centers of the time bins).
            List[np.array] : One mask per time bin, obtained by calling
                             in_range on dst.DT.values with the bin limits.

        Raises
        ------
            ValueError
                If the time range is too short to give each bin an integer
                width of at least one.

    """

    logging.debug('function: get_time_series')
    nt     = time_bins
    x      = int((time_range[-1] - time_range[0]) / nt)   # integer bin width
    tfirst = int(time_range[0])
    tlast  = int(time_range[-1])

    if x < 1:
        # A zero width used to surface as an obscure error raised by range().
        raise ValueError(f'time range {time_range} is too short '
                         f'for {nt} time bins of integer width')

    if x == 1:
        # NOTE(review): behaviour kept from the original code, a bin width of
        # one collapses the series into a single bin. Confirm this is intended.
        indx = [(tfirst, tlast)]
    else:
        # Bin edges are measured from tfirst (not from zero) and there are
        # exactly nt bins; the last one absorbs the remainder of the division.
        edges = [tfirst + i * x for i in range(int(nt))] + [tlast]
        indx  = list(zip(edges[:-1], edges[1:]))

    ts = [(lower + upper) / 2 for lower, upper in indx]

    logging.debug(f' number of time bins = {nt}, t_first = {tfirst} t_last = {tlast}')
    logging.debug(f'indx = {indx}')
    logging.debug(f'ts = {ts}')

    times = dst.DT.values
    masks = [in_range(times, lower, upper) for lower, upper in indx]

    return np.array(ts), masks


## Select xy sectors

In [15]:
def select_xy_sectors_df(dst        : DataFrame,
                         bins_x     : np.array,
                         bins_y     : np.array)-> Dict[int, List[DataFrame]]:
    """
    Split a data frame into xy sectors.

    Parameters
    ----------
        dst:
        The input data frame (its X and Y columns are used).
        bins_x:
        Bin edges along x.
        bins_y:
        Bin edges along y.

    Returns
    -------
        A Dict[int, List[DataFrame]]: the key is the index of the x bin and
        the value is a list with one DataFrame per y bin, holding the events
        selected in that (x, y) cell.

    """
    n_x = len(bins_x) - 1
    n_y = len(bins_y) - 1

    sectors = {}
    for ix in range(n_x):
        # Events inside the current x slice.
        x_slice = dst[in_range(dst.X, *bins_x[ix: ix+2])]

        # Split the x slice along y.
        cells = []
        for iy in range(n_y):
            in_cell = in_range(x_slice.Y, *bins_y[iy: iy+2])
            cells.append(x_slice[in_cell])
        sectors[ix] = cells

    return sectors

In [16]:
def x_and_y_ranges(data, xb, yb, nbx, nby):
    """
    Check that the xy selections stay inside their bin limits.

    Parameters
    ----------
        data:
        Data frame with X and Y columns.
        xb, yb:
        Bin edges along x and y.
        nbx, nby:
        Number of bins along x and y.

    Returns
    -------
        True if, for every x bin, all selected X values are in range and,
        for every y bin inside it, all selected Y values are in range.
        False otherwise.

    """
    r = True
    for i in range(nbx):
        x_lo, x_hi = xb[i], xb[i + 1]
        dstx = data[in_range(data.X, x_lo, x_hi)]
        # The result must be accumulated: a bare `r & ...` expression
        # discards it and the function would always return True.
        r = r and bool(in_range(dstx.X.values, x_lo, x_hi).all())
        for j in range(nby):
            y_lo, y_hi = yb[j], yb[j + 1]
            dsty = dstx[in_range(dstx.Y, y_lo, y_hi)]
            r = r and bool(in_range(dsty.Y.values, y_lo, y_hi).all())
    return r

In [17]:
def nmap(RGES):
    """
    Build the map with the number of events in each xy sector.

    Parameters
    ----------
        RGES:
        Dict whose values are lists of sized objects (one per y cell),
        keyed by the x index.

    Returns
    -------
        A DataFrame with one column per key of RGES; each column holds the
        length of every element of the corresponding list.

    """
    # Local dict: the original relied on a DLEN that was never defined here.
    DLEN = {i: [len(dst) for dst in ldst] for i, ldst in RGES.items()}
    return pd.DataFrame.from_dict(DLEN)


In [18]:
def data_frames_are_identical(df1, df2):
    """Tell whether every element of df1 equals the matching element of df2."""
    # Element-wise comparison: a data frame of bools.
    elementwise = df1 == df2

    # One bool per column (a Series) ...
    columns_match = elementwise.eq(True).all()

    # ... reduced to a single bool for the whole frame.
    return columns_match.all()

## Prepare data
- For our toy example the dst is a DataFrame with 3 columns: (X,Y,t)

In [42]:
# 20 toy events: random positions in [0, 100) and times every 5 units.
x = np.random.random(20) * 100
y = np.random.random(20) * 100
t = np.arange(0, 100, 5)

# Column name -> values, then the toy dst.
D = {'X': x, 'Y': y, 't': t}
data = pd.DataFrame.from_dict(D)
print(data)

            X          Y   t
0   62.176172  18.609421   0
1   68.552413  84.868491   5
2   52.493751  33.038107  10
3   42.127054  78.379566  15
4   76.521328  15.337941  20
5   54.420618  77.262200  25
6   61.888440  32.449328  30
7   15.466696  50.860008  35
8   27.814152  76.017899  40
9   25.200884  38.072995  45
10  84.764463  43.107881  50
11  73.952319  47.908672  55
12  48.224708  26.017777  60
13  51.101955  78.379809  65
14  20.137830  31.718900  70
15   8.834018  96.951566  75
16  17.729220  11.384412  80
17  89.433168  64.824041  85
18  90.438037  85.065037  90
19  74.517929  86.943769  95


### Sort data frame in terms of time

In [43]:
# Order the events by their time column.
dst_time = data.sort_values(by='t')
print(dst_time)

            X          Y   t
0   62.176172  18.609421   0
1   68.552413  84.868491   5
2   52.493751  33.038107  10
3   42.127054  78.379566  15
4   76.521328  15.337941  20
5   54.420618  77.262200  25
6   61.888440  32.449328  30
7   15.466696  50.860008  35
8   27.814152  76.017899  40
9   25.200884  38.072995  45
10  84.764463  43.107881  50
11  73.952319  47.908672  55
12  48.224708  26.017777  60
13  51.101955  78.379809  65
14  20.137830  31.718900  70
15   8.834018  96.951566  75
16  17.729220  11.384412  80
17  89.433168  64.824041  85
18  90.438037  85.065037  90
19  74.517929  86.943769  95


### Compute a vector of time differences

In [44]:
# Times of the sorted events as a plain array.
T = dst_time['t'].values
# Time differences computed by time_delta_from_time (krcal.core.core_functions).
DT = time_delta_from_time(T)
print(DT)

[ 0  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95]


### Expand the DST with DT

In [45]:
# Add DT as an extra column; the result is bound to a new name, dst.
dst = dst_time.assign(DT=DT)
print(dst)

            X          Y   t  DT
0   62.176172  18.609421   0   0
1   68.552413  84.868491   5   5
2   52.493751  33.038107  10  10
3   42.127054  78.379566  15  15
4   76.521328  15.337941  20  20
5   54.420618  77.262200  25  25
6   61.888440  32.449328  30  30
7   15.466696  50.860008  35  35
8   27.814152  76.017899  40  40
9   25.200884  38.072995  45  45
10  84.764463  43.107881  50  50
11  73.952319  47.908672  55  55
12  48.224708  26.017777  60  60
13  51.101955  78.379809  65  65
14  20.137830  31.718900  70  70
15   8.834018  96.951566  75  75
16  17.729220  11.384412  80  80
17  89.433168  64.824041  85  85
18  90.438037  85.065037  90  90
19  74.517929  86.943769  95  95


### And now compute the time series for 5 bins, between DT[0] and DT[-1]

In [48]:
# 5 time bins between the first and the last DT value of the dst.
ts, masks = get_time_series(5,(dst.DT.values[0], dst.DT.values[-1]), dst)

In [49]:
# First element returned by get_time_series: the time series vector.
print(f'time series bins = {ts}')

time series bins = [ 9.5 28.5 47.5 66.5 85.5]


In [50]:
# Second element returned by get_time_series: one boolean mask per time bin.
print(masks)

[array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False]), array([False, False, False, False,  True,  True,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False]), array([False, False, False, False, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False]), array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False]), array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True, False])]


In [51]:
# Recompute by hand the quantities that get_time_series derives from its arguments.
nt = 5                        # number of time bins
t0 = dst.DT.values[0]         # first DT value
tf = dst.DT.values[-1]        # last DT value
step = int((tf -  t0) / nt)   # bin width, truncated to an integer
print(f' number of bins = {nt}, t0 = {t0}, tf = {tf}, step = {step}')

 number of bins = 5, t0 = 0, tf = 95, step = 19


### `indx` contains the (start, stop) ranges that advance from t0 to tf in increments of `step`

In [52]:
# Integer limits of the time range.
t0 = int(t0)
tf = int(tf)
if step == 1:
    # The original referenced an undefined name (tl) here; tf is the upper limit.
    indx = [(t0, tf)]
else:
    # Bin edges are measured from t0 and there are exactly nt bins;
    # the last one runs up to tf and absorbs the remainder of the division.
    edges = [t0 + i * step for i in range(nt)] + [tf]
    indx = list(zip(edges[:-1], edges[1:]))
print(f'indx = {indx}')

indx = [(0, 19), (19, 38), (38, 57), (57, 76), (76, 95)]


In [53]:
# The time series is made of the midpoints of the (start, stop) ranges.
ts = [(start + stop) / 2 for start, stop in indx]
print(f'time series bins centers = {ts}')

time series bins centers = [9.5, 28.5, 47.5, 66.5, 85.5]


In [54]:
# One mask per (start, stop) range, computed over the DT values of the dst.
masks = [in_range(dst.DT.values, start, stop) for start, stop in indx]
print(f'masks = {masks}')

masks = [array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False]), array([False, False, False, False,  True,  True,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False]), array([False, False, False, False, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False]), array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False]), array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True, False])]


In [55]:
# Mask of the first time bin.
masks[0]

array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

In [60]:
# Number of True entries in each mask, i.e. events selected per time bin.
for mask in masks:
    print(np.count_nonzero(mask))

4
4
4
4
3


### Given the structure of the time vector, every mask has 4 True elements except the last one, which has 3: the upper edge of the last bin (DT = 95) is not selected

### Define XY bins

In [200]:
# Bin edges every 25 units, from 0 to 100, along each axis.
xb = np.arange(0, 101, 25)
yb = np.arange(0, 101, 25)

# n edges delimit n - 1 bins.
nbx, nby = len(xb) - 1, len(yb) - 1

print(f'xb, nbx = {xb, nbx}')
print(f'yb, nby = {yb, nby}')

xb, nbx = (array([  0,  25,  50,  75, 100]), 4)
yb, nby = (array([  0,  25,  50,  75, 100]), 4)


### Check ranges

In [201]:
# Run the range check over all the (x, y) bins of the toy data.
x_and_y_ranges(data, xb, yb, nbx, nby)

True

### Fill dict

In [202]:
# Build the sector map by hand: one entry per x bin, one selection per y bin.
selDict = {}
for i in range(nbx):
    # Events inside the current x slice.
    dstx = data[in_range(data.X, *xb[i: i+2])]

    # Split the x slice along y.
    y_cells = []
    for j in range(nby):
        y_cells.append(dstx[in_range(dstx.Y, *yb[j: j+2])])
    selDict[i] = y_cells

# Number of events per sector.
sel = nmap(selDict)

In [203]:
# Counting map built from the hand-made selection (result of nmap).
sel

Unnamed: 0,0,1,2,3
0,0,1,2,0
1,2,1,2,2
2,1,2,1,0
3,1,0,3,2


### Call function and make counting map

In [204]:
# Same selection, now delegated to select_xy_sectors_df.
selMap = select_xy_sectors_df(data, xb, yb)

In [205]:
# Counting map of the selection returned by the function.
sel2 = nmap(selMap)

### Compare data frames

In [206]:
# The hand-made map and the one obtained through the function must coincide.
data_frames_are_identical(sel, sel2)

True

## END