In [25]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Combinatorics tool
import itertools

# Usual library
import numpy             as np
import pandas            as pd
import matplotlib        as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Plot settings: frameless legends with large text, larger tick/axis/title
# fonts and thicker lines, applied in a single rcParams update.
mpl.rcParams.update({
    'legend.frameon' : False,
    'legend.fontsize': 'xx-large',
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'axes.titlesize' : 18,
    'axes.labelsize' : 18,
    'lines.linewidth': 2.5,
})

# Dataset exploration functions
import dataset

In [26]:
# Load the event table through the project's dataset module; the info() summary
# printed below reports its number of entries, columns and dtypes.
data = dataset.get_data()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20343 entries, 0 to 20342
Columns: 327 entries, mc_generator_weights to electronCollection
dtypes: bool(5), float32(73), int32(47), int8(16), object(180), uint32(5), uint64(1)
memory usage: 38.2+ MB


In [31]:
# Restrict the table to the three object-collection columns so that the
# timings below can be compared against the full 327-column frame.
data_collection = data[[
    'jetCollection',
    'muonCollection',
    'electronCollection',
]]
data_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20343 entries, 0 to 20342
Data columns (total 3 columns):
jetCollection         20343 non-null object
muonCollection        20343 non-null object
electronCollection    20343 non-null object
dtypes: object(3)
memory usage: 476.9+ KB


In [27]:
# Serial baseline: row-wise apply of dataset.compute_manydR over the full table
# (-n 1: the statement is executed once per timing repeat).
%timeit -n 1 data.apply(dataset.compute_manydR, axis=1)

1 loop, best of 3: 8.71 s per loop


In [32]:
# Serial baseline on the three-column frame: same row-wise apply as on the
# full table (-n 1: the statement is executed once per timing repeat).
%timeit -n 1 data_collection.apply(dataset.compute_manydR, axis=1)

1 loop, best of 3: 7.66 s per loop


In [35]:
# Parallelization with ipyparallel
import ipyparallel

# Connect to the engines of the 'default' profile; assumes a cluster has
# already been started for that profile -- TODO confirm how it is launched.
cluster = ipyparallel.Client(profile='default')
# Indexing the client with [:] gives a DirectView over all connected engines.
dview = cluster[:]

print( 'profile:     ', cluster.profile)
print( 'IDs:         ', cluster.ids    ) # Engine ids known to the client
print( 'direct views ', dview)

('profile:     ', u'default')
('IDs:         ', [0, 1, 2, 3])
('direct views ', <DirectView [0, 1, 2, 3]>)


In [36]:
def parallelized_function(small_df):
    """Apply ``dataset.compute_manydR`` to every row of one DataFrame chunk.

    The function is meant to be shipped to ipyparallel engines (for example
    through a DirectView ``map``), so it must not rely on names that exist
    only in the notebook kernel.

    Parameters
    ----------
    small_df : pandas.DataFrame
        Chunk of the event table; each row is handed to
        ``dataset.compute_manydR``.

    Returns
    -------
    The result of ``small_df.apply(dataset.compute_manydR, axis=1)``.
    """
    # Import locally: an interactively defined function runs against the
    # engine's own namespace, where the notebook-level ``import dataset`` has
    # not been executed. Assumes the ``dataset`` module is importable from the
    # engines' path -- TODO confirm the engines' working directory.
    import dataset
    return small_df.apply(dataset.compute_manydR, axis=1)

In [44]:
array_small_df = np.array_split(data, 200)
for df in array_small_df[0:10]:
    %timeit -n 1 df.apply(dataset.compute_manydR, axis=1)

1 loop, best of 3: 39.6 ms per loop
1 loop, best of 3: 41.4 ms per loop
1 loop, best of 3: 42.7 ms per loop
1 loop, best of 3: 39.1 ms per loop
1 loop, best of 3: 42.1 ms per loop
1 loop, best of 3: 40.8 ms per loop
1 loop, best of 3: 48.2 ms per loop
1 loop, best of 3: 39.7 ms per loop
1 loop, best of 3: 40 ms per loop
1 loop, best of 3: 41.1 ms per loop


In [45]:
array_small_df_collection = np.array_split(data_collection, 200)
for df in array_small_df_collection[0:10]:
    %timeit -n 1 df.apply(dataset.compute_manydR, axis=1)

1 loop, best of 3: 44.7 ms per loop
1 loop, best of 3: 39.1 ms per loop
1 loop, best of 3: 44.1 ms per loop
1 loop, best of 3: 45.3 ms per loop
1 loop, best of 3: 45.6 ms per loop
1 loop, best of 3: 46.7 ms per loop
1 loop, best of 3: 50 ms per loop
1 loop, best of 3: 38.9 ms per loop
1 loop, best of 3: 43.8 ms per loop
1 loop, best of 3: 45.7 ms per loop


In [46]:
%timeit -n 1 dview.map(parallelized_function, array_small_df)

1 loop, best of 3: 19 s per loop


In [47]:
%timeit -n 1 dview.map(parallelized_function, array_small_df_collection)

1 loop, best of 3: 3.49 s per loop
