In [1]:
import numpy as np
import pandas as pd

In [24]:
# Import from the public `numpy.testing` namespace; the old
# `numpy.testing.utils` path was a deprecated shim and is not available in
# newer NumPy releases.
from numpy.testing import assert_almost_equal

####  Build a `numpy.ndarray`, an equivalent `pandas.DataFrame`, and a `numpy.recarray`

In [2]:
# Number of rows shared by every container under test (ten million).
rows = 10 ** 7
# Baseline container: a (rows, 3) ndarray of uniform random samples.
arr = np.random.uniform(size=(rows, 3))
# The same values as a pandas DataFrame with named columns.
df = pd.DataFrame(arr, columns=list('xyz'))
# Record-array form of the frame; with default arguments `to_records()` also
# writes the index out as a leading field.
rec = df.to_records()

In [3]:
# Preview the first rows to confirm the x/y/z column layout.
df.head()

Unnamed: 0,x,y,z
0,0.140146,0.318773,0.654345
1,0.464587,0.596193,0.631692
2,0.176979,0.156531,0.196591
3,0.061424,0.133908,0.418625
4,0.353051,0.921042,0.122961


In [4]:
# Show the dtype of each column in the frame.
df.dtypes

x    float64
y    float64
z    float64
dtype: object

### Simple Array Operation: Sum

`numpy.ndarray`

In [16]:
# Time the vectorised sum over column index 2 of the ndarray, then compute it
# once more outside %timeit so the value is kept in `arrsum` as the reference
# total for the equality checks.
%timeit arr[:, 2].sum()
arrsum = arr[:, 2].sum()

100 loops, best of 3: 15.2 ms per loop


`pandas.DataFrame` (attribute-style column access, `df.z`)

In [6]:
# Time the sum of column 'z' reached through attribute access (`df.z`), then
# keep the total in `pdattsum`.
%timeit df.z.sum()
pdattsum = df.z.sum()

10 loops, best of 3: 76.1 ms per loop


In [29]:
# The ndarray total and the DataFrame total must agree to 7 decimal places
# (the default precision of assert_almost_equal, spelled out here).
assert_almost_equal(arrsum, pdattsum, decimal=7)

`pandas.DataFrame` (bracket-style column access, `df['z']`)

In [8]:
# Time the sum of column 'z' reached through bracket indexing (`df['z']`),
# then keep the total in `pdnstyle`.
%timeit df['z'].sum()
pdnstyle = df['z'].sum()

10 loops, best of 3: 75.1 ms per loop


`numpy.recarray`

In [9]:
# Time the sum of the 'z' field of the record array, selected by field name,
# then keep the total in `reccolnames`.
%timeit rec['z'].sum()
reccolnames = rec['z'].sum()

10 loops, best of 3: 21.3 ms per loop


`pandas.DataFrame` with `object` dtype, expected to be slow

In [None]:
# Re-store column 'z' with object dtype. This overwrites the column in `df`
# in place, so every later cell that reads df['z'] sees the object-typed data.
df['z'] = df['z'].astype(object)

In [None]:
# Show the column dtypes again, after the cast of 'z' to object.
df.dtypes

In [None]:
# Time the sum of column 'z' as it currently stands in `df`, then keep the
# total in `objectSum`.
%timeit df['z'].sum()
objectSum = df['z'].sum()

I would have expected `pandas.DataFrame.sum` to be more competitive with `numpy.ndarray.sum` when the DataFrame column has an explicit numeric dtype (`float64`).

Generator-expression (list-comprehension style) iteration over a `numpy.ndarray`

In [20]:
# Time an element-by-element sum: the built-in `sum` consumes a generator
# expression over column index 2 of the ndarray. The total is kept in
# `itersumnumpy`.
%timeit sum(i for i in arr[:, 2])
itersumnumpy = sum(i for i in arr[:, 2])

1 loops, best of 3: 1.38 s per loop


In [27]:
# The element-by-element total must match the vectorised total to 5 decimals.
assert_almost_equal(itersumnumpy, arrsum, decimal=5)

Generator-expression (list-comprehension style) sum over a list: again expected to be slow

In [14]:
# Materialise column index 2 of the ndarray as a plain Python list.
# The call parentheses matter: a bare `.tolist` would bind the method object
# itself, which is not iterable, so the later `sum(... for i in l)` and
# `tuple(l)` cells would raise TypeError.
l = arr[:, 2].tolist()

In [45]:
# Time the built-in `sum` over a generator expression that walks `l`, then
# keep the total in `listsum`.
%timeit sum(i for i in l)
listsum = sum(i for i in l)

1 loops, best of 3: 514 ms per loop


In [46]:
# The list-based total must match the vectorised ndarray total to 5 decimals.
assert_almost_equal(listsum, arrsum, decimal=5)

In [47]:
# Time the built-in `sum` over a generator expression that iterates the
# df['z'] column element by element, then keep the total in `pandasitersum`.
%timeit sum(i for i in df['z'])
pandasitersum = sum(i for i in df['z'])

1 loops, best of 3: 1.34 s per loop


In [48]:
# Build a tuple from the elements of `l` for the tuple-iteration timing.
t = tuple(l)

In [49]:
# Time the built-in `sum` over a generator expression that walks the tuple
# `t`, then keep the total in `tuplesum`.
%timeit sum(i for i in t)
tuplesum = sum(i for i in t)

1 loops, best of 3: 523 ms per loop


In [50]:
# The element-by-element DataFrame total must match the vectorised ndarray
# total to 5 decimals.
assert_almost_equal(pandasitersum, arrsum, decimal=5)

In [51]:
# The tuple-based total must match the vectorised ndarray total to 5 decimals.
assert_almost_equal(tuplesum, arrsum, decimal=5)

So for a DataFrame column with object dtype, doing array operations like sum (admittedly silly) is about as good as doing this with a list comprehension. But iterating through the DataFrame rows in list-comprehension style is much worse.