In [1]:
import sys
sys.version

'3.5.0 |Continuum Analytics, Inc.| (default, Sep 13 2015, 10:34:39) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

pd.options.display.max_columns = 8
pd.options.display.max_rows = 8
pd.__version__

'0.17.0'

In [3]:
## P3
# The file is read with header=None, so the column labels are attached by hand.
df = pd.read_csv('adult.data', header=None)
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
              'marital-status', 'occupation', 'relationship', 'race', 'sex',
              'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
              'income']
df

Unnamed: 0,age,workclass,fnlwgt,education,...,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,...,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,...,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,...,0,40,United-States,<=50K
3,53,Private,234721,11th,...,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...
32557,40,Private,154374,HS-grad,...,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,...,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,...,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,...,0,40,United-States,>50K


In [4]:
df[['age', 'marital-status']]

Unnamed: 0,age,marital-status
0,39,Never-married
1,50,Married-civ-spouse
2,38,Divorced
3,53,Married-civ-spouse
...,...,...
32557,40,Married-civ-spouse
32558,58,Widowed
32559,22,Never-married
32560,52,Married-civ-spouse


In [5]:
df.groupby('income')['hours-per-week'].mean().to_frame()

Unnamed: 0_level_0,hours-per-week
income,Unnamed: 1_level_1
<=50K,38.84021
>50K,45.473026


#### Data Structures

In [6]:
# NumPy
arr = np.array([1, 2, 3, 4], dtype=np.int64)
arr

array([1, 2, 3, 4])

In [7]:
pd.DataFrame(arr).T

Unnamed: 0,0,1,2,3
0,1,2,3,4


In [8]:
# Series
df['workclass'].to_frame()

Unnamed: 0,workclass
0,State-gov
1,Self-emp-not-inc
2,Private
3,Private
...,...
32557,Private
32558,Private
32559,Private
32560,Self-emp-inc


## pandas Development

### Cython

In [9]:
%load_ext cython

In [10]:
%%cython

import numpy as np
from numpy cimport *
import pandas.util as util # dummy

def ismember(ndarray arr, set values):
    """Return a boolean mask telling which elements of ``arr`` are in ``values``.

    Parameters
    ----------
    arr : ndarray
        Array whose elements are tested one by one (indexed with a single
        integer position).
    values : set
        Set to test membership against.

    Returns
    -------
    ndarray
        ``np.bool_`` view of a uint8 buffer of length ``len(arr)``.
    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result
        object val

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        # Plain indexing: the ``util`` name in this cell is the Python package
        # ``pandas.util``, which does not provide ``get_value_at``.
        val = arr[i]
        result[i] = val in values

    return result.view(np.bool_)

In [11]:
pd.lib.ismember(np.array([1, 2, 3, 4]), set([2, 3]))

array([False,  True,  True, False], dtype=bool)

In [12]:
# Three-row demo frame labelled 'a'..'c' with two integer columns and one
# boolean column. Note that this rebinds `df`.
df = pd.DataFrame(
    {'X': [1, 2, 3], 'Y': [4, 5, 6], 'Z': [True, False, True]},
    index=list('abc')
)
df

Unnamed: 0,X,Y,Z
a,1,4,True
b,2,5,False
c,3,6,True


In [13]:
df.reindex(['b', 'a', 'c'])

Unnamed: 0,X,Y,Z
b,2,5,False
a,1,4,True
c,3,6,True


In [14]:
df.index.get_indexer(['b', 'a', 'c'])

array([1, 0, 2])

In [15]:
df.values

array([[1, 4, True],
       [2, 5, False],
       [3, 6, True]], dtype=object)

In [16]:
np.take(df.values, [1, 0, 2], axis=0)

array([[2, 5, False],
       [1, 4, True],
       [3, 6, True]], dtype=object)

## Performance 

### 1. Environment setup

In [17]:
import numpy.distutils.system_info as sysinfo

In [18]:
sysinfo.get_info('lapack')



{'language': 'f77',
 'libraries': ['openblas'],
 'library_dirs': ['/home/ec2-user/miniconda/envs/py35/lib']}

In [19]:
sysinfo.get_info('atlas')



{'define_macros': [('NO_ATLAS_INFO', -1)],
 'include_dirs': ['/home/ec2-user/miniconda/envs/py35/include'],
 'language': 'f77',
 'libraries': ['lapack', 'f77blas', 'cblas', 'atlas'],
 'library_dirs': ['/home/ec2-user/miniconda/envs/py35/lib']}

In [20]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.5.0.final.0
python-bits: 64
OS: Linux
OS-release: 4.1.7-15.23.amzn1.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: ja_JP.UTF-8

pandas: 0.17.0
nose: None
pip: 7.1.2
setuptools: 18.3.2
Cython: 0.23.3
numpy: 1.10.0
scipy: None
statsmodels: None
IPython: 4.0.0
sphinx: None
patsy: None
dateutil: 2.4.2
pytz: 2015.6
blosc: None
bottleneck: None
tables: None
numexpr: None
matplotlib: None
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None


### 2. Use Built-in

In [21]:
np.unique([1, 2, 2, 3, 2, 4])

array([1, 2, 3, 4])

In [22]:
pd.unique([1, 2, 2, 3, 2, 4])

array([1, 2, 3, 4])

In [23]:
np.random.seed(71)
values = np.random.randint(1, 1000, 1000000)
values

array([108, 942,  12, ..., 308, 897,  40])

In [24]:
%timeit np.unique(values)

10 loops, best of 3: 42.9 ms per loop


In [25]:
%timeit pd.unique(values)

100 loops, best of 3: 7.84 ms per loop


### 2. Use Built-in (continued)

In [26]:
# Fix the NumPy seed before generating the synthetic benchmark data.
np.random.seed(71)

# Two pools of strings to sample from: 100 candidates for column 'b' and
# 10000 candidates for column 'c' (first argument 5 is passed as the size
# of each string).
chars1 = tm.rands_array(5, 100)
chars2 = tm.rands_array(5, 10000)

# One million rows: 'a' holds random normal draws, 'b' and 'c' hold strings
# picked from the pools above. `n` and `df` are rebound here.
n = 1000000
df = pd.DataFrame({'a': np.random.randn(n),
                   'b': tm.choice(chars1, size=n),
                   'c': tm.choice(chars2, size=n)})
df.shape

(1000000, 3)

In [27]:
df

Unnamed: 0,a,b,c
0,0.090604,Oh37N,9Rahp
1,-0.361360,Eg4PH,yeMYh
2,0.606186,CW0dx,nzRVA
3,1.052577,Y7wQN,Ul5iI
...,...,...,...
999996,-0.434356,LOsZ0,CtxfZ
999997,-1.956676,FMfxx,vlZac
999998,-1.732466,xYjNP,ONSzC
999999,0.729488,MIBDA,mIAYb


In [28]:
# expected
(df['b'] + df['c']).to_frame()

Unnamed: 0,0
0,Oh37N9Rahp
1,Eg4PHyeMYh
2,CW0dxnzRVA
3,Y7wQNUl5iI
...,...
999996,LOsZ0CtxfZ
999997,FMfxxvlZac
999998,xYjNPONSzC
999999,MIBDAmIAYb


In [29]:
def f1(s):
    """Return the 'b' entry of ``s`` followed by its 'c' entry, joined with ``+``.

    ``s`` is anything indexable by the keys 'b' and 'c' (here: one row of the
    frame handed over by ``DataFrame.apply(..., axis=1)``).
    """
    left, right = s['b'], s['c']
    return left + right

%timeit df.apply(f1, axis=1)

1 loops, best of 3: 15.6 s per loop


In [30]:
%timeit df['b'] + df['c']

10 loops, best of 3: 92.5 ms per loop


In [31]:
# validation
# `tm` comes from the import cell at the top of the notebook; it is not
# re-imported here. The row-wise apply and the vectorised concatenation are
# compared on the first rows only.
head = df.head()
tm.assert_series_equal(head.apply(f1, axis=1),
                       head['b'] + head['c'])

### 3. Use single op

In [32]:
# silly example...
(df['a'] + 1).to_frame()

Unnamed: 0,a
0,1.090604
1,0.638640
2,1.606186
3,2.052577
...,...
999996,0.565644
999997,-0.956676
999998,-0.732466
999999,1.729488


In [33]:
# silly example...
%timeit df['a'] + 2 - 1

1000 loops, best of 3: 1.58 ms per loop


In [34]:
%timeit df['a'] + 1

1000 loops, best of 3: 647 µs per loop


In [35]:
%timeit df['a'].fillna(0).fillna(0)

100 loops, best of 3: 3.72 ms per loop


In [36]:
%timeit df['a'].fillna(0)

1000 loops, best of 3: 1.65 ms per loop


### 4. Data Types

In [37]:
df.dtypes

a    float64
b     object
c     object
dtype: object

In [38]:
df.memory_usage()

a    8000000
b    8000000
c    8000000
dtype: int64

In [39]:
df.groupby('b').mean()

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
04L92,0.002298
1gm2k,-0.016342
2mwh9,-0.008071
2qoTC,-0.010341
...,...
xT8SM,-0.002617
xWukM,0.015631
xYjNP,-0.003496
zRsrK,0.011832


In [40]:
%timeit df.groupby('b').mean()

10 loops, best of 3: 46.8 ms per loop


In [41]:
df['b'] = df['b'].astype('category')
# also good for memory
df.memory_usage()

a    8000000
b    1000800
c    8000000
dtype: int64

In [42]:
%timeit df.groupby('b').mean()

10 loops, best of 3: 20.5 ms per loop


In [43]:
df['b'].values

[Oh37N, Eg4PH, CW0dx, Y7wQN, NJgGz, ..., C56mn, LOsZ0, FMfxx, xYjNP, MIBDA]
Length: 1000000
Categories (100, object): [04L92, 1gm2k, 2mwh9, 2qoTC, ..., xT8SM, xWukM, xYjNP, zRsrK]

In [44]:
c = pd.Categorical(list('ababcabaca'))
c

[a, b, a, b, c, a, b, a, c, a]
Categories (3, object): [a, b, c]

In [45]:
c.categories

Index(['a', 'b', 'c'], dtype='object')

In [46]:
c.codes

array([0, 1, 0, 1, 2, 0, 1, 0, 2, 0], dtype=int8)

### Use sorted / unique Index

In [47]:
np.random.seed(71)

# The left frame's row count is stated here instead of being taken from the
# global `n`, which belongs to another section and is rebound to a different
# size further down; the cell now builds the same frames whenever it is run.
n_left = 1000000
df_left = pd.DataFrame({'a': np.random.randn(n_left),
                        'b': np.random.randn(n_left)})

# Much smaller right-hand frame: one integer column drawn from [1, 100).
n_right = 10000
df_right = pd.DataFrame({'c': np.random.randint(1, 100, n_right)})

In [48]:
df_left.join(df_right)

Unnamed: 0,a,b,c
0,-0.430603,-0.458514,95
1,-1.193928,-0.110882,94
2,-0.444299,0.477022,70
3,0.489412,-3.082245,86
...,...,...,...
999996,-0.525778,1.173011,
999997,1.354277,-0.017652,
999998,1.938507,0.323467,
999999,0.704053,0.937109,


In [49]:
# rhs has sorted index
%timeit df_left.join(df_right)

100 loops, best of 3: 6.85 ms per loop


In [50]:
df_right_shuffled = df_right.sample(n=len(df_right))
%timeit df_left.join(df_right_shuffled)

100 loops, best of 3: 18.7 ms per loop


In [51]:
# validation
tm.assert_frame_equal(df_left.join(df_right), df_left.join(df_right_shuffled))

In [52]:
df_left.index.is_unique

True

In [53]:
df_left.index.is_monotonic_increasing

True

In [54]:
df_right_shuffled.index.is_unique

True

In [55]:
df_right_shuffled.index.is_monotonic_increasing

False

### Date parsing

In [56]:
# Year-first timestamp template with month and day as zero-padded fields.
# (The standard is ISO 8601; the identifier's spelling is kept because a
# later cell refers to it.)
iso_8641_fmt = '2011-{0:02d}-{1:02d} 00:00:00'
values = [iso_8641_fmt.format(month, day) for month, day in [(1, 3), (2, 4)]]
values

['2011-01-03 00:00:00', '2011-02-04 00:00:00']

In [57]:
pd.to_datetime(values)

DatetimeIndex(['2011-01-03', '2011-02-04'], dtype='datetime64[ns]', freq=None)

In [58]:
# Month/day/year template with zero-padded month and day and a fixed year.
mdy_fmt = '{0:02d}/{1:02d}/2011'
values = [mdy_fmt.format(month, day) for month, day in [(1, 3), (2, 4)]]
values

['01/03/2011', '02/04/2011']

In [59]:
pd.to_datetime(values)

DatetimeIndex(['2011-01-03', '2011-02-04'], dtype='datetime64[ns]', freq=None)

In [60]:
N = 10000

months = np.random.randint(1, 12, N)
days = np.random.randint(1, 28, N)
dates = [iso_8641_fmt.format(m, d) for m, d in zip(months, days)]
%timeit pd.to_datetime(dates)

100 loops, best of 3: 2.44 ms per loop


In [61]:
dates = [mdy_fmt.format(m, d) for m, d in zip(months, days)]
%timeit pd.to_datetime(dates)

1 loops, best of 3: 808 ms per loop


## Example: Cython

In [62]:
# Reset the NumPy seed before regenerating the synthetic data.
np.random.seed(71)

# String pools to sample from: 100 candidates for 'b', 10000 for 'c'.
chars1 = tm.rands_array(5, 100)
chars2 = tm.rands_array(5, 10000)

# Same three-column layout as the earlier benchmark frame, but with ten
# million rows. This rebinds both `n` and `df`.
n = 10000000
df = pd.DataFrame({'a': np.random.randn(n),
                   'b': tm.choice(chars1, size=n),
                   'c': tm.choice(chars2, size=n)})
df.shape

(10000000, 3)

In [63]:
%timeit (df['b'] + df['c']).where(df['a'] > 0, df['c'] + df['b'])

1 loops, best of 3: 2.85 s per loop


In [64]:
def f2_1(a, b, c):
    """Return ``b + c`` when ``a`` is greater than zero, otherwise ``c + b``."""
    return (b + c) if a > 0 else (c + b)

%timeit pd.Series(np.vectorize(f2_1)(df['a'], df['b'], df['c']), index=df.index)

1 loops, best of 3: 5.18 s per loop


In [65]:
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [66]:
%%cython

import numpy as np
from numpy cimport ndarray
import cython

@cython.wraparound(False)
@cython.boundscheck(False)
cpdef f2_cython(ndarray[double, ndim=1] a,
                ndarray[object, ndim=1] b,
                ndarray[object, ndim=1] c):
    """Element-wise ``b + c`` where ``a > 0`` and ``c + b`` elsewhere.

    Parameters
    ----------
    a : ndarray[double, ndim=1]
        Values compared against zero.
    b, c : ndarray[object, ndim=1]
        Operands combined with ``+``; must have the same length as ``a``.

    Returns
    -------
    ndarray[object, ndim=1]
        New object array of ``len(a)`` results.

    Raises
    ------
    ValueError
        If ``b`` or ``c`` differs in length from ``a``.
    """
    cdef:
        # Py_ssize_t is the index type matching array lengths on this platform.
        Py_ssize_t i, length = a.shape[0]
        double aval
        object bval, cval
        ndarray[object, ndim=1] result

    # Bounds checking is switched off for this function, so mismatched inputs
    # must be rejected up front instead of being read past their end.
    if b.shape[0] != length or c.shape[0] != length:
        raise ValueError("a, b and c must have the same length")

    result = np.empty(length, dtype=object)

    for i in range(length):
        aval = a[i]
        bval = b[i]
        cval = c[i]
        if aval > 0:
            result[i] = bval + cval
        else:
            result[i] = cval + bval

    return result

In [67]:
%timeit pd.Series(f2_cython(df['a'].values, df['b'].values, df['c'].values), index=df.index)

1 loops, best of 3: 1.02 s per loop


In [69]:
# validation
# Compare the three implementations on the first 1000 rows only.
head = df.head(1000)
# Reference result: the scalar f2_1 applied element-wise through np.vectorize.
exp = pd.Series(np.vectorize(f2_1)(head['a'], head['b'], head['c']), index=head.index)
# The pandas `where` expression has to agree with the reference...
tm.assert_series_equal(exp, (head['b'] + head['c']).where(head['a'] > 0, head['c'] + head['b']))
# ...and so does the Cython implementation, fed the underlying arrays.
tm.assert_series_equal(exp, pd.Series(f2_cython(head['a'].values, head['b'].values, head['c'].values), index=head.index))