In [3]:
import numpy as np
import pandas as pd

# Keep rendered frames compact: at most 8 columns and 8 rows are displayed.
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 8)

In [2]:
## P3
# Load 'adult.data'. The file is read with header=None, so the column names
# are assigned explicitly afterwards.
# NOTE(review): the names presumably follow the UCI "Adult" dataset attribute
# list -- confirm against the dataset documentation.
df = pd.read_csv('adult.data', header=None)
# Fixed: the tenth column was misspelled 'sexz'.
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
              'marital-status', 'occupation', 'relationship', 'race', 'sex',
              'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
              'income']
df

Unnamed: 0,age,workclass,fnlwgt,education,...,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,...,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,...,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,...,0,40,United-States,<=50K
3,53,Private,234721,11th,...,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...
32557,40,Private,154374,HS-grad,...,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,...,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,...,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,...,0,40,United-States,>50K


In [3]:
# Select two columns by label, keeping every row.
df.loc[:, ['age', 'marital-status']]

Unnamed: 0,age,marital-status
0,39,Never-married
1,50,Married-civ-spouse
2,38,Divorced
3,53,Married-civ-spouse
...,...,...
32557,40,Married-civ-spouse
32558,58,Widowed
32559,22,Never-married
32560,52,Married-civ-spouse


In [4]:
# Mean hours-per-week for each income group, returned as a one-column frame.
df.groupby('income')[['hours-per-week']].mean()

Unnamed: 0_level_0,hours-per-week
income,Unnamed: 1_level_1
<=50K,38.84021
>50K,45.473026


In [5]:
# NumPy
# The integers 1..4 as a one-dimensional array with an explicit 64-bit integer dtype.
arr = np.arange(1, 5, dtype=np.int64)
arr

array([1, 2, 3, 4])

In [6]:
# Lay the array out as a single row: one row, one column per element.
pd.DataFrame(arr.reshape(1, -1))

Unnamed: 0,0,1,2,3
0,1,2,3,4


## pandas Development

In [7]:
# Small frame with two integer columns and one boolean column, labelled 'a'..'c'.
df = pd.DataFrame(dict(X=[1, 2, 3], Y=[4, 5, 6], Z=[True, False, True]),
                  index=list('abc'))
df

Unnamed: 0,X,Y,Z
a,1,4,True
b,2,5,False
c,3,6,True


In [8]:
# Reorder the rows by label.
df.reindex(index=['b', 'a', 'c'])

Unnamed: 0,X,Y,Z
b,2,5,False
a,1,4,True
c,3,6,True


In [9]:
# Integer positions of the requested labels within the current index.
df.index.get_indexer(['b', 'a', 'c'])

array([1, 0, 2])

In [10]:
# The frame's data as one NumPy array; the recorded output has dtype=object.
df.values

array([[1, 4, True],
       [2, 5, False],
       [3, 6, True]], dtype=object)

In [11]:
# Reorder the rows of the raw array by the positions from get_indexer; the
# recorded output has the same row order as df.reindex(['b', 'a', 'c']).
np.take(df.values, df.index.get_indexer(['b', 'a', 'c']), axis=0)

array([[2, 5, False],
       [1, 4, True],
       [3, 6, True]], dtype=object)

## Performance 

### Environment setup

In [36]:
# Ask NumPy's distutils helper what build information it can find for BLAS.
import numpy.distutils.system_info as sysinfo
sysinfo.get_info('blas')

{'language': 'f77', 'libraries': ['blas'], 'library_dirs': ['/usr/lib']}

In [37]:
# Same query for LAPACK.
sysinfo.get_info('lapack')

{'language': 'f77', 'libraries': ['lapack'], 'library_dirs': ['/usr/lib']}

In [38]:
# Same query for ATLAS; the recorded output is an empty dict.
sysinfo.get_info('atlas')

{}

In [35]:
# Print the Python, OS, pandas and optional-dependency versions of this environment.
pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 2.7.10.final.0
python-bits: 64
OS: Darwin
OS-release: 14.5.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: ja_JP.UTF-8

pandas: 0.16.2
nose: None
Cython: None
numpy: 1.9.3
scipy: None
statsmodels: None
IPython: 4.0.0
sphinx: None
patsy: None
dateutil: 2.4.2
pytz: 2015.6
bottleneck: None
tables: None
numexpr: None
matplotlib: None
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: None
pymysql: None
psycopg2: None


In [5]:
import pandas.util.testing as tm
# Seed NumPy's global generator before any of the random data below is drawn.
np.random.seed(1)

# Pools of strings that the 'y' and 'z' columns are drawn from.
# NOTE(review): rands_array(5, n) presumably yields n random 5-character
# strings (the values shown by df.head() are 5 characters long) -- confirm.
chars1 = tm.rands_array(5, 100)
chars2 = tm.rands_array(5, 10000)

# Number of rows in the benchmark frame.
N = 100000

# One column of random floats plus two columns chosen from the string pools.
df = pd.DataFrame({'x': np.random.randn(N),
                   'y': tm.choice(chars1, size=N),
                   'z': tm.choice(chars2, size=N)})
df.shape

(100000, 3)

In [6]:
# Peek at the first rows of the benchmark frame.
df.head()

Unnamed: 0,x,y,z
0,-0.009448,dNx9K,rnndW
1,-0.08643,EBoZt,9rVow
2,0.145731,yqU2p,x64DG
3,0.71,vU8d4,zSSPs
4,-2.217829,jWCZV,JXd51


### Use built-in

In [7]:
def f1(s):
    """Return the 'y' entry of ``s`` combined with its 'z' entry via ``+``."""
    first = s['y']
    second = s['z']
    return first + second

# Row-wise apply: f1 receives one row at a time (axis=1).
%timeit df.apply(f1, axis=1)

1 loops, best of 3: 5.95 s per loop


In [8]:
# The same concatenation expressed as a single column-wise operation.
%timeit df['y'] + df['z']

10 loops, best of 3: 19.6 ms per loop


In [None]:
# ToDo: where が使える例
%timeit df.apply(lambda x: x[1] if x[0] > 0 else x[2])
%timeit df[1].where(df[0] > 0, df[2])

In [9]:
def f2_1(s):
    """Combine the 'y' and 'z' entries of ``s``, ordered by the sign of 'x'.

    Returns ``s['y'] + s['z']`` when ``s['x'] > 0`` and ``s['z'] + s['y']``
    in every other case.
    """
    lead, trail = ('y', 'z') if s['x'] > 0 else ('z', 'y')
    return s[lead] + s[trail]

# Row-wise apply of the conditional version: f2_1 receives one row at a time (axis=1).
%timeit df.apply(f2_1, axis=1)

1 loops, best of 3: 8.02 s per loop


In [10]:
def f2_2(x, y, z):
    """Return ``y + z`` when ``x`` is positive, otherwise ``z + y``."""
    return y + z if x > 0 else z + y

# Call f2_2 element-wise over the three columns via np.vectorize and wrap the
# resulting array in a Series that reuses the frame's index.
%timeit pd.Series(np.vectorize(f2_2)(df['x'], df['y'], df['z']), index=df.index)

1 loops, best of 3: 138 ms per loop


In [12]:
%load_ext cython

ImportError: No module named cython

In [13]:
%%cython

import numpy as np
from numpy cimport ndarray

def f2_3(ndarray[double, ndim=1] x,
         ndarray[object, ndim=1] y,
         ndarray[object, ndim=1] z):
    """Element-wise conditional combination of ``y`` and ``z``.

    For each position ``i`` the output holds ``y[i] + z[i]`` when
    ``x[i] > 0`` and ``z[i] + y[i]`` otherwise. Returns a new
    one-dimensional object array with ``len(x)`` entries.
    """
    cdef:
        int pos
        int n_items = len(x)
        double flag
        object left, right
        ndarray[object, ndim=1] out = np.empty(n_items, dtype=object)

    for pos in range(n_items):
        flag = x[pos]
        left = y[pos]
        right = z[pos]
        out[pos] = left + right if flag > 0 else right + left

    return out

ERROR: Cell magic `%%cython` not found.


### pandas hash-based functions may be faster than NumPy

In [28]:
x = np.random.randint(1, 10000, n)
%timeit np.unique(x)

10 loops, best of 3: 121 ms per loop


In [29]:
# pandas' unique on the same array, for comparison with np.unique.
%timeit pd.unique(x)

10 loops, best of 3: 22.2 ms per loop


### Use a single operation where possible

In [33]:
# silly example...
# Two chained element-wise operations (+1, then -2).
# NOTE(review): df as built in the data-setup cell has string columns 'y' and
# 'z' -- confirm which frame this timing was recorded on.
%timeit df.add(1).sub(2)

1 loops, best of 3: 349 ms per loop


In [34]:
# One operation with the same arithmetic effect as add(1) followed by sub(2).
%timeit df.sub(1)

10 loops, best of 3: 115 ms per loop


In [34]:
# NOTE(review): same statement as the "silly example" cell, but the recorded
# time differs (6.3 s here vs 349 ms there) -- confirm which frame each run used.
%timeit df.add(1).sub(2)

1 loops, best of 3: 6.3 s per loop


### Avoid object, use Categorical

In [14]:
# Column dtypes at this point; the recorded output shows 'y' and 'z' as object.
df.dtypes

x    float64
y     object
z     object
dtype: object

In [17]:
# Baseline timing: group by 'y' before it is converted to a categorical.
%timeit df.groupby('y').mean()

100 loops, best of 3: 8.46 ms per loop


In [18]:
# Replace 'y' on df with a categorical version of itself (the original object
# column is overwritten), then time the same groupby again.
df['y'] = df['y'].astype('category')
%timeit df.groupby('y').mean()

100 loops, best of 3: 8.14 ms per loop


In [19]:
# The values behind the converted column; the recorded output lists 100 categories.
df['y'].values

[dNx9K, EBoZt, yqU2p, vU8d4, jWCZV, ..., 6C5QF, WYSP2, lfpaq, EBoZt, XQuSp]
Length: 100000
Categories (100, object): [1jP5y, 2GPsw, 2pfrQ, 3sFCE, ..., yx3bX, z1u56, zBRbs, zVIxh]

### Use sorted / unique Index

In [20]:
# Join the frame to itself on the index; both sides have the same index in the same order.
%timeit df.join(df, rsuffix='right_')

100 loops, best of 3: 18.3 ms per loop


In [21]:
# Draw len(df) rows in random order, so the left frame's index is shuffled
# relative to df, then repeat the join.
df2 = df.sample(n=len(df))
%timeit df2.join(df, rsuffix='right_')

10 loops, best of 3: 27.8 ms per loop
