In [None]:
# This notebook is a translation of https://realpython.com/python-pandas-tricks/
# The article covers pandas features that are rarely used day to day but make code more readable.

In [None]:
# 1.Configure Options & Settings at Interpreter Startup

In [3]:
import pandas as pd

def start():
    """Apply this notebook's preferred pandas options.

    Each entry of the nested mapping below is passed to ``pd.set_option``
    under the dotted name ``'<category>.<option>'``.
    """
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': 25,
            'expand_frame_repr': False,
            'max_rows': 14,
            'max_seq_items': 50,
            'precision': 4,
            'show_dimensions': False,
        },
        'mode': {
            'chained_assignment': None,
        },
    }

    # Walk the mapping by key and look each value up when it is applied.
    for category in options:
        for op in options[category]:
            pd.set_option(f'{category}.{op}', options[category][op])

start()

In [4]:
pd.__name__

'pandas'

In [5]:
pd.get_option('display.max_rows')

14

In [8]:
# Load the abalone data set directly from its URL.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
cols = ['sex', 'length', 'diam', 'height', 'weight', 'rings']
# usecols keeps the columns at positions 0-4 and 8; names labels those six columns in order.
abalone = pd.read_csv(url, usecols=[0, 1, 2, 3, 4, 8], names=cols)

In [9]:
abalone

Unnamed: 0,sex,length,diam,height,weight,rings
0,M,0.455,0.365,0.095,0.5140,15
1,M,0.350,0.265,0.090,0.2255,7
2,F,0.530,0.420,0.135,0.6770,9
3,M,0.440,0.365,0.125,0.5160,10
4,I,0.330,0.255,0.080,0.2050,7
5,I,0.425,0.300,0.095,0.3515,8
6,F,0.530,0.415,0.150,0.7775,20
...,...,...,...,...,...,...
4170,M,0.550,0.430,0.130,0.8395,10
4171,M,0.560,0.430,0.155,0.8675,8


In [None]:
# 2.Make Toy Data Structures With Pandas’ Testing Module
# Use the helpers built into pandas' testing module to generate mock data

In [10]:
# NOTE(review): pandas.util.testing was deprecated in pandas 1.0 and removed in 2.0,
# so this import only works on older pandas releases — confirm the target version.
import pandas.util.testing as tm

tm.N, tm.K = 15, 3 # set the default number of rows and columns

In [11]:
import numpy as np

# Seed NumPy's global random state so the random values generated below are repeatable.
np.random.seed(444)

In [12]:
tm.makeTimeDataFrame(freq='M').head()

Unnamed: 0,A,B,C
2000-01-31,0.3574,-0.8804,0.2669
2000-02-29,0.3775,0.1526,-0.4803
2000-03-31,1.3823,0.2503,0.3008
2000-04-30,1.1755,0.0785,-0.1791
2000-05-31,-0.9393,-0.9039,1.1837


In [14]:
tm.makeDataFrame().head()

Unnamed: 0,A,B,C
VlCRv6XfVr,2.4729,0.6187,-0.3711
uAj3cKSdxQ,1.0777,2.3786,0.629
jsm9RASFIW,0.7645,-0.473,-0.8595
b1YAu4W0Lz,-1.2064,-2.1505,1.6401
o4Cw2CPfhT,1.9184,-0.634,-2.3843


In [15]:
# List every attribute of the testing module whose name begins with 'make'.
[name for name in dir(tm) if name.startswith('make')]

['makeBoolIndex',
 'makeCategoricalIndex',
 'makeCustomDataframe',
 'makeCustomIndex',
 'makeDataFrame',
 'makeDateIndex',
 'makeFloatIndex',
 'makeFloatSeries',
 'makeIntIndex',
 'makeIntervalIndex',
 'makeMissingCustomDataframe',
 'makeMissingDataframe',
 'makeMixedDataFrame',
 'makeMultiIndex',
 'makeObjectSeries',
 'makePanel',
 'makePeriodFrame',
 'makePeriodIndex',
 'makePeriodPanel',
 'makePeriodSeries',
 'makeRangeIndex',
 'makeStringIndex',
 'makeStringSeries',
 'makeTimeDataFrame',
 'makeTimeSeries',
 'makeTimedeltaIndex',
 'makeUIntIndex',
 'makeUnicodeIndex']

In [16]:
# 3.Take Advantage of Accessor Methods
# An accessor here can be understood as an interface through which other, type-specific methods are reached

In [17]:
pd.Series._accessors
# .cat --> categorical data, .str --> string data, .dt --> datetime data

{'cat', 'dt', 'str'}

In [18]:
# Sample address strings used by the .str accessor examples.
addr = pd.Series([
    'Washington, D.C. 20003',
    'Brooklyn, NY 11211-1755',
    'Omaha, NE 68154',
    'Pittsburgh, PA 15211'
    ])

# Upper-case every element through the .str accessor.
addr.str.upper()

0     WASHINGTON, D.C. 20003
1    BROOKLYN, NY 11211-1755
2            OMAHA, NE 68154
3       PITTSBURGH, PA 15211
dtype: object

In [19]:
addr.str.count(r'\d')

0    5
1    9
2    5
3    5
dtype: int64

In [24]:
# The named groups (city, state, zip) become the column names of the frame
# returned by str.extract.
regex = (r'(?P<city>[A-Za-z ]+), (?P<state>[A-Z]{2}) (?P<zip>\d{5}(?:-\d{4})?)')

# regex=False makes '.' a literal period; interpreted as a regular expression it
# would match any character. Being explicit avoids relying on the version-dependent
# default of the `regex` argument.
# Stripping the periods turns 'D.C.' into 'DC' so it fits the two-letter state group.
addr.str.replace('.', '', regex=False).str.extract(regex)

Unnamed: 0,city,state,zip
0,Washington,DC,20003
1,Brooklyn,NY,11211-1755
2,Omaha,NE,68154
3,Pittsburgh,PA,15211


In [25]:
# Nine consecutive quarter-end dates starting in 2017, wrapped in a Series
# so the .dt accessor can be used on them.
# NOTE(review): pandas 2.2+ deprecates the 'Q' alias in favour of 'QE' — confirm the target version.
daterng = pd.Series(pd.date_range('2017', periods=9, freq='Q'))

daterng

0   2017-03-31
1   2017-06-30
2   2017-09-30
3   2017-12-31
4   2018-03-31
5   2018-06-30
6   2018-09-30
7   2018-12-31
8   2019-03-31
dtype: datetime64[ns]

In [26]:
daterng.dt.day_name()

0      Friday
1      Friday
2    Saturday
3      Sunday
4    Saturday
5    Saturday
6      Sunday
7      Monday
8      Sunday
dtype: object

In [27]:
daterng[daterng.dt.quarter > 2]

2   2017-09-30
3   2017-12-31
6   2018-09-30
7   2018-12-31
dtype: datetime64[ns]

In [28]:
daterng[daterng.dt.is_year_end]

3   2017-12-31
7   2018-12-31
dtype: datetime64[ns]

In [29]:
# 4. Create a DatetimeIndex From Component Columns

In [30]:
from itertools import product

datecols = ['year', 'month', 'day']

# One row per (year, month, day) combination, in the order product() yields them.
df = pd.DataFrame(
    data=[combo for combo in product([2017, 2016], [1, 2], [1, 2, 3])],
    columns=datecols,
)

df

Unnamed: 0,year,month,day
0,2017,1,1
1,2017,1,2
2,2017,1,3
3,2017,2,1
4,2017,2,2
5,2017,2,3
6,2016,1,1
7,2016,1,2
8,2016,1,3
9,2016,2,1


In [31]:
# Add a 'data' column holding one random draw (np.random.randn) per row.
df['data'] = np.random.randn(len(df))

df

Unnamed: 0,year,month,day,data
0,2017,1,1,0.0865
1,2017,1,2,-0.2222
2,2017,1,3,-0.8021
3,2017,2,1,0.1156
4,2017,2,2,-1.2214
5,2017,2,3,-0.5283
6,2016,1,1,1.9551
7,2016,1,2,-0.2408
8,2016,1,3,0.6773
9,2016,2,1,2.715


In [32]:
# pd.to_datetime assembles one timestamp per row from the year/month/day columns;
# the result replaces the default integer index.
df.index = pd.to_datetime(df[datecols])

df.head()

Unnamed: 0,year,month,day,data
2017-01-01,2017,1,1,0.0865
2017-01-02,2017,1,2,-0.2222
2017-01-03,2017,1,3,-0.8021
2017-02-01,2017,2,1,0.1156
2017-02-02,2017,2,2,-1.2214


In [33]:
# Drop the component columns now that the index carries the date. Only 'data' remains,
# so squeeze() collapses the one-column frame into a Series.
# Note: from this cell on, the name `df` refers to a Series, not a DataFrame.
df = df.drop(datecols, axis=1).squeeze()

df.head()

2017-01-01    0.0865
2017-01-02   -0.2222
2017-01-03   -0.8021
2017-02-01    0.1156
2017-02-02   -1.2214
Name: data, dtype: float64

In [34]:
# Index.dtype_str was deprecated in pandas 0.25 and removed in 1.0;
# str(index.dtype) gives the same string and works on every version.
str(df.index.dtype)

'datetime64[ns]'

In [35]:
# 5. Use Categorical Data to Save on Time and Space
# This is the .cat accessor mentioned in section 3

In [36]:
# Ten color labels with repeats (five distinct values), stored as plain Python strings.
colors = pd.Series([
    'periwinkle',
    'mint green',
    'burnt orange',
    'periwinkle',
    'burnt orange',
    'rose',
    'rose',
    'mint green',
    'rose',
    'navy'
    ])

In [37]:
import sys

# Size in bytes of each element's Python string object, as reported by sys.getsizeof.
colors.apply(sys.getsizeof)

0    59
1    59
2    61
3    59
4    61
5    53
6    53
7    59
8    53
9    53
dtype: int64

In [38]:
# Give each distinct color an integer code, numbered in order of first appearance.
mapper = {color: code for code, color in enumerate(colors.unique())}

mapper

{'periwinkle': 0, 'mint green': 1, 'burnt orange': 2, 'rose': 3, 'navy': 4}

In [39]:
as_int = colors.map(mapper)

as_int

0    0
1    1
2    2
3    0
4    2
5    3
6    3
7    1
8    3
9    4
dtype: int64

In [40]:
as_int.apply(sys.getsizeof)

0    24
1    28
2    28
3    24
4    28
5    28
6    28
7    28
8    28
9    28
dtype: int64

In [None]:
# 