In [1]:
import re

import numpy as np
import pandas as pd

# 11.3.1 文本分隔

In [3]:
# Sample nullable-string Series with one missing value, used by the split examples
s = pd.Series(['天_地_人','你_我_他',np.nan,'坟_墓_货'], dtype='string')
s

0    天_地_人
1    你_我_他
2     <NA>
3    坟_墓_货
dtype: string

In [4]:
# Split each string on '_' into a list; the missing value stays <NA>
s.str.split('_')

0    [天, 地, 人]
1    [你, 我, 他]
2         <NA>
3    [坟, 墓, 货]
dtype: object

In [6]:
# Take the second item of each row's split result
s.str.split('_').str[1]
# s.str.split('_').str.get(1)  # equivalent

0       地
1       我
2    <NA>
3       墓
dtype: object

In [7]:
# Plain [] indexes the Series itself: take the row labelled 1 (the second row)
s.str.split('_')[1]

['你', '我', '他']

In [10]:
# .str[1:3] slices inside each row's list, keeping the items at positions 1 and 2
s.str.split('_').str[1:3]

0    [地, 人]
1    [我, 他]
2      <NA>
3    [墓, 货]
dtype: object

In [11]:
# With no separator, split() splits on whitespace; these strings contain none,
# so each row stays a single item
s.str.split()

0    [天_地_人]
1    [你_我_他]
2       <NA>
3    [坟_墓_货]
dtype: object

# 11.3.2 字符分隔展开

In [12]:
# expand=True spreads the split pieces across DataFrame columns instead of lists
s.str.split('_', expand=True)

Unnamed: 0,0,1,2
0,天,地,人
1,你,我,他
2,,,
3,坟,墓,货


In [13]:
# n caps the number of splits: n=1 splits once, giving two columns
s.str.split('_', expand=True, n=1)

Unnamed: 0,0,1
0,天,地_人
1,你,我_他
2,,
3,坟,墓_货


In [14]:
# rsplit splits starting from the right-hand end
s.str.rsplit('_', expand=True, n=1)

Unnamed: 0,0,1
0,天_地,人
1,你_我,他
2,,
3,坟_墓,货


In [18]:
# Single string containing two different separator characters (和 and 及)
s = pd.Series(['你和我及他'])
s

0    你和我及他
dtype: object

In [20]:
# The separator may be a regular expression: split on either 和 or 及.
# Neither character is a regex metacharacter, so no backslash escape is needed.
s.str.split(r"和|及", expand=True)

Unnamed: 0,0,1,2
0,你,我,他


# 11.3.3 文本切片选择

In [38]:
# Sample Series for the slicing examples
s = pd.Series(['sum','moon','localhost'])
s

0          sum
1         moon
2    localhost
dtype: object

In [39]:
# Drop the first character
s.str.slice(1)
# s.str.slice(start=1)  # equivalent

0          um
1         oon
2    ocalhost
dtype: object

In [40]:
# No arguments: the strings are returned unchanged
s.str.slice()

0          sum
1         moon
2    localhost
dtype: object

In [41]:
# Keep only the last character
s.str.slice(start=-1)
# s.str[-1]  # equivalent

0    m
1    n
2    t
dtype: object

In [42]:
# Cut everything from the third character on, i.e. keep the first two characters
s.str.slice(stop=2)
# s.str[:2]  # equivalent

0    su
1    mo
2    lo
dtype: object

In [43]:
# Step of 2: keep every other character, starting with the first
# (e.g. 'localhost' -> 'lclot')
s.str.slice(step=2)

0       sm
1       mo
2    lclot
dtype: object

# 11.3.4 文本划分

In [44]:
# Sample sentences for the partition examples
s = pd.Series(['how are you','what are you doing'])
s

0           how are you
1    what are you doing
dtype: object

In [45]:
# Split into three parts at the first separator (a space by default):
# text before, the separator itself, text after
s.str.partition()

Unnamed: 0,0,1,2
0,how,,are you
1,what,,are you doing


In [46]:
# rpartition splits at the last separator instead of the first
s.str.rpartition()

Unnamed: 0,0,1,2
0,how are,,you
1,what are you,,doing


In [54]:
# Partition around an explicit separator string
s.str.partition('are')

Unnamed: 0,0,1,2
0,how,are,you
1,what,are,you doing


In [55]:
# expand=False returns each row as a tuple instead of DataFrame columns
s.str.partition('you', expand=False)

0           (how are , you, )
1    (what are , you,  doing)
dtype: object

In [56]:
# Sample Index with string labels
idx = pd.Index(['A 123', 'B 456'])
idx

Index(['A 123', 'B 456'], dtype='object')

In [57]:
# Partitioning an Index yields a MultiIndex
idx.str.partition()

MultiIndex([('A', ' ', '123'),
            ('B', ' ', '456')],
           )

# 11.3.5 文本替换

In [58]:
# Sample amounts containing a currency sign and a thousands separator
s= pd.Series(['10','-¥20','¥3,000'], dtype='string')
s

0        10
1      -¥20
2    ¥3,000
dtype: string

In [59]:
# Remove the currency sign
s.str.replace('¥', '')

0       10
1      -20
2    3,000
dtype: string

In [62]:
# Regex replacement: strip both the currency sign and the thousands separator.
# A character class matches either character; neither needs a backslash escape.
s.str.replace(r'[¥,]', '', regex=True)

0      10
1     -20
2    3000
dtype: string

# 11.3.6 指定替换

In [78]:
# Sample strings of increasing length
s = pd.Series(['a','bc','cdf','defg'])
s

0       a
1      bc
2     cdf
3    defg
dtype: object

In [79]:
# Keep the first character; replace the rest with 'T'
# (or append 'T' when nothing follows the first character)
s.str.slice_replace(1, repl='T')

0    aT
1    bT
2    cT
3    dT
dtype: object

In [80]:
# Replace everything before position 2 (the first two characters) with 'T';
# shorter strings become just 'T'
s.str.slice_replace(stop=2, repl='T')

0      T
1      T
2     Tf
3    Tfg
dtype: object

In [85]:
# Replace only the given range: the second character becomes 'T';
# if there is no second character, 'T' is appended
s.str.slice_replace(start=1, stop=2 ,repl='T')

0      aT
1      bT
2     cTf
3    dTfg
dtype: object

In [82]:
# Display s: the slice_replace calls returned new Series and left s unchanged
s

0       a
1      bc
2     cdf
3    defg
dtype: object

# 11.3.7 重复替换

In [86]:
# Series.repeat repeats each element, producing two rows per original value
pd.Series(['a','b','c']).repeat(repeats=2)

0    a
0    a
1    b
1    b
2    c
2    c
dtype: object

In [87]:
# .str.repeat repeats the text inside each row instead
pd.Series(['a','b','c']).str.repeat(repeats=2)

0    aa
1    bb
2    cc
dtype: object

In [89]:
# A list gives a separate repeat count for each row
pd.Series(['a','b','c']).str.repeat(repeats=[1,2,3])

0      a
1     bb
2    ccc
dtype: object

# 11.3.8 文本连接

In [91]:
# Sample Series for the concatenation examples
s = pd.Series(['x','y','z'], dtype='string')
s

0    x
1    y
2    z
dtype: string

In [92]:
# With no arguments, cat() joins all values into a single string
s.str.cat()

'xyz'

In [93]:
# Join all values with a separator
s.str.cat(sep=',')

'x,y,z'

In [95]:
# Second Series, containing a missing value
t = pd.Series(['h','i',np.nan,'k'], dtype='string')
t

0       h
1       i
2    <NA>
3       k
dtype: string

In [96]:
# Missing values are skipped by default
t.str.cat(sep=',')

'h,i,k'

In [97]:
# na_rep substitutes a placeholder for missing values
t.str.cat(sep=',', na_rep='-')

'h,i,-,k'

In [98]:
# Rebuild `s` with a missing value at position 2 so that it has the same
# length as `t`; the recorded outputs from here on rely on this 4-row `s`,
# which the 3-row definition earlier in this section does not provide.
s = pd.Series(['x', 'y', np.nan, 'z'], dtype='string')
# Place the two Series side by side as DataFrame columns
pd.concat([s, t], axis=1)

Unnamed: 0,0,1
0,x,h
1,y,i
2,,
3,z,k


In [99]:
# Concatenate s with both columns of the DataFrame (s again, then t),
# filling missing values with '-'
s.str.cat(pd.concat([s,t], axis=1), na_rep='-')

0    xxh
1    yyi
2    ---
3    zzk
dtype: string

In [100]:
# Display the current value of s
s

0       x
1       y
2    <NA>
3       z
dtype: string

In [101]:
# Series with an explicit index whose label order is 1, 0, 2
h = pd.Series(['b','d','a'],index=[1,0,2],dtype='string')
h

1    b
0    d
2    a
dtype: string

In [102]:
# Values are aligned on index labels; by default the result follows
# the index of the left (calling) Series
s.str.cat(h)

0      xd
1      yb
2    <NA>
3    <NA>
dtype: string

In [103]:
# Display the current value of s
s

0       x
1       y
2    <NA>
3       z
dtype: string

In [104]:
# join='right' makes the result follow the index of the other Series
s.str.cat(h,join='right')

1      yb
0      xd
2    <NA>
dtype: string

# 11.3.9 文本查询

In [105]:
# Sample Series for the search examples
s = pd.Series(['One','Two','Three'])
s

0      One
1      Two
2    Three
dtype: object

In [106]:
# Find all occurrences of the pattern in each row; rows with no match give []
s.str.findall('T')

0     []
1    [T]
2    [T]
dtype: object

In [107]:
# Matching is case-sensitive, so 'ONE' is not found
s.str.findall('ONE')

0    []
1    []
2    []
dtype: object

In [108]:
# Case-insensitive search via the re.IGNORECASE flag
s.str.findall('ONE', flags=re.IGNORECASE)

0    [One]
1       []
2       []
dtype: object

In [109]:
# Find rows ending in 'o'
s.str.findall('o$')

0     []
1    [o]
2     []
dtype: object

In [110]:
# find() returns the position of the match, counted from 0, or -1 if there is none
s.str.find('One')

0    0
1   -1
2   -1
dtype: int64

In [111]:
# Position of 'e' in each row (-1 where absent)
s.str.find('e')

0    2
1   -1
2    3
dtype: int64

# 11.3.10 文本包含

In [113]:
# Load the sample table from an Excel file located one directory up
df = pd.read_excel('../team2.xlsx')
df

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue
4,Oah,D,93,49,44,5,henan,black
5,Harlie,C,24,13,87,6,hebei,pink


In [114]:
# Sample Series including a missing value
s = pd.Series(['One','Two','Three',np.nan])
s

0      One
1      Two
2    Three
3      NaN
dtype: object

In [115]:
# Literal (non-regex) substring test; the missing value gives NaN
s.str.contains('o', regex=False)

0    False
1     True
2    False
3      NaN
dtype: object

In [116]:
# Select the rows whose name contains 'A'
df.loc[df.name.str.contains('A')]

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple


In [119]:
# '|' is regex alternation: names containing 'A' or 'E'
df.loc[df.name.str.contains('A|E')]

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue


In [120]:
# Same alternation matched case-insensitively; every name contains an
# a/A or e/E, so all rows are returned
df.loc[df.name.str.contains('A|E', flags=re.IGNORECASE)]

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue
4,Oah,D,93,49,44,5,henan,black
5,Harlie,C,24,13,87,6,hebei,pink


In [121]:
# Regex test for names containing a digit (none do, so the result is empty).
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
df.loc[df.name.str.contains(r'\d')]

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color


In [123]:
# Display the current value of s
s

0      One
1      Two
2    Three
3      NaN
dtype: object

In [124]:
# Test whether each string starts with 'O'
s.str.startswith('O')

0     True
1    False
2    False
3      NaN
dtype: object

In [126]:
# na=False reports missing values as False
s.str.startswith('O', na=False)

0     True
1    False
2    False
3    False
dtype: bool

In [127]:
# Test whether each string ends with 'e'
s.str.endswith('e')

0     True
1    False
2     True
3      NaN
dtype: object

In [128]:
# Match each string against a regex, tested from the start of the string:
# a digit followed by a lowercase letter
pd.Series(['1','3a','4a','f5','12c'], dtype='string').str.match(r'[0-9][a-z]')

0    False
1     True
2     True
3    False
4    False
dtype: boolean

# 11.3.11 文本提取

In [129]:
# Sample Series for the extraction examples
s = pd.Series(['a1','b2','c3'], dtype='string')
s

0    a1
1    b2
2    c3
dtype: string

In [130]:
# Regex with two groups: first an 'a' or 'b', then a digit.
# 'c3' does not match, so its row (the last one) is empty.
s.str.extract(r'([ab])(\d)', expand=True)

Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [131]:
# Making the letter group optional lets 'c3' match: its digit is extracted
# and the letter is missing
s.str.extract(r'([ab])?(\d)')

Unnamed: 0,0,1
0,a,1
1,b,2
2,,3


In [132]:
# Named groups become the column names
s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')

Unnamed: 0,letter,digit
0,a,1.0
1,b,2.0
2,,


In [143]:
# Strings containing one or two letter-digit pairs, with a labelled index
s = pd.Series(['a1a2','b1b7','c1'], index=['A','B','C'], dtype='string')
# Pattern with two named groups: a lowercase letter followed by a digit
two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
# extract() returns only the first match per row
s.str.extract(two_groups, expand=True)


Unnamed: 0,letter,digit
A,a,1
B,b,1
C,c,1


In [144]:
# extractall() returns every match, one row per match,
# indexed by (original label, match number)
s.str.extractall(two_groups)

Unnamed: 0_level_0,Unnamed: 1_level_0,letter,digit
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,a,1
A,1,a,2
B,0,b,1
B,1,b,7
C,0,c,1


# 11.3.12 提取虚拟变量

In [145]:
# Sample Series of '/'-separated labels with a missing value
s = pd.Series(['a/b','b/c',np.nan,'c'], dtype='string')
s

0     a/b
1     b/c
2    <NA>
3       c
dtype: string

In [146]:
# Build 0/1 indicator columns, one per distinct label
s.str.get_dummies(sep='/')

Unnamed: 0,a,b,c
0,1,1,0
1,0,1,1
2,0,0,0
3,0,0,1


In [149]:
# The same on an Index; the result is a MultiIndex
idx = pd.Index(['a/b','b/c',np.nan,'c'])
idx.str.get_dummies(sep='/')

MultiIndex([(1, 1, 0),
            (0, 1, 1),
            (0, 0, 0),
            (0, 0, 1)],
           names=['a', 'b', 'c'])

In [None]:
# 