In [1]:
import pandas as pd

# 1.Python内建字符串方法：

#### split

In [2]:
# Sample comma-separated string; note the two spaces in front of "cdef".
val = "a,b,  cdef"

# Splitting on "," keeps whatever whitespace surrounds each piece.
val.split(sep=",")

['a', 'b', '  cdef']

#### split + strip

In [3]:
# Split on commas, then trim the whitespace around every piece.
pieces = list(map(str.strip, val.split(",")))
pieces

['a', 'b', 'cdef']

#### 使用加法与双冒号分隔符连接子字符串

In [4]:
# Unpack the three pieces, then glue them back together with "+".
first, second, third = pieces
delimiter = "::"
first + delimiter + second + delimiter + third

'a::b::cdef'

#### join

In [5]:
# str.join glues the items of `pieces` together with "::" between them.
"::".join(pieces)

'a::b::cdef'

#### in + index + find（找不到子串时，index 抛出 ValueError，find 则返回 -1）

In [6]:
# The `in` operator tests whether "a" occurs anywhere in `val`.
"a" in val

True

In [7]:
# str.index gives the position of the first "," in `val`.
val.index(",")

1

In [8]:
# str.find signals "not found" by returning -1 instead of raising.
val.find(":")

-1

In [9]:
# str.index raises ValueError when the substring is absent (str.find would
# return -1 instead). Catch and display the error so the demonstration is
# kept while the notebook still runs top to bottom on a fresh kernel.
try:
    val.index(":")
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: substring not found

#### count

In [10]:
# Number of non-overlapping occurrences of "," in `val`.
val.count(",")

2

#### replace

In [11]:
# Substitute every "," with "::"; a new string is returned and `val` is not modified.
val.replace(",", "::")

'a::b::  cdef'

In [12]:
# Replacing with the empty string deletes every "," from the result.
val.replace(",", "")

'ab  cdef'

# 2.正则表达式：

In [13]:
import re

In [14]:
# Words separated by irregular runs of spaces and tabs.
text = "foo    bar\t baz  \tqux"

# Break the string apart on one or more whitespace characters.
re.split(pattern=r"\s+", string=text)

['foo', 'bar', 'baz', 'qux']

#### 使用 re.compile 预先编译正则表达式，得到一个可重用的正则表达式对象

In [15]:
# Compile the whitespace pattern once so the resulting object can be reused.
regex = re.compile(r"\s+")

# Splitting through the compiled object; equivalent to calling re.split
# with the same pattern string.
regex.split(text)

['foo', 'bar', 'baz', 'qux']

#### findall

In [16]:
# findall returns every substring of `text` that matched the compiled pattern.
regex.findall(text)

['    ', '\t ', '  \t']

#### findall + match + search（findall 返回所有匹配，search 只返回第一个匹配，match 只从字符串开头匹配）

In [17]:
# Four "name address" records, one per line (no trailing newline).
text = "\n".join(
    [
        "Dave dave@google.com",
        "Steve steve@gmail.com",
        "Rob rob@gmail.com",
        "Ryan ryan@yahoo.com",
    ]
)

# Simplified e-mail pattern: local part, "@", domain, ".", then a 2-4 letter
# suffix. The character classes list only upper-case letters, so the pattern
# must be compiled case-insensitively to match lower-case addresses.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [18]:
# re.IGNORECASE makes the regular expression case-insensitive.
regex = re.compile(pattern, re.IGNORECASE)

In [19]:
# Collect every substring of `text` that matches the compiled pattern, as a list of strings.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [20]:
# search scans the whole string and returns a match object for the first hit only.
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [21]:
# start()/end() give the match's span inside `text`; slicing with them recovers the matched substring.
text[m.start():m.end()]

'dave@google.com'

In [22]:
# match only succeeds when the pattern matches at the very beginning of the
# string; otherwise it returns None. print() is used because a bare None
# produces no cell output.
print(regex.match(text))

None


#### sub

In [23]:
# sub returns a new string with every match replaced by "REDACTED";
# print() renders the embedded newlines instead of showing them as \n.
print(regex.sub("REDACTED", text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


#### 使用括号分组：将每个匹配拆分为用户名、域名和域名后缀

In [24]:
# Parentheses create three capture groups: the part before "@", the part
# between "@" and the last ".", and the 2-4 letter suffix.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
regex = re.compile(pattern, flags=re.IGNORECASE)
# With groups in the pattern, findall returns one tuple of groups per match.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [25]:
# groups() returns the captured pieces of a single match as a tuple.
# NOTE(review): match() returns None when the input does not match at its
# start, and .groups() would then raise AttributeError; this fixed literal matches.
m = regex.match("wesm@bright.net")
m.groups()

('wesm', 'bright', 'net')

In [26]:
# \1, \2 and \3 in the replacement refer to the first, second and third
# capture group of each match.
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com
