# 單一個字範例

In [1]:
import re

pattern = r'is'
string = "This is a apple"
match = re.findall(pattern, string)
print(match)

['is', 'is']


# 特殊字元要加 \ 跳脫
\ ^ $ . | ? * + ( ) [ {

In [2]:
string = "1+2=3"
pattern = r'1\+2=3'
match = re.findall(pattern, string)
print(match)

['1+2=3']


# 任意字元 .

In [10]:
pattern = r'a.man'
string = "I'm a man"
match = re.findall(pattern, string)
print(match)

['a man']


In [9]:
pattern = r'.a'
string = "banana"
match = re.findall(pattern, string)
print(match)

['ba', 'na', 'na']


# 多個字元 []

In [7]:
pattern = r'[aA]'
string = "A apple"
match = re.findall(pattern, string)
print(match)

['A', 'a']


In [8]:
pattern = r'[aeiou]'
string = "There is a apple"
match = re.findall(pattern, string)
print(match)

['e', 'e', 'i', 'a', 'a', 'e']


# 多個字元 - 連續

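a-zA-Z 要分開寫，而不能直接寫 a-Z：在 ascii code 裡大寫 A-Z (65-90) 排在小寫 a-z (97-122) 之前，a-Z 是由大到小的範圍，re 會直接報錯 (bad character range)。另外也不能寫成 A-z，因為 Z 到 a 之間夾雜了 [ \ ] ^ _ ` 這六個其他字元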

In [11]:
pattern = r'[a-z]' # 等同 [abcdefghijklmnopqrstuvwxyz]
string = "There is a apple"
match = re.findall(pattern, string)
print(match)

['h', 'e', 'r', 'e', 'i', 's', 'a', 'a', 'p', 'p', 'l', 'e']


In [13]:
pattern = r'[a-zA-Z]'
string = "I'am 20 years old."
match = re.findall(pattern, string)
print(match)

['I', 'a', 'm', 'y', 'e', 'a', 'r', 's', 'o', 'l', 'd']


In [14]:
pattern = r'[5-8]' # 等同 [5678]
string = "0980987163"
match = re.findall(pattern, string)
print(match)

['8', '8', '7', '6']


# 多個字元 ^ 不是 xx

In [15]:

pattern = r'[^a]' # not a 的其他字元
string = "There is a Apple 123"
match = re.findall(pattern, string)
print(match)

['T', 'h', 'e', 'r', 'e', ' ', 'i', 's', ' ', ' ', 'A', 'p', 'p', 'l', 'e', ' ', '1', '2', '3']


In [16]:
pattern = r'[^0-9]'
string = "I'am 20 years old."
match = re.findall(pattern, string)
print(match)

['I', "'", 'a', 'm', ' ', ' ', 'y', 'e', 'a', 'r', 's', ' ', 'o', 'l', 'd', '.']


# 多個字元的縮寫
\d - digit [0-9]
\w - word [A-Za-z0-9_] (Python 3 的 str 樣式預設還會比對 Unicode 文字，例如中文；加上 re.ASCII 才只限這些字元)
\s - space [ \t\n\r\f\v] (包含空白字元本身)

In [17]:
pattern = r'\we'
string = "There is a apple"
match = re.findall(pattern, string)
print(match)

['he', 're', 'le']


In [18]:
pattern = r'\d\d' # 其實等於 \d{2}  (次數的概念)
string = "I'am 20 5 years old."
match = re.findall(pattern, string)
print(match)

['20']


# 多個字元縮寫 non
\D - non-digit [^\d]
\W - non-word [^\w]
\S - non-space [^\s]

In [19]:
pattern = r'\W' 
string = "I'am 20 years old."
match = re.findall(pattern, string)
print(match)

["'", ' ', ' ', ' ', '.']


In [20]:
string = "1+2=3"
pattern = r'\D' # 等同 r'[^\d]'
match = re.findall(pattern, string)
print(match)

['+', '=']


# 出現次數
*  任意次數 (含零次)
+ 至少一次
?  零或一次

In [22]:
string = "Hello World"
pattern = r'l*' # l 出現零次以上 (含零次，所以也會比對到空字串)
match = re.findall(pattern, string)
print(match)

# 為何空的結果也會進來？結果裡的 '' 是空字串 (不是空白字元)：* 允許零次，
# 所以在每個不是 l 的位置 (以及字串結尾) 都會比對成功並得到一個空字串；若不想要空字串請改用 l+

['', '', 'll', '', '', '', '', '', 'l', '', '']


In [23]:
string = "banana"
pattern = r'n?a'
match = re.findall(pattern, string)
print(match)

['a', 'na', 'na']


# 出現次數  {次數}  {最少次數, 最多次數}

In [26]:
pattern = r'l{2}'
string = "Hello World"
match = re.findall(pattern, string)
print(match)

['ll']


In [27]:
pattern = r'l{1,}'
string = "Hello World"
match = re.findall(pattern, string)
print(match)

['ll', 'l']


# 頭尾      
^ 開頭
$ 結尾

注意：^ 若是寫在多個字元裡，也就是用 [] 包起來的開頭位置，則是指 not，比如 [^0-9] 代表不是數字

In [29]:
pattern = r'^He'
string = "Hello Hello"
match = re.findall(pattern, string)
print(match)

['He']


In [30]:
pattern = r'llo$'
string = "Hello Hello"
match = re.findall(pattern, string)
print(match)

['llo']


In [32]:
pattern = r'^He.*llo$'   # .* 代表任意字元無數次
string = "Hello Hello"
match = re.findall(pattern, string)
print(match)

['Hello Hello']


# 或 |  (以 | 兩側的整段樣式為單元，由左到右嘗試，並且前面符合就不會往後面看)

In [39]:
pattern = r'and|android' # 前面先符合就會先被選走
string = "iOS and android"
match = re.findall(pattern, string)
print(match)

['and', 'and']


In [40]:
pattern = r'and | android' # 含空白，因為後面 android 也要含空白，空白已被前面 "and " 選走，所以選取不到
string = "iOS and android"
match = re.findall(pattern, string)
print(match)

['and ']


In [41]:
pattern = r'and |android' # 含空白，後面 android 不含空白，所以就選到了
string = "iOS and android"
match = re.findall(pattern, string)
print(match)

['and ', 'android']


In [43]:
pattern = r'android|and' # 最正確的版本，比較精準的放前面，注意不要有空白
string = "iOS and android"
match = re.findall(pattern, string)
print(match)

['and', 'android']


# 常用例子 - 西元生日  =>  1996-08-06

In [47]:
pattern = r'[1-9]\d{3}-\d{2}-\d{2}' 
string = "1996-08-06"
match = re.findall(pattern, string)
print(match)

['1996-08-06']


# 常用例子 - 身分證字號  =>  A123456789

In [54]:
pattern = r'^[A-Z]\d{9}$' 
string = "A123456789"
match = re.findall(pattern, string)
print(match)

['A123456789']


# 常用例子 - Gmail 信箱  =>  test123@gmail.com

In [62]:
pattern = r'^\w+@gmail\.com$' 
string = "test123@gmail.com"
match = re.findall(pattern, string)
print(match)

['test123@gmail.com']


# 常用例子 - 四則運算  "1+6/3-2"

In [63]:
pattern = r'^[\d\+\-\*\/]*$' 
string = "1+6/3-2"
match = re.findall(pattern, string)
print(match)

['1+6/3-2']


# 額外補充

In [72]:
# \b  boundary
pattern = r'\ba\b' 
string = "This is a apple"
match = re.findall(pattern, string)
print(match)

['a']


In [69]:
# 把含有 a 的單字都挑出來 (a 的左右兩邊可以接任意個非空白字元)
pattern = r'\S*a\S*' 
string = "This is a apple"
match = re.findall(pattern, string)
print(match)

['a', 'apple']


In [71]:
# 把含有 a 的單字都挑出來 (a 的左右兩邊可以接任意個 \w 字元)
pattern = r'\w*a\w*' 
string = "This is a apple"
match = re.findall(pattern, string)
print(match)

['a', 'apple']


# 字串取代

In [76]:
import re

string = 'bananb'
string = re.sub(r'n.', 'NA', string)
print(string)

baNANA


In [79]:
import re

string = 'bananb'
string = re.sub(r'n.*', 'NA', string)
print(string)

baNA


# 選取 () - 除了 match 之外，還會把括號內比對到的內容存成群組 (像放進變數)，在取代字串裡用 \1、\2 … 依序取回

In [84]:
string = 'apple'
string = re.sub(r'(a.)(p.)e', r'\1\2', string)
print(string)

appl


In [90]:
string = 'apple'
string = re.sub(r'(a.)(p.)e', r'\1\1', string)
print(string)

apap


In [100]:
string = '2017/05/16'
string = re.sub(r'(\d{4})\/(\d{2})\/(\d{2})', r'\1\2', string)
print(string)

201705


In [101]:
string = "<div> 123 </div>"
string = re.sub(r'(<\/?)(div)(>)', r'\1span\3', string)
print(string)

<span> 123 </span>


# 列出檔名以 5、7 或 9 結尾的檔案 (這是 shell 指令，不是 Python)
ls -l | grep '[579]$'