# 정규 표현식 실습

In [1]:
import re

## . 기호

In [2]:
# '.' stands for exactly one arbitrary character, so "a.c" needs 'a', any one character, then 'c'.
r = re.compile("a.c")
r.search('kkk')  # no such three-character sequence in 'kkk' -> None, so nothing is displayed

In [3]:
# 'b' fills the single-character wildcard slot, so the whole string matches.
r.search('abc')

<re.Match object; span=(0, 3), match='abc'>

## ? 기호

In [4]:
# '?' makes the preceding character optional: 'b' may appear zero or one time.
r = re.compile('ab?c')
r.search('abbc')  # two 'b's exceed the limit -> None, so nothing is displayed

In [6]:
# Exactly one 'b' is allowed by '?', so this matches.
r.search('abc')

<re.Match object; span=(0, 3), match='abc'>

In [7]:
# Zero 'b's is also allowed by '?', so this matches.
r.search('ac')

<re.Match object; span=(0, 2), match='ac'>

## * 기호

In [8]:
# '*' allows the preceding character to repeat zero or more times.
r = re.compile('ab*c')
r.search('a')  # the required trailing 'c' is missing -> None, so nothing is displayed

In [9]:
# Zero 'b's satisfies 'b*'.
r.search('ac')

<re.Match object; span=(0, 2), match='ac'>

In [10]:
# One 'b' satisfies 'b*'.
r.search('abc')

<re.Match object; span=(0, 3), match='abc'>

In [11]:
# Several 'b's also satisfy 'b*'; the match covers the whole string.
r.search('abbbbc')

<re.Match object; span=(0, 6), match='abbbbc'>

## + 기호

In [12]:
# '+' requires the preceding character to appear one or more times.
r = re.compile('ab+c')
r.search('ac')  # there is no 'b' at all -> None, so nothing is displayed

In [13]:
# A single 'b' meets the "one or more" requirement.
r.search('abc')

<re.Match object; span=(0, 3), match='abc'>

In [14]:
# Multiple 'b's meet the "one or more" requirement as well.
r.search('abbbbc')

<re.Match object; span=(0, 6), match='abbbbc'>

## ^ 기호

In [15]:
# '^' anchors the pattern to the start of the string: 'ab' must be the first two characters.
r = re.compile('^ab')

# Neither string begins with 'ab', so both searches return None.
r.search('bbc')
r.search('zab')

In [16]:
# The string starts with 'ab', so the anchored pattern matches at span (0, 2).
r.search('abz')

<re.Match object; span=(0, 2), match='ab'>

## {숫자} 기호

In [17]:
# '{2}' requires exactly two repetitions of the preceding character: 'a', 'bb', 'c'.
r = re.compile('ab{2}c')

# Zero, one, and more than two 'b's respectively -> none of these match.
r.search('ac')
r.search('abc')
r.search('abbbbbbc')

In [18]:
# Exactly two 'b's between 'a' and 'c' -> match.
r.search('abbc')

<re.Match object; span=(0, 4), match='abbc'>

## {숫자1, 숫자2} 기호

In [20]:
# '{2,8}' accepts from two up to eight repetitions of the preceding character.
r = re.compile('ab{2,8}c')

# Too few 'b's (zero, one) or too many 'b's (last string) -> none of these match.
r.search('ac')
r.search('abc')
r.search('abbbbbbbbbbbc')

In [21]:
# Two 'b's is the lower bound of the allowed range -> match.
r.search('abbc')

<re.Match object; span=(0, 4), match='abbc'>

In [22]:
# A 'b' count inside the 2..8 range -> match over the whole string.
r.search('abbbbbbbc')

<re.Match object; span=(0, 9), match='abbbbbbbc'>

## {숫자,} 기호

In [23]:
# '{2,}' requires at least two repetitions of the preceding character, with no upper limit.
r = re.compile('a{2,}bc')

# 'bc' has no leading 'a's and 'aa' lacks the trailing 'bc' -> both return None.
r.search('bc')
r.search('aa')

In [24]:
# Two 'a's is the minimum accepted count -> match.
r.search('aabc')

<re.Match object; span=(0, 4), match='aabc'>

In [25]:
# Any number of 'a's from two upward is accepted -> match over the whole string.
r.search('aaaaaaabc')

<re.Match object; span=(0, 9), match='aaaaaaabc'>

## [ ] 기호

In [28]:
# A character class matches exactly one character out of the listed set.
r = re.compile('[abc]') # [abc] is equivalent to [a-c].
r.search('zzz') # No result is displayed (no 'a', 'b' or 'c' in the string).

In [29]:
# 'a' is a member of the class -> match.
r.search('a')

<re.Match object; span=(0, 1), match='a'>

In [30]:
# search() reports only the first match, and the class consumes one character: span (0, 1).
r.search('aaaaaaa')

<re.Match object; span=(0, 1), match='a'>

In [31]:
# The first character 'b' already belongs to the class, so it is the reported match.
r.search('baac')

<re.Match object; span=(0, 1), match='b'>

In [33]:
# '[a-z]' matches one lower-case ASCII letter.
r = re.compile('[a-z]')

# No result is displayed: upper-case letters and digits are outside the range.
r.search('AAA')
r.search('111')

In [34]:
# Only the leading 'a' is lower-case, so it is the match.
r.search('aBC')

<re.Match object; span=(0, 1), match='a'>

## [^문자] 기호

In [35]:
# A leading '^' inside brackets negates the class: match one character that is NOT 'a', 'b' or 'c'.
r = re.compile('[^abc]')

# Every character in these strings is excluded by the class -> all return None.
r.search('a')
r.search('ab')
r.search('b')

In [36]:
# 'd' is not one of the excluded characters -> match.
r.search('d')

<re.Match object; span=(0, 1), match='d'>

In [37]:
# Digits are not excluded either -> match.
r.search('1')

<re.Match object; span=(0, 1), match='1'>

# 정규 표현식 모듈 함수 예제

## re.match()와 re.search()의 차이

In [40]:
# Pattern: 'a', 'b', then any single character.
r = re.compile('ab.')
r.match('kkkabc')  # match() only tries at the beginning of the string, which starts with 'k' -> None

In [41]:
# search() scans the whole string and finds 'abc' starting at index 3.
r.search('kkkabc')

<re.Match object; span=(3, 6), match='abc'>

In [42]:
# Here the string itself begins with 'abc', so match() succeeds.
r.match('abckkk')

<re.Match object; span=(0, 3), match='abc'>

## re.split()

In [44]:
# Split on single space characters.
text = '사과 딸기 수박 메론 바나나'
re.split(' ', text)

['사과', '딸기', '수박', '메론', '바나나']

In [45]:
# Split on newline characters: each line of the multi-line string becomes one item.
text = '''사과
딸기
수박
메론
바나나'''

re.split('\n', text)

['사과', '딸기', '수박', '메론', '바나나']

In [46]:
# Split on the literal '+' character.
# '+' is a regex quantifier, so it must be escaped; the raw string keeps the
# backslash intact and avoids Python's invalid-escape-sequence warning.
text = "사과+딸기+수박+메론+바나나"

re.split(r'\+', text)

['사과', '딸기', '수박', '메론', '바나나']

## re.findall()

In [48]:
# Sample record mixing labels and numbers.
text = """이름 : 김철수
전화번호 : 010 - 1234 - 1234
나이 : 30
성별 : 남"""

# findall() returns every non-overlapping run of digits as a list of strings.
# The pattern is a raw string so that '\d' reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
re.findall(r'\d+', text)

['010', '1234', '1234', '30']

In [49]:
# With no digits in the input, findall() returns an empty list rather than None.
# Raw string avoids the invalid-escape-sequence warning for '\d'.
re.findall(r'\d+', '문자열입니다.')

[]

## re.sub()

In [51]:
# Sample sentence containing punctuation, digits and brackets.
text = "Regular expression : A regular expression, regex or regexp[1] (sometimes called a rational expression)[2][3] is, in theoretical computer science and formal language theory, a sequence of characters that define a search pattern."

# Replace each character that is not an ASCII letter with one space.
# The substitution is one-for-one, so the string length is unchanged.
preprocessed_text = re.sub(pattern='[^a-zA-Z]', repl=' ', string=text)
print(preprocessed_text)

Regular expression   A regular expression  regex or regexp     sometimes called a rational expression        is  in theoretical computer science and formal language theory  a sequence of characters that define a search pattern 


# 정규 표현식 텍스트 전처리 예제

In [52]:
# Three lines, each holding a number, a capitalized name and an upper-case code,
# separated by runs of spaces of varying width.
text = """100 John    PROF
101 James   STUD
102 Mac   STUD"""

In [53]:
# Split on any run of whitespace (spaces and newlines alike), flattening all fields into one list.
# Raw string avoids the invalid-escape-sequence warning for '\s'.
re.split(r'\s+', text)

['100', 'John', 'PROF', '101', 'James', 'STUD', '102', 'Mac', 'STUD']

In [54]:
# Extract every run of digits from the text.
# Raw string avoids the invalid-escape-sequence warning for '\d'.
re.findall(r'\d+', text)

['100', '101', '102']

In [55]:
# Collect every upper-case ASCII letter individually, including the initials of the names.
re.findall('[A-Z]', text)

['J', 'P', 'R', 'O', 'F', 'J', 'S', 'T', 'U', 'D', 'M', 'S', 'T', 'U', 'D']

In [56]:
# Require four consecutive upper-case letters, which filters out the single capitalized initials.
re.findall('[A-Z]{4}', text)

['PROF', 'STUD', 'STUD']

In [57]:
# One upper-case letter followed by one or more lower-case letters: the capitalized names.
re.findall('[A-Z][a-z]+', text)

['John', 'James', 'Mac']

# 정규 표현식을 이용한 토큰화

In [58]:
from nltk.tokenize import RegexpTokenizer

text = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"

# tokenizer1: the pattern describes the tokens themselves (runs of word
# characters), so apostrophes and other punctuation are dropped and split words.
# tokenizer2: with gaps=True the pattern describes the separators (whitespace),
# so punctuation stays attached to the neighbouring word.
# Both patterns are raw strings so '\w' and '\s' do not trigger Python's
# invalid-escape-sequence warning.
tokenizer1 = RegexpTokenizer(r"[\w]+")
tokenizer2 = RegexpTokenizer(r'\s+', gaps=True)

print(tokenizer1.tokenize(text))
print(tokenizer2.tokenize(text))

['Don', 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'Mr', 'Jone', 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
["Don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name,', 'Mr.', "Jone's", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
