# 8. Strings and Regular Expressions

## 8.1. A string is a sequence

In [1]:
fruit = 'banana'
letter = fruit[1]

In [2]:
letter

'a'

In [3]:
fruit[0]

'b'

In [4]:
i = 1
fruit[i]

'a'

In [5]:
fruit[i + 1]

'n'

In [6]:
fruit[1.5]

TypeError: string indices must be integers, not 'float'

In [7]:
n = len(fruit)
n

6

In [8]:
fruit[n]

IndexError: string index out of range

In [9]:
fruit[n - 1]

'a'

In [10]:
fruit[-1]

'a'

## 8.2. String slices

In [11]:
fruit = 'banana'
fruit[0:3]

'ban'

In [12]:
fruit[:3]

'ban'

In [13]:
fruit[3:]

'ana'

In [14]:
fruit[3:3]

''

## 8.3. Strings are immutable

In [15]:
greeting = 'Hello, world!'
greeting[0] = 'J'

TypeError: 'str' object does not support item assignment

In [16]:
new_greeting = 'J' + greeting[1:]
new_greeting

'Jello, world!'

In [17]:
greeting

'Hello, world!'

## 8.4. String comparison

In [18]:
word = 'banana'

if word == 'banana':
    print('All right, banana.')

All right, banana.


In [19]:
def compare_word(word):
    """Print where `word` sorts relative to 'banana'.

    Uses Python's string comparison, in which every uppercase letter
    sorts before every lowercase letter.

    word: string to compare
    """
    reference = 'banana'
    if word == reference:
        print('All right, banana.')
        return
    if word < reference:
        print(word, 'comes before banana.')
        return
    print(word, 'comes after banana.')

In [20]:
compare_word('apple')

apple comes before banana.


In [21]:
compare_word('Pineapple')

Pineapple comes before banana.


## 8.5. String methods

In [22]:
word = 'banana'
new_word = word.upper()
new_word

'BANANA'

## 8.6. Writing files

In [23]:
reader = open('pg345.txt', encoding='utf-8')

In [24]:
def is_special_line(line):
    """Return True if `line` begins with three asterisks and a space.

    line: string to check
    """
    marker = '*** '
    return line[:len(marker)] == marker

In [25]:
for line in reader:
    if is_special_line(line):
        print(line.strip())

*** START OF THE PROJECT GUTENBERG EBOOK DRACULA ***
*** END OF THE PROJECT GUTENBERG EBOOK DRACULA ***


In [26]:
reader = open('pg345.txt', encoding='utf-8')
writer = open('pg345_cleaned.txt', 'w', encoding='utf-8')

In [27]:
for line in reader:
    if is_special_line(line):
        break

In [28]:
line

'*** START OF THE PROJECT GUTENBERG EBOOK DRACULA ***\n'

In [29]:
for line in reader:
    if is_special_line(line):
        break
    writer.write(line)

In [30]:
line

'*** END OF THE PROJECT GUTENBERG EBOOK DRACULA ***\n'

In [31]:
reader.close()
writer.close()

In [32]:
# Print the non-blank lines at the top of the cleaned file, stopping after
# the line that ends with 'Stoker'. The `with` block closes the file even
# though the loop exits early.
with open('pg345_cleaned.txt', encoding='utf-8') as cleaned_file:
    for line in cleaned_file:
        line = line.strip()
        if len(line) > 0:
            print(line)
        if line.endswith('Stoker'):
            break

DRACULA
_by_
Bram Stoker


## 8.7. Find and replace

In [33]:
# Count every line in the cleaned file; `with` closes the file afterwards.
total = 0
with open('pg345_cleaned.txt', encoding='utf-8') as cleaned_file:
    for line in cleaned_file:
        total += 1

total

15475

In [34]:
# Count the lines that contain 'Jonathan' at least once.
total = 0
with open('pg345_cleaned.txt', encoding='utf-8') as cleaned_file:
    for line in cleaned_file:
        if 'Jonathan' in line:
            total += 1

total

199

In [35]:
# Count every occurrence of 'Jonathan', including repeats within one line.
total = 0
with open('pg345_cleaned.txt', encoding='utf-8') as cleaned_file:
    for line in cleaned_file:
        total += line.count('Jonathan')

total

200

In [36]:
# Copy the cleaned file to a new file with 'Jonathan' replaced by 'Thomas'.
# Both files are closed when the blocks exit, which guarantees the output
# is flushed to disk.
with open('pg345_replaced.txt', 'w', encoding='utf-8') as writer:
    with open('pg345_cleaned.txt', encoding='utf-8') as cleaned_file:
        for line in cleaned_file:
            line = line.replace('Jonathan', 'Thomas')
            writer.write(line)

## 8.8. Regular expressions

In [37]:
text = "I am Dracula; and I bid you welcome, Mr. Harker, to my house."

In [38]:
pattern = 'Dracula'

In [39]:
import re

result = re.search(pattern, text)
result

<re.Match object; span=(5, 12), match='Dracula'>

In [40]:
result.string

'I am Dracula; and I bid you welcome, Mr. Harker, to my house.'

In [41]:
result.group()

'Dracula'

In [42]:
result.span()

(5, 12)

In [43]:
result = re.search('Count', text)
print(result)

None


In [44]:
result == None

True

In [45]:
def find_first(pattern, filename='pg345_cleaned.txt'):
    """Search a text file line by line and return the first match.

    pattern: regular expression passed to `re.search`
    filename: path of the UTF-8 text file to scan; defaults to
        'pg345_cleaned.txt' so one-argument calls behave as before

    returns: the `re.Match` object for the first line that matches (its
        `.string` attribute is that line), or None if no line matches
    """
    # `with` closes the file even though we return from inside the loop.
    with open(filename, encoding='utf-8') as reader:
        for line in reader:
            result = re.search(pattern, line)
            if result is not None:
                return result
    return None

In [46]:
result = find_first('Harker')
result.string

'CHAPTER I. Jonathan Harker’s Journal\n'

In [47]:
pattern = 'Mina|Murray'
result = find_first(pattern)
result.string

'CHAPTER V. Letters—Lucy and Mina\n'

In [48]:
def count_matches(pattern, filename='pg345_cleaned.txt'):
    """Count the lines of a text file that contain a match for `pattern`.

    A line with several matches is counted once.

    pattern: regular expression passed to `re.search`
    filename: path of the UTF-8 text file to scan; defaults to
        'pg345_cleaned.txt' so one-argument calls behave as before

    returns: number of matching lines
    """
    count = 0
    with open(filename, encoding='utf-8') as reader:
        for line in reader:
            if re.search(pattern, line) is not None:
                count += 1
    return count

In [49]:
count_matches('Mina|Murray')

229

In [50]:
result = find_first('^Dracula')
result.string

'Dracula, jumping to his feet, said:--\n'

In [51]:
result = find_first('Harker$')
result.string

'by five o’clock, we must start off; for it won’t do to leave Mrs. Harker\n'

## 8.9. String substitution

In [52]:
pattern = 'cent(er|re)'

In [53]:
result = find_first(pattern)
result.string

'horseshoe of the Carpathians, as if it were the centre of some sort of\n'

In [54]:
pattern = 'colou?r'

In [55]:
result = find_first(pattern)
line = result.string
line

'undergarment with long double apron, front, and back, of coloured stuff\n'

In [56]:
re.sub(pattern, 'color', line)

'undergarment with long double apron, front, and back, of colored stuff\n'

## 8.10. Debugging

- 윈도우 환경으로 shell commands 사용이 어려움.

> ```shell  
> !head pg345_cleaned.txt  
> !tail pg345_cleaned.txt  
> !head pg345_cleaned.txt > pg345_cleaned_10_lines.txt  
> !head -100 pg345_cleaned.txt > pg345_cleaned_100_lines.txt  
> ```  

## 8.11. Glossary

- sequence
- character
- index
- slice
- empty string
- object
- immutable
- invocation
- regular expression
- pattern
- string substitution
- shell command

## 8.12. Exercises

### 8.12.1. Ask a virtual assistant

- ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãùÏóêÏÑú Í∞ÄÏû• ÎßéÏù¥ Ïì∞Ïù¥Îäî ÌäπÏàò Î¨∏ÏûêÍ∞Ä Î≠êÏïº?
    > ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãùÏóêÏÑú Í∞ÄÏû• ÎßéÏù¥ Ïì∞Ïù¥Îäî **ÌäπÏàò Î¨∏Ïûê(Î©îÌÉÄ Î¨∏Ïûê)**Îäî Îã§ÏùåÍ≥º Í∞ôÏïÑÏöî. ÏûêÏ£º Ïì∞Ïù¥Îäî ÏàúÏÑúÎåÄÎ°ú Ï†ïÎ¶¨Ìï¥ Î≥ºÍ≤åÏöî:
    > 
    > ### üîü ÏûêÏ£º Ïì∞Ïù¥Îäî Ï†ïÍ∑úÌëúÌòÑÏãù ÌäπÏàò Î¨∏Ïûê TOP 10
    > 
    > | ÏàúÏúÑ  | Î©îÌÉÄ Î¨∏Ïûê | ÏùòÎØ∏                                             |                     |       |
    > | --- | ----- | ---------------------------------------------- | ------------------- | ----- |
    > | 1Ô∏è‚É£ | `.`   | **ÏûÑÏùòÏùò Ìïú Î¨∏Ïûê** (Ï§ÑÎ∞îÍøà Ï†úÏô∏)                          |                     |       |
    > | 2Ô∏è‚É£ | `^`   | **Î¨∏ÏûêÏó¥Ïùò ÏãúÏûë**                                    |                     |       |
    > | 3Ô∏è‚É£ | `$`   | **Î¨∏ÏûêÏó¥Ïùò ÎÅù**                                     |                     |       |
    > | 4Ô∏è‚É£ | `*`   | **0Ìöå Ïù¥ÏÉÅ Î∞òÎ≥µ**                                   |                     |       |
    > | 5Ô∏è‚É£ | `+`   | **1Ìöå Ïù¥ÏÉÅ Î∞òÎ≥µ**                                   |                     |       |
    > | 6Ô∏è‚É£ | `?`   | **0Ìöå ÎòêÎäî 1Ìöå** (ÏûàÍ±∞ÎÇò ÏóÜÍ±∞ÎÇò)                         |                     |       |
    > | 7Ô∏è‚É£ | `[]`  | **Î¨∏Ïûê ÏßëÌï©** (ex: `[abc]`Îäî a, b, c Ï§ë ÌïòÎÇò)          |                     |       |
    > | 8️⃣ | `\|`  | **OR 연산** (ex: `cat\|dog`)                       |                     |       |
    > | 9Ô∏è‚É£ | `()`  | **Í∑∏Î£πÌôî** ÎòêÎäî **Ï∫°Ï≤ò**                              |                     |       |
    > | üîü  | `\`   | **Ïù¥Ïä§ÏºÄÏù¥ÌîÑ Î¨∏Ïûê** ÎòêÎäî **ÌäπÏàò Î¨∏Ïûê Ï∂ïÏïΩÌëúÌòÑ** (Ïòà: `\d`, `\w`) |                     |       |
    > 
    > ### ‚úÖ ÏûêÏ£º Ïì∞Ïù¥Îäî Ï∂ïÏïΩ ÌëúÌòÑ (`\` Í¥ÄÎ†®)
    > 
    > | ÌëúÌòÑ   | ÏùòÎØ∏                     |
    > | ---- | ---------------------- |
    > | `\d` | Ïà´Ïûê (0-9)               |
    > | `\D` | Ïà´ÏûêÍ∞Ä ÏïÑÎãå Î¨∏Ïûê              |
    > | `\w` | Îã®Ïñ¥ Î¨∏Ïûê (ÏïåÌååÎ≤≥ + Ïà´Ïûê + `_`) |
    > | `\W` | Îã®Ïñ¥ Î¨∏ÏûêÍ∞Ä ÏïÑÎãå Í≤É            |
    > | `\s` | Í≥µÎ∞± Î¨∏Ïûê (Ïä§ÌéòÏù¥Ïä§, ÌÉ≠, Ï§ÑÎ∞îÍøà Îì±) |
    > | `\S` | Í≥µÎ∞±Ïù¥ ÏïÑÎãå Î¨∏Ïûê              |
    > | `\b` | Îã®Ïñ¥ Í≤ΩÍ≥Ñ                  |
    > | `\B` | Îã®Ïñ¥ ÎπÑÍ≤ΩÍ≥Ñ                 |
    > 
    > ### üéØ ÏòàÏãú
    > 
    > ```python
    > import re
    > 
    > text = "Hello 123 World"
    > 
    > # Ïà´Ïûê Ï∞æÍ∏∞
    > re.findall(r'\d+', text)  # ['123']
    > 
    > # Îã®Ïñ¥ Ï∞æÍ∏∞
    > re.findall(r'\w+', text)  # ['Hello', '123', 'World']
    > ```

- Ïà´Ïûê 10Í∞úÏôÄ ÌïòÏù¥ÌîàÏúºÎ°ú Ïù¥Î£®Ïñ¥ÏßÑ Ï†ÑÌôîÎ≤àÌò∏Î•º Ï∞æÎäî ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãù ÎßåÎì§Ïñ¥Ï§ò.
    > Î¨ºÎ°†Ïù¥Ï£†! Ï†ÑÌôîÎ≤àÌò∏Í∞Ä **Ïà´Ïûê 10Í∞ú + ÌïòÏù¥Ìîà(-)** ÌòïÏãùÏù¥ÎùºÎ©¥, ÏùºÎ∞òÏ†ÅÏù∏ Ìå®ÌÑ¥ÏùÄ Î≥¥ÌÜµ Îã§ÏùåÍ≥º Í∞ôÏïÑÏöî:
    > 
    > ### üìû Ï†ÑÌôîÎ≤àÌò∏ ÌòïÏãù ÏòàÏãú
    > 
    > * `010-1234-5678`
    > * `011-987-1234`
    > 
    > Ïù¥Îü∞ ÌòïÏãùÏùÑ Ï∞æÎäî **Ï†ïÍ∑ú ÌëúÌòÑÏãù**ÏùÄ ÏïÑÎûòÏôÄ Í∞ôÏäµÎãàÎã§:
    > 
    > ### ‚úÖ Ï†ïÍ∑úÌëúÌòÑÏãù Ìå®ÌÑ¥
    > 
    > ```python
    > r'\b\d{2,3}-\d{3,4}-\d{4}\b'
    > ```
    > 
    > ### üîç ÏÑ§Î™Ö
    > 
    > | Ìå®ÌÑ¥        | ÏùòÎØ∏                        |
    > | --------- | ------------------------- |
    > | `\b`      | Îã®Ïñ¥ Í≤ΩÍ≥Ñ (ÌïÑÏöîÏãú)               |
    > | `\d{2,3}` | Ïïû Î≤àÌò∏: 2~3ÏûêÎ¶¨ (ex. 010, 02) |
    > | `-`       | ÌïòÏù¥Ìîà                       |
    > | `\d{3,4}` | Ï§ëÍ∞Ñ Î≤àÌò∏: 3~4ÏûêÎ¶¨              |
    > | `-`       | ÌïòÏù¥Ìîà                       |
    > | `\d{4}`   | ÎßàÏßÄÎßâ Î≤àÌò∏: 4ÏûêÎ¶¨               |
    > | `\b`      | Îã®Ïñ¥ Í≤ΩÍ≥Ñ (ÏÑ†ÌÉùÏ†Å)               |
    > 
    > ### üß™ ÏòàÏ†ú ÏΩîÎìú
    > 
    > ```python
    > import re
    > 
    > text = """
    > ÌôçÍ∏∏Îèô: 010-1234-5678
    > ÍπÄÏ≤†Ïàò: 011-987-1234
    > Ïò§Î•ò: 01012345678
    > """
    > 
    > pattern = r'\b\d{2,3}-\d{3,4}-\d{4}\b'
    > matches = re.findall(pattern, text)
    > 
    > print(matches)
    > ```
    > 
    > **Ï∂úÎ†•:**
    > 
    > ```
    > ['010-1234-5678', '011-987-1234']
    > ```
    > 
    > ### üìå Ï∞∏Í≥†
    > 
    > * ÌïòÏù¥Ìîà ÏóÜÏù¥ Î∂ôÏùÄ Ïà´Ïûê (`01012345678`)Îäî ÏúÑ Ï†ïÍ∑úÏãùÏóêÏÑúÎäî **Î¨¥Ïãú**Îê©ÎãàÎã§.
    > * ÎßåÏïΩ **ÌïòÏù¥ÌîàÏù¥ ÏûàÎäî ÌòïÏãùÎßå Ï∞æÍ≥†Ïûê ÌïúÎã§Î©¥**, Ïù¥ Ìå®ÌÑ¥Ïù¥ Ï†ÅÌï©Ìï©ÎãàÎã§.
    > * ÎßåÏïΩ Î™®Îì† Ï†ÑÌôîÎ≤àÌò∏ Ìè¨Îß∑(ÌïòÏù¥Ìîà Ïú†Î¨¥ Ìè¨Ìï®)ÏùÑ Ìè¨Í¥ÑÌïòÎ†§Î©¥ Î≥ÑÎèÑÎ°ú ÏàòÏ†ïÌï¥ÎìúÎ¶¥ Ïàò ÏûàÏñ¥Ïöî.

- Ïà´ÏûêÏôÄ ST ÌòπÏùÄ AVEÎ°ú ÎÅùÎÇòÎäî ÎèÑÎ°úÎ™ÖÏúºÎ°ú ÌëúÌòÑÎêòÎäî Ï£ºÏÜåÎ•º Ï∞æÎäî ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãù ÎßåÎì§Ïñ¥Ï§ò.
    > Ï¢ãÏïÑÏöî! ÏöîÏ≤≠ÌïòÏã† Ï°∞Í±¥ÏùÑ Ï†ïÎ¶¨ÌïòÎ©¥ Îã§ÏùåÍ≥º Í∞ôÏäµÎãàÎã§:
    > 
    > ### üèôÔ∏è Ï°∞Í±¥
    > 
    > * **Ïà´Ïûê**Î°ú ÏãúÏûë
    > * **ÎèÑÎ°úÎ™Ö** (ÏòÅÏñ¥ Îã®Ïñ¥)
    > * **"ST" ÎòêÎäî "AVE"** Î°ú ÎÅùÎÇ®
    > * Ïòà:
    >   * `123 Main ST`
    >   * `456 Elm AVE`
    > 
    > ## ‚úÖ Ï†ïÍ∑ú ÌëúÌòÑÏãù
    > 
    > ```python
    > r'\b\d+\s+\w+\s+(ST|AVE)\b'
    > ```
    > 
    > ### üîç ÏÑ§Î™Ö
    > 
    > | Ìå®ÌÑ¥    | ÏùòÎØ∏                          |               |
    > | ----- | --------------------------- | ------------- |
    > | `\b`  | Îã®Ïñ¥ Í≤ΩÍ≥Ñ (ÏÑ†ÌÉùÏ†ÅÏúºÎ°ú Ï†ïÌôïÌïú Îß§Ïπ≠ÏùÑ ÏúÑÌï¥ ÏÇ¨Ïö©) |               |
    > | `\d+` | Ïà´Ïûê 1Í∞ú Ïù¥ÏÉÅ                    |               |
    > | `\s+` | Í≥µÎ∞± 1Í∞ú Ïù¥ÏÉÅ                    |               |
    > | `\w+` | ÎèÑÎ°úÎ™Ö (ÏïåÌååÎ≤≥, Ïà´Ïûê, Ïñ∏ÎçîÏä§ÏΩîÏñ¥ Ìè¨Ìï®Îêú Îã®Ïñ¥) |               |
    > | `\s+` | Í≥µÎ∞±                          |               |
    > | `(ST\|AVE)` | `ST` 또는 `AVE`             |               |
    > | `\b`  | Îã®Ïñ¥ Í≤ΩÍ≥Ñ                       |               |
    > 
    > ### üß™ ÏòàÏ†ú ÏΩîÎìú
    > 
    > ```python
    > import re
    > 
    > text = """
    > 123 Main ST
    > 456 Elm AVE
    > 789 Pine Rd
    > 321 Oak BLVD
    > """
    > 
    > pattern = r'\b\d+\s+\w+\s+(ST|AVE)\b'
    > matches = re.findall(pattern, text)
    > print("Matches:", matches)
    > 
    > # Ï†ÑÏ≤¥ Ï£ºÏÜå Ï∂îÏ∂úÌïòÎ†§Î©¥ re.findall ÎåÄÏã† re.finditer ÏÇ¨Ïö©:
    > full_matches = [m.group() for m in re.finditer(pattern, text)]
    > print("Full addresses:", full_matches)
    > ```
    > 
    > **Ï∂úÎ†•:**
    > 
    > ```
    > Matches: ['ST', 'AVE']
    > Full addresses: ['123 Main ST', '456 Elm AVE']
    > ```
    > 
    > ### üõ†Ô∏è Ï∞∏Í≥†
    > 
    > * ÎåÄÏÜåÎ¨∏ÏûêÎ•º Íµ¨Î∂ÑÌïòÏßÄ ÏïäÍ≤å ÌïòÎ†§Î©¥ `re.IGNORECASE` ÏòµÏÖòÏùÑ Ï∂îÍ∞ÄÌïòÏÑ∏Ïöî:
    > 
    > ```python
    > re.findall(pattern, text, re.IGNORECASE)
    > ```
    > 
    > * ÎèÑÎ°úÎ™ÖÏù¥ Îëê Îã®Ïñ¥ Ïù¥ÏÉÅÏùº Í≤ΩÏö∞(`123 N Main ST`)ÍπåÏßÄ Ìè¨Í¥ÑÌïòÎ†§Î©¥ Ï†ïÍ∑úÏãùÏùÑ Ï°∞Í∏à ÌôïÏû•Ìï¥Ïïº Ìï©ÎãàÎã§. ÏõêÌïòÏãúÎ©¥ Í∑∏ Î≤ÑÏ†ÑÎèÑ ÏïåÎ†§ÎìúÎ¶¥Í≤åÏöî.

- Mr ÎòêÎäî MrsÏôÄ Í∞ôÏùÄ ÏùºÎ∞òÏ†ÅÏù∏ Ìò∏Ïπ≠ÏúºÎ°ú ÏãúÏûëÌïòÍ≥†, ÎåÄÎ¨∏ÏûêÎ°ú ÏãúÏûëÌïòÎäî Ïù¥Î¶ÑÎì§Ïù¥ Ïù¥Ïñ¥ÏßÄÎ©∞, ÏùºÎ∂Ä Ïù¥Î¶ÑÏóêÎäî ÌïòÏù¥ÌîàÏù¥ ÏûàÏùÑ ÏàòÎèÑ ÏûàÎäî Ï†ÑÏ≤¥ Ïù¥Î¶ÑÏùÑ Îß§Ïπ≠ÌïòÎäî ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãù ÎßåÎì§Ïñ¥Ï§ò.
    > ### ‚úÖ ÌååÏù¥Ïç¨ Ï†ïÍ∑ú ÌëúÌòÑÏãù ÏòàÏãú:
    > 
    > ```python
    > import re
    > 
    > pattern = r'\b(Mr|Mrs|Ms|Dr|Prof)\.?\s+([A-Z][a-z]+(-[A-Z][a-z]+)?\s+)*[A-Z][a-z]+(-[A-Z][a-z]+)?\b'
    > ```
    > 
    > ### üîç ÏÑ§Î™Ö
    > 
    > | Ìå®ÌÑ¥                                 | ÏùòÎØ∏                                   |    |    |        |                     |
    > | ---------------------------------- | ------------------------------------ | -- | -- | ------ | ------------------- |
    > | `\b`                               | Îã®Ïñ¥ Í≤ΩÍ≥Ñ                                |    |    |        |                     |
    > | `(Mr\|Mrs\|Ms\|Dr\|Prof)`         | 일반적인 호칭 (필요시 추가 가능)             |    |    |        |                     |
    > | `\.?`                              | Ï†ê(`.`)ÏùÄ ÏÑ†ÌÉùÏ†Å                          |    |    |        |                     |
    > | `\s+`                              | Í≥µÎ∞±                                   |    |    |        |                     |
    > | `([A-Z][a-z]+(-[A-Z][a-z]+)?\s+)*` | ÎåÄÎ¨∏ÏûêÎ°ú ÏãúÏûëÌïòÎäî Ïù¥Î¶Ñ + ÏÑ†ÌÉùÏ†Å ÌïòÏù¥Ìîà Ïù¥Î¶Ñ (0Ìöå Ïù¥ÏÉÅ Î∞òÎ≥µ) |    |    |        |                     |
    > | `[A-Z][a-z]+(-[A-Z][a-z]+)?`       | ÎßàÏßÄÎßâ Ïù¥Î¶Ñ (ÎòêÎäî ÏÑ±)                        |    |    |        |                     |
    > | `\b`                               | Îã®Ïñ¥ Í≤ΩÍ≥Ñ                                |    |    |        |                     |
    > 
    > ### üß™ ÏòàÏ†ú ÏΩîÎìú
    > 
    > ```python
    > import re
    > 
    > text = """
    > Mr. John Smith
    > Mrs Jane Doe-Smith
    > Dr Albert Einstein
    > Prof. Emily-Marie Clark
    > Ms Anna
    > """
    > 
    > pattern = r'\b(Mr|Mrs|Ms|Dr|Prof)\.?\s+([A-Z][a-z]+(-[A-Z][a-z]+)?\s+)*[A-Z][a-z]+(-[A-Z][a-z]+)?\b'
    > matches = re.findall(pattern, text)
    > 
    > # Ï†ÑÏ≤¥ Îß§Ïπ≠Îêú Ïù¥Î¶Ñ Ï∂îÏ∂ú
    > full_matches = [m[0] for m in re.finditer(pattern, text)]
    > print("Full names:", full_matches)
    > ```
    > 
    > ### ‚úÖ Ï∂úÎ†• Í≤∞Í≥º:
    > 
    > ```
    > Full names: ['Mr. John Smith', 'Mrs Jane Doe-Smith', 'Dr Albert Einstein', 'Prof. Emily-Marie Clark', 'Ms Anna']
    > ```
    > 
    > ### üìå Ï∞∏Í≥†
    > 
    > * `Mr`, `Mrs`, `Ms`, `Dr`, `Prof` Îì± **ÌïÑÏöîÌïú Ìò∏Ïπ≠ÏùÑ ÏûêÏú†Î°≠Í≤å Ï∂îÍ∞Ä**Ìï† Ïàò ÏûàÏñ¥Ïöî.
    > * Ïù¥Î¶Ñ ÏÇ¨Ïù¥Ïóê Ïó¨Îü¨ Í∞úÏùò Í≥µÎ∞±Ïù¥ ÏûàÍ±∞ÎÇò, Î≥µÏû°Ìïú ÏÑ±ÏùÑ Í∞ÄÏßÑ Í≤ΩÏö∞Îäî **Ï∂îÍ∞ÄÏ†ÅÏù∏ Ï°∞Ï†ï**Ïù¥ ÌïÑÏöîÌï† Ïàò ÏûàÏñ¥Ïöî.
    > * `re.IGNORECASE` ÏòµÏÖòÏùÄ Î≥¥ÌÜµ ÌïÑÏöî ÏóÜÏßÄÎßå, ÌïÑÏöîÌïòÎ©¥ Ìï®Íªò ÏÇ¨Ïö© Í∞ÄÎä•Ìï©ÎãàÎã§.

- ÌååÏù¥Ïç¨ÏóêÏÑú raw stringÏù¥ Î≠êÏïº?
    > Ï¢ãÏùÄ ÏßàÎ¨∏Ïù¥ÏóêÏöî!
    > 
    > ### üìå ÌååÏù¥Ïç¨ÏóêÏÑú **raw string**Ïù¥ÎûÄ?
    > 
    > Î¨∏ÏûêÏó¥ ÏïûÏóê Ï†ëÎëêÏÇ¨Î°ú `r` ÎòêÎäî `R`ÏùÑ Î∂ôÏó¨ **"ÏûàÎäî Í∑∏ÎåÄÎ°ú" Ìï¥ÏÑù**ÎêòÎèÑÎ°ù ÎßåÎì† Î¨∏ÏûêÏó¥ÏûÖÎãàÎã§.
    > Ï¶â, Î∞±Ïä¨ÎûòÏãú(`\`)Î•º **Ïù¥Ïä§ÏºÄÏù¥ÌîÑ Î¨∏ÏûêÎ°ú Ìï¥ÏÑùÌïòÏßÄ ÏïäÍ≥†**, **Í∑∏ ÏûêÏ≤¥Î°ú** ÏÇ¨Ïö©ÌïòÎäî Î¨∏ÏûêÏó¥Ïù¥ÏóêÏöî.
    > 
    > ### ‚úÖ ÏùºÎ∞ò Î¨∏ÏûêÏó¥ vs Raw Î¨∏ÏûêÏó¥ ÎπÑÍµê
    > 
    > ```python
    > # ÏùºÎ∞ò Î¨∏ÏûêÏó¥ (Ïù¥Ïä§ÏºÄÏù¥ÌîÑ Ï≤òÎ¶¨Îê®)
    > s1 = "Hello\nWorld"
    > print(s1)
    > ```
    > 
    > Ï∂úÎ†•:
    > 
    > ```
    > Hello
    > World
    > ```
    > 
    > ```python
    > # Raw Î¨∏ÏûêÏó¥ (Î∞±Ïä¨ÎûòÏãúÎ•º Î¨∏Ïûê Í∑∏ÎåÄÎ°ú Ï∂úÎ†•)
    > s2 = r"Hello\nWorld"
    > print(s2)
    > ```
    > 
    > Ï∂úÎ†•:
    > 
    > ```
    > Hello\nWorld
    > ```
    > 
    > ### üí° Ïôú ÌïÑÏöîÌï†Íπå?
    > 
    > #### Ï£ºÎ°ú **Ï†ïÍ∑ú ÌëúÌòÑÏãù**ÏóêÏÑú ÎßéÏù¥ ÏÇ¨Ïö©ÎèºÏöî!
    > 
    > Ï†ïÍ∑úÌëúÌòÑÏãùÏóêÎäî Î∞±Ïä¨ÎûòÏãúÍ∞Ä ÏûêÏ£º ÎÇòÏò§ÎäîÎç∞,
    > ÏùºÎ∞ò Î¨∏ÏûêÏó¥Î°ú Ïì∞Î©¥ `\\`Ï≤òÎüº Îëê Î≤à Ïç®Ïïº Ìï¥ÏÑú Î≥µÏû°Ìï¥Ï†∏Ïöî.
    > 
    > ```python
    > import re
    > 
    > # ÏùºÎ∞ò Î¨∏ÏûêÏó¥ (Î∞±Ïä¨ÎûòÏãú 2Í∞ú)
    > pattern1 = "\\d+"
    > 
    > # raw Î¨∏ÏûêÏó¥ (Í∑∏ÎÉ• Î∞±Ïä¨ÎûòÏãú Ìïú Í∞ú)
    > pattern2 = r"\d+"
    > 
    > re.findall(pattern2, "My number is 1234")  # ['1234']
    > ```
    > 
    > ### üéØ Ï†ïÎ¶¨ÌïòÎ©¥
    > 
    > | Íµ¨Î∂Ñ       | ÏùòÎØ∏                            |
    > | -------- | ----------------------------- |
    > | `"..."`  | ÏùºÎ∞ò Î¨∏ÏûêÏó¥ (Ïù¥Ïä§ÏºÄÏù¥ÌîÑ Î¨∏Ïûê Ìï¥ÏÑù)          |
    > | `r"..."` | raw Î¨∏ÏûêÏó¥ (Ïù¥Ïä§ÏºÄÏù¥ÌîÑ Î¨∏Ïûê Î¨¥Ïãú, Í∑∏ÎåÄÎ°ú ÏÇ¨Ïö©) |

### 8.12.2. Exercise

In [57]:
def head(read_file, num_lines, write_file=None):
    """Copy the first `num_lines` lines of a text file to a file or stdout.

    read_file: path of the UTF-8 text file to read
    num_lines: maximum number of lines to copy; if the file is shorter,
        all of its lines are copied
    write_file: path of the UTF-8 file to write; if omitted (or otherwise
        falsy), the lines are printed instead
    """
    with open(read_file, encoding='utf-8') as reader:
        # zip stops as soon as either the counter or the file runs out, so
        # a short file ends the loop instead of yielding empty strings.
        first_lines = (line for _, line in zip(range(num_lines), reader))
        if write_file:
            with open(write_file, 'w', encoding='utf-8') as writer:
                writer.writelines(first_lines)
        else:
            for line in first_lines:
                # Lines keep their own newline, so suppress print's.
                print(line, end='')

In [58]:
head('pg345_cleaned.txt', 10)





                                DRACULA

                                  _by_

                              Bram Stoker



In [59]:
head('pg345_cleaned.txt', 100, 'pg345_cleaned_100_lines.txt')

### 8.12.3. Exercise

In [60]:
def uses_any(word, letters):
    """Return True if `word` contains at least one of `letters`.

    The comparison ignores case.

    word: string to examine
    letters: string of candidate letters
    """
    return any(ch in letters.lower() for ch in word.lower())

In [61]:
def check_word(word):
    """Return True if `word` is consistent with the clues encoded below.

    The word must contain 'e', but not at index 2 or index 4, and must not
    use any of the letters in 'spadclrk'. The check ignores case.

    Words with fewer than five letters return False rather than raising
    IndexError on the positional checks.

    word: candidate word
    """
    word = word.lower()
    if len(word) < 5:
        return False
    if 'e' not in word:
        return False
    if word[2] == 'e' or word[4] == 'e':
        return False
    if uses_any(word, 'spadclrk'):
        return False

    return True

In [62]:
# Print every five-letter word that passes check_word; `with` closes the
# word list once the loop finishes.
with open('words.txt') as word_file:
    for line in word_file:
        word = line.strip()
        if len(word) == 5 and check_word(word):
            print(word)

befit
befog
beget
begin
begot
begum
begun
beigy
being
bemix
benni
benny
beton
bewig
bogey
boney
buteo
ebbet
ebony
eight
embow
emmet
enfin
enjoy
ennui
envoi
envoy
enzym
eying
feign
feint
fenny
feoff
fogey
fumet
gemmy
gemot
genet
genii
genom
given
gooey
hefty
heigh
hemin
heugh
homey
honey
hooey
hymen
jemmy
jenny
jeton
jetty
meiny
mezzo
mizen
money
motet
motey
neigh
netty
nomen
numen
often
quiet
tenet
tenon
tenth
tenty
teugh
thief
totem
unmet
unmew
veiny
venin
venom
vimen
vixen
webby
weigh
wenny
winey
witen
wizen
women
woven
xenon
zibet


### 8.12.4. Exercise

In [63]:
def check_word2(word):
    """Return True if `word` is consistent with the clues encoded below.

    The word must contain 'e', but not at index 2, 3 or 4; the letter at
    index 4 must be 'm'; and the word must not use any of the letters in
    'spadclrk'. The check ignores case.

    Words with fewer than five letters return False rather than raising
    IndexError on the positional checks.

    word: candidate word
    """
    word = word.lower()
    if len(word) < 5:
        return False
    if 'e' not in word:
        return False
    if word[2] == 'e' or word[3] == 'e' or word[4] == 'e':
        return False
    if word[4] != 'm':
        return False
    if uses_any(word, 'spadclrk'):
        return False

    return True

In [64]:
# Print every five-letter word that passes check_word2; `with` closes the
# word list once the loop finishes.
with open('words.txt') as word_file:
    for line in word_file:
        word = line.strip()
        if len(word) == 5 and check_word2(word):
            print(word)

begum
enzym
genom
venom


### 8.12.5. Exercise

In [65]:
def clean_file(input_file, output_file):
    """Copy the lines between the first two special lines of a file.

    A special line is one for which `is_special_line` returns True. The
    special lines themselves are not copied. Both files are closed on
    exit, even if an error interrupts the copy.

    input_file: path of the UTF-8 text file to read
    output_file: path of the UTF-8 text file to write (overwritten)
    """
    with open(input_file, encoding='utf-8') as reader:
        with open(output_file, 'w', encoding='utf-8') as writer:
            # Skip everything up to and including the first special line.
            for line in reader:
                if is_special_line(line):
                    break

            # The second loop resumes where the first stopped, because both
            # iterate over the same file object.
            for line in reader:
                if is_special_line(line):
                    break
                writer.write(line)

clean_file('pg1184.txt', 'pg1184_cleaned.txt')

In [66]:
def count_matches(pattern, filename='pg1184_cleaned.txt'):
    """Count the lines of a text file that contain a match for `pattern`.

    A line with several matches is counted once.

    pattern: regular expression passed to `re.search`
    filename: path of the UTF-8 text file to scan; defaults to
        'pg1184_cleaned.txt' so one-argument calls behave as before

    returns: number of matching lines
    """
    count = 0
    # `with` closes the file once the scan is complete.
    with open(filename, encoding='utf-8') as reader:
        for line in reader:
            if re.search(pattern, line):
                count += 1
    return count

In [67]:
pattern = r'\b(pale|pales|paled|paleness|pallor)\b'
count_matches(pattern)

223