# 第1章 文字

## 1.2 文字コード

In [1]:
# Listing 1.1

# Substring test: does each sentence mention the query?
query = '京都'
s1, s2 = '清水寺は京都にある', '浅草寺は東京にある'
print(query in s1, query in s2, sep='\n')

True
False


In [2]:
# Listing 1.2 #

def check_query(filename, query):
    """Report whether a UTF-8 text file contains the given text.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.
        query: Text to look for anywhere in the file contents.

    Returns:
        True if query occurs in the file contents, otherwise False.
    """
    with open(filename, mode='r', encoding='UTF-8') as handle:
        contents = handle.read()
    return query in contents

In [3]:
# Listing 1.3 #

# Check whether the first sample file contains the query text.
print(check_query('data/ch01/01.txt', '京都'))

True


In [4]:
# Listing 1.4 #

query = '京都'
file_list = ['data/ch01/%02d.txt' % x for x in (1, 2, 3, 4)]
# check_query always decodes as UTF-8, so a file in another encoding makes it
# raise UnicodeDecodeError. That failure is what this listing demonstrates, so
# it is caught and printed instead of aborting a "Restart & Run All".
try:
    for f in file_list:
        r = check_query(f, query)
        print('{} in {}...{}'.format(query, f, r))
except UnicodeDecodeError as e:
    print('{}: {}'.format(type(e).__name__, e))

京都 in data/ch01/01.txt...True


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbd in position 0: invalid start byte

In [5]:
# Listing 1.5 #

# Encode each ASCII string with the default codec and list its byte values,
# one list per line.
print(*(list(text.encode()) for text in ('a', 'abc')), sep='\n')

[97]
[97, 98, 99]


In [6]:
# Listing 1.6 #

# Byte values of the same character under three different encodings,
# one list per line.
print(*(list('京'.encode(codec)) for codec in ('EUC-JP', 'SHIFT_JIS', 'UTF-8')), sep='\n')

[181, 254]
[139, 158]
[228, 186, 172]


In [7]:
# Listing 1.7 #

import chardet

# Ask chardet to guess the encoding of a whole sentence encoded as EUC-JP.
print(chardet.detect('明日，京都に行きます'.encode('EUC-JP')))

{'encoding': 'EUC-JP', 'confidence': 0.99, 'language': 'Japanese'}


In [8]:
# Listing 1.8 #

# Same guess for a single EUC-JP encoded character; the recorded output for
# this cell reports ISO-8859-1 rather than EUC-JP.
print(chardet.detect('京'.encode('EUC-JP')))

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [9]:
# Listing 1.9 #

import chardet

def get_string_from_file(filename):
    """Read a file and decode it with the encoding guessed by chardet.

    Args:
        filename: Path of the file to read (opened in binary mode).

    Returns:
        The file contents decoded to a str.

    Raises:
        UnicodeDecodeError: If the bytes cannot be decoded with the chosen
            encoding.
    """
    # Only the read needs the file handle; release it before the slower
    # detection and decoding steps.
    with open(filename, 'rb') as f:
        raw = f.read()
    # Fall back to UTF-8 when no encoding could be guessed.
    encoding = chardet.detect(raw)['encoding'] or 'UTF-8'
    return raw.decode(encoding)

In [10]:
# Listing 1.10 #

def check_encoding_and_query(filename, query):
    """Report whether a file contains the given text.

    The file is loaded through get_string_from_file rather than being opened
    with a fixed encoding.

    Args:
        filename: Path of the file to search.
        query: Text to look for in the decoded contents.

    Returns:
        True if query occurs in the decoded contents, otherwise False.
    """
    return query in get_string_from_file(filename)

In [11]:
# Listing 1.11 #
#
# Same values as in Listing 1.4, defined here so that this cell runs on its
# own instead of relying on variables left behind by an earlier cell.
query = '京都'
file_list = ['data/ch01/%02d.txt' % x for x in (1, 2, 3, 4)]

for f in file_list:
    r = check_encoding_and_query(f, query)
    print('{} in {}...{}'.format(query, f, r))

京都 in data/ch01/01.txt...True
京都 in data/ch01/02.txt...False
京都 in data/ch01/03.txt...True
京都 in data/ch01/04.txt...True


## 1.3 文字Nグラム

In [12]:
# Listing 1.12 #

def get_ngram(string, N=1):
    """Return every contiguous length-N slice of string, in order of position.

    Args:
        string: Sequence to slice (a str in this notebook).
        N: Length of each n-gram; must be at least 1.

    Returns:
        A list of the len(string) - N + 1 slices, or an empty list when
        string is shorter than N.

    Raises:
        ValueError: If N is less than 1. Without this check the slicing
            below would silently yield empty or wrongly sized pieces.
    """
    if N < 1:
        raise ValueError('N must be a positive integer, got {!r}'.format(N))
    return [string[i:i + N] for i in range(len(string) - N + 1)]

string = '情報検索'
# Show the unigrams and then the bigrams, one list per line.
print(*(get_ngram(string, N=size) for size in (1, 2)), sep='\n')

['情', '報', '検', '索']
['情報', '報検', '検索']


In [13]:
# Listing 1.13 #

from collections import Counter

def get_most_common_ngram(filename, N=1, k=1):
    """Return the k most frequent character N-grams of a file.

    Args:
        filename: Path of the file; loaded through get_string_from_file.
        N: Length of the n-grams, passed on to get_ngram.
        k: How many of the most frequent n-grams to return.

    Returns:
        A list of (ngram, count) pairs as produced by Counter.most_common(k).
    """
    text = get_string_from_file(filename)
    frequencies = Counter()
    frequencies.update(get_ngram(text, N=N))
    return frequencies.most_common(k)

In [14]:
# Listing 1.14 #

# Print the five most frequent character trigrams of each sample text.
print(get_most_common_ngram('data/ch01/melos.txt', N=3, k=5))
print(get_most_common_ngram('data/ch01/album.txt', N=3, k=5))

[('メロス', 76), ('った。', 53), ('ロスは', 47), ('のだ。', 37), ('。メロ', 33)]
[('です。', 46), ('ている', 46), ('の写真', 34), ('した。', 33), ('のです', 32)]


## 1.4 正規表現

In [15]:
# Listing 1.15 #

import re

string = 'やっぱり『つぶ餡』が好き'
pattern = '『.*』'
# Compile the pattern, then search; group() with no argument is the whole match.
result = re.compile(pattern).search(string)
print(result.group())

『つぶ餡』


In [16]:
# Listing 1.16 #

string = 'やっぱり『つぶ餡』が好き'
pattern = '『(.*)』'
result = re.search(pattern=pattern, string=string)
# The single capture group holds the text between the brackets.
print(result.groups()[0])

つぶ餡


In [17]:
# Listing 1.17 #

string = 'やっぱり『つぶ餡』が好き'
pattern = '『((..).*)』'
result = re.compile(pattern).search(string)
# Groups are numbered by their opening parenthesis: 1 is the outer group,
# 2 is the nested two-character group. Print one per line.
print(*result.group(1, 2), sep='\n')

つぶ餡
つぶ


In [18]:
# Listing 1.18 #

string = 'このぼたもちはとてももちもちしている'
# \1 refers back to whatever the first group matched, so this finds two
# characters that are immediately repeated.
pattern = r'(..)\1'
result = re.compile(pattern).search(string)
print(result.group())

もちもち


In [19]:
# Listing 1.19 #

string = '『つぶ餡』にするか『こし餡』にするか'
# Greedy '.*' runs on to the last closing bracket in the string.
pattern = '『(.*)』'
result = re.search(pattern=pattern, string=string)
print(result.groups()[0])

つぶ餡』にするか『こし餡


In [20]:
# Listing 1.20 #

string = '『つぶ餡』にするか『こし餡』にするか'
# Non-greedy '.*?' stops at the first closing bracket.
pattern = '『(.*?)』'
result = re.compile(pattern).search(string)
print(result.groups()[0])

つぶ餡


In [21]:
# Listing 1.21 #

string = '『つぶ餡』にするか『こし餡』にするか'
pattern = '『(.*?)』'
# Collect the captured text of every non-overlapping match, left to right.
result = [match.group(1) for match in re.finditer(pattern, string)]
print(result)

['つぶ餡', 'こし餡']


In [22]:
# Listing 1.22 #

import re

def get_snippet_from_file(filename, query, width=2):
    """Return the first occurrence of query in a file, with surrounding context.

    Args:
        filename: Path of the file; loaded through get_string_from_file.
        query: Literal text to look for. Regex metacharacters in it are
            escaped, so a query such as '1.5' or '(' matches itself only.
        width: Maximum number of characters of context kept on each side.

    Returns:
        The matched text with up to width characters before and after it,
        or None when query does not occur in the file.
    """
    s = get_string_from_file(filename)
    # '.' does not match a newline by default, so the context never crosses
    # a line boundary.
    p = '.{0,%d}%s.{0,%d}' % (width, re.escape(query), width)
    r = re.search(p, s)
    return r.group(0) if r else None

In [23]:
# Listing 1.23 #

query = '京都'
file_list = ['data/ch01/%02d.txt' % x for x in range(1, 5)]
# One line per file: the path, a space, then the snippet found for the query
# (None when the query does not occur in that file).
for f in file_list:
    print('{} {}'.format(f, get_snippet_from_file(f, query, width=6)))

data/ch01/01.txt 祇園祭は京都三大祭の一つ
data/ch01/02.txt None
data/ch01/03.txt 京都のお土産でお
data/ch01/04.txt 築地市場は東京都江東区の豊洲
