In [6]:
from urllib.parse import quote
from urllib import request
import codecs
import re, html
from os import listdir
from os.path import isfile, join

class GoogleCrawler(object):
    """Answer a five-option fill-in-the-blank sample by searching Google.

    A sample is one line of the form
    ``[<no>]<content>### a:<..>, b:<..>, c:<..>, d:<..>, e:<..>[END]`` in which
    every blank inside ``<content>`` is written as the marker ``︽⊙＿⊙︽``.

    Attributes set by the constructor:
        no            -- sample number (int).
        raw_query     -- content with double quotes stripped and each blank
                         written as ``*``.
        raw_pattern   -- content as a regular expression: each blank becomes
                         ``(.*?)`` and each whitespace run becomes ``\\s?``.
        options       -- the five answer options, stripped, in a..e order.
        short_query   -- ``raw_query`` cut down around its first blank.
        short_pattern -- ``raw_pattern`` cut down around its first blank.
        max_padding   -- context characters kept on each side of that blank.
        link          -- URL of the last search, or None before any search.

    NOTE(review): the content is not regex-escaped and ``short_pattern`` is a
    plain character slice of ``raw_pattern``, so ``search_answer`` can still
    raise ``re.error`` for content with regex metacharacters or when the
    slice cuts an escape sequence in half.
    """

    # Marker the sample text uses for a blank.
    _BLANK = '︽⊙＿⊙︽'
    # Layout of one sample line (see the class docstring).
    _SAMPLE_RE = re.compile(r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[END\]')
    # What clean_html() removes: <b>, <br>, </b>, </br> and all whitespace.
    _NOISE_RE = re.compile(r'</?br?>|\s')

    def __init__(self, sample, max_padding = 18):
        """Parse `sample` and derive the query / pattern attributes.

        Args:
            sample: text holding one sample; only the first one found is used.
            max_padding: context characters kept around the first blank.

        Raises:
            ValueError: if `sample` holds no well-formed sample.
        """
        parsed = self._SAMPLE_RE.search(sample)
        if parsed is None:
            raise ValueError('no "[n]...### a:.., b:.., c:.., d:.., e:..[END]" sample found')
        no, content, a, b, c, d, e = parsed.groups()
        self.no = int(no)
        # Strip double quotes; the character captured after each quote is kept.
        content = re.sub('(")(.?)', r'\2', content.strip())
        self.raw_query = content.replace(self._BLANK, '*')
        # The template must spell the backslash as "\\": a bare "\s" in a
        # replacement string is rejected as a bad escape by Python 3.7+.
        self.raw_pattern = re.sub(r'\s+', r'\\s?', content.replace(self._BLANK, '(.*?)'))
        self.options = [x.strip() for x in (a, b, c, d, e)]
        # Initialised here so get_link() works before any crawl happened.
        self.link = None
        self.max_padding = max_padding
        self._refresh_short_forms()

    def _refresh_short_forms(self):
        """Recompute short_query / short_pattern for the current max_padding."""
        self.short_query = self.make_short_string(self.raw_query, '*')
        self.short_pattern = self.make_short_string(self.raw_pattern, '(.*?)')

    @staticmethod
    def _normalize(text):
        """Lower-case `text` and drop all whitespace, like the cleaned page text."""
        return re.sub(r'\s+', '', text).lower()

    def set_padding(self, padding):
        """Change max_padding and rebuild the shortened query and pattern."""
        self.max_padding = padding
        self._refresh_short_forms()

    def get_link(self, query = None):
        """Return the search URL.

        If a link is already stored (from an earlier call or from
        google_crawl) it is returned unchanged and `query` is ignored;
        otherwise a link is built from `query` (default: short_query).
        """
        if query is None: query = self.short_query
        if self.link is None:
            self.link = "https://www.google.com/search?q=" + quote(query)
        return self.link

    def make_short_string(self, string, pattern):
        """Return `string` cut to max_padding characters around `pattern`.

        Only the first occurrence of `pattern` is considered. Left context of
        at most one character, or right context of exactly one character, is
        dropped. When `pattern` does not occur, `string` is returned unchanged.
        """
        idx = string.find(pattern)
        if idx < 0:
            # No blank to centre on; slicing from index -1 would return junk.
            return string
        pattern_len = len(pattern)
        string_len = len(string)
        left_bound = max(idx - self.max_padding, 0)
        right_bound = min(idx + pattern_len + self.max_padding, string_len)

        if idx - left_bound <= 1: left_bound = idx
        if right_bound - (idx + pattern_len) == 1: right_bound = right_bound - 1
        return string[left_bound : right_bound]

    def search_answer(self):
        """Search for short_query and match short_pattern against the result.

        Returns:
            (option index, matched text lower-cased) for the first captured
            blank that equals an option ignoring case and whitespace, or None.
        """
        cleaned_content = self.clean_html(self.google_crawl(self.short_query))
        normalized_options = [self._normalize(x) for x in self.options]
        for m in re.findall(self.short_pattern, cleaned_content):
            # findall yields tuples when the pattern holds more than one group.
            candidates = m if isinstance(m, tuple) else (m,)
            for candidate in candidates:
                ans = candidate.strip().lower()
                if ans and ans in normalized_options:
                    return (normalized_options.index(ans), ans)
        return None

    def search_fast_answer(self):
        """Search for raw_query and return the first option present in the page.

        The page is lower-cased and stripped of whitespace, so each option is
        normalised the same way before the lookup.

        Returns:
            (option index, option as stored in `options`), or None.
        """
        page = self.clean_html(self.google_crawl(self.raw_query).lower())
        for index, option in enumerate(self.options):
            needle = self._normalize(option)
            # An empty needle would be "found" in any text; skip it.
            if needle and needle in page:
                return (index, option)
        return None

    def google_crawl(self, query = None):
        """Fetch the Google result page for `query` (default: short_query).

        Stores the requested URL in `link` and returns the response body
        decoded as UTF-8 with HTML entities unescaped. Errors raised by
        urlopen or by decoding propagate to the caller.
        """
        if query is None: query = self.short_query
        self.link = "https://www.google.com/search?q=" + quote(query)  + '&ie=utf8&oe=utf8'
        req = request.Request(self.link, headers = {'User-Agent' : "Magic Browser"})
        # Close the HTTP response deterministically instead of leaking it.
        with request.urlopen(req) as response:
            payload = response.read()
        return html.unescape(payload.decode('utf-8'))

    def clean_html(self, raw_html = None):
        """Remove <b>/<br> tags (opening and closing) and all whitespace.

        Other markup is left in place. With no argument, the page for
        short_query is fetched first.
        """
        if raw_html is None: raw_html = self.google_crawl(self.short_query)
        return self._NOISE_RE.sub('', raw_html)

In [12]:
# Run the fast lookup on every sample under `path` and report the share of
# samples for which an option was found.
path = 'question_samples/'
sample_files = [f for f in listdir(path) if isfile(join(path, f))]
# i counts the samples tried, j the ones that failed (no answer or an error).
i, j = 0, 0
for file in sample_files:
    with codecs.open(join(path, file), encoding='utf-8') as f:
        for line in f:
            # Only lines holding the sample terminator are samples.
            if '[END]' not in line:
                continue
            try:
                # Parsing sits inside the try so that one malformed line is
                # counted as a failure instead of aborting the whole run.
                crawler = GoogleCrawler(line)
                ans = crawler.search_fast_answer()
                if ans is None:
                    print('[failure]{}:{}'.format(file, line[:3]))
                    j += 1
            except Exception:
                # Exception, not a bare except, so Ctrl-C still stops the loop.
                print('[exception]:{}:{}'.format(file, line[:3]))
                j += 1
            i += 1
            # Running success ratio after the 1st, 101st, 201st, ... sample.
            if i % 100 == 1: print(1. - float(j) / float(i))
if i:
    print(1. - float(j) / float(i))
else:
    # Nothing was read; the ratio would divide by zero.
    print('no samples found in {}'.format(path))
                
# for i in range(20):
#     print(i)
#     question = samples[i]
#     crawler = GoogleCrawler(question)
#     print('{}\n{}'.format(crawler.short_query, crawler.short_pattern))    
#     print(crawler.search_fast_answer())
#     print('----------')

1.0
[failure]2016-08-02-07-28-35.txt:[1]
[failure]2016-08-07-01-23-23.txt:[1]
[failure]2016-08-07-12-46-48.txt:[1]
[failure]2016-08-07-12-46-48.txt:[5]
[failure]2016-08-04-01-56-08.txt:[1]
[failure]2016-08-04-01-56-08.txt:[5]
[failure]2016-08-03-19-43-20.txt:[1]
[failure]2016-08-03-19-43-20.txt:[3]
0.9207920792079208
[failure]2016-08-03-18-41-11.txt:[2]
[failure]2016-08-03-18-41-11.txt:[4]
[failure]2016-08-05-03-49-29.txt:[5]
[failure]2016-08-05-16-15-09.txt:[3]
[failure]2016-08-01-13-49-09.txt:[1]
[failure]2016-08-05-17-17-17.txt:[1]
[failure]2016-08-07-06-34-01.txt:[1]
[failure]2016-08-07-06-34-01.txt:[3]
0.9203980099502488
[failure]2016-08-05-06-55-54.txt:[3]
[failure]2016-08-05-07-58-03.txt:[3]
[failure]2016-08-01-10-42-46.txt:[3]
[failure]2016-08-05-14-10-51.txt:[2]
[failure]2016-08-05-22-27-57.txt:[1]
[failure]2016-08-07-15-53-08.txt:[1]
[failure]2016-08-02-12-39-15.txt:[5]
0.9235880398671097
[failure]2016-08-04-14-21-42.txt:[5]
[failure]2016-08-06-13-59-56.txt:[2]
[failure]2016-

```
[failure]2016-08-02-07-28-35.txt:[1]
[failure]2016-08-07-01-23-23.txt:[1]
[failure]2016-08-07-12-46-48.txt:[1]
[failure]2016-08-07-12-46-48.txt:[5]
[failure]2016-08-04-01-56-08.txt:[1]
[failure]2016-08-04-01-56-08.txt:[5]
[failure]2016-08-03-19-43-20.txt:[1]
[failure]2016-08-03-19-43-20.txt:[3]
[failure]2016-08-03-18-41-11.txt:[2]
[failure]2016-08-03-18-41-11.txt:[4]
[failure]2016-08-05-03-49-29.txt:[5]
[failure]2016-08-05-16-15-09.txt:[3]
[failure]2016-08-01-13-49-09.txt:[1]
[failure]2016-08-05-17-17-17.txt:[1]
[failure]2016-08-07-06-34-01.txt:[1]
[failure]2016-08-07-06-34-01.txt:[3]
[failure]2016-08-05-06-55-54.txt:[3]
[failure]2016-08-05-07-58-03.txt:[3]
[failure]2016-08-01-10-42-46.txt:[3]
[failure]2016-08-05-14-10-51.txt:[2]
[failure]2016-08-05-22-27-57.txt:[1]
[failure]2016-08-07-15-53-08.txt:[1]
[failure]2016-08-02-12-39-15.txt:[5]
[failure]2016-08-04-14-21-42.txt:[5]
[failure]2016-08-06-13-59-56.txt:[2]
[failure]2016-08-04-05-02-34.txt:[1]
[failure]2016-08-02-06-26-27.txt:[1]
[failure]2016-08-03-22-49-45.txt:[5]
[failure]2016-08-02-03-20-07.txt:[2]
[failure]2016-08-05-23-30-05.txt:[5]
[failure]2016-08-06-04-40-43.txt:[1]
[failure]2016-08-02-11-37-06.txt:[1]
[failure]2016-08-03-23-51-53.txt:[3]
[failure]2016-08-05-05-53-48.txt:[5]
[failure]2016-08-03-14-32-37.txt:[1]
[failure]2016-08-01-06-34-12.txt:[5]
[failure]2016-08-01-09-40-39.txt:[3]
[failure]2016-08-01-03-27-45.txt:[3]
[failure]2016-08-01-03-27-45.txt:[5]
[failure]2016-08-02-05-24-21.txt:[3]
[failure]2016-07-31-17-06-20.txt:[5]
[failure]2016-08-04-12-17-27.txt:[4]
[failure]2016-08-04-12-17-27.txt:[3]
[failure]2016-08-05-00-43-00.txt:[3]
[failure]2016-08-05-00-43-00.txt:[5]
[failure]2016-07-31-15-02-02.txt:[4]
[failure]2016-07-31-15-02-02.txt:[3]
[failure]2016-08-03-03-09-08.txt:[2]
[failure]2016-08-07-10-42-31.txt:[3]
[failure]2016-08-07-09-40-23.txt:[5]
[failure]2016-08-07-05-31-54.txt:[2]
[failure]2016-08-07-05-31-54.txt:[4]
[failure]2016-08-05-18-19-25.txt:[1]
[failure]2016-08-01-08-38-29.txt:[5]
[failure]2016-08-02-17-49-56.txt:[1]
0.934052757793765
```

In [40]:
# For every failure recorded in the log, re-read the sample it refers to and
# print the length of its query and the number of blanks ('*') in it.
with open('question_samples/failure_log.txt', 'r') as f:
    for line in f:
        record = re.search(r'\[failure\](.*):\[(\d+)\]', line)
        if record is None:
            # Not a failure record (for example a blank line or a bare
            # number); indexing an empty findall() result would raise here.
            continue
        filename, no = record.groups()
        with codecs.open('question_samples/' + filename, 'r', 'utf-8') as g:
            for s in g:
                try:
                    crawler = GoogleCrawler(s)
                except Exception:
                    # Lines that do not parse as a sample are skipped.
                    continue
                if int(no) == int(crawler.no):
                    print('filename/no: {}/{},\tqLen: {},\tqNum: {}'.format(filename, no, len(crawler.raw_query), crawler.raw_query.count('*')))


filename/no: 2016-08-02-07-28-35.txt/1,	qLen: 107,	qNum: 1
filename/no: 2016-08-07-01-23-23.txt/1,	qLen: 141,	qNum: 1
filename/no: 2016-08-07-12-46-48.txt/1,	qLen: 36,	qNum: 1
filename/no: 2016-08-07-12-46-48.txt/5,	qLen: 50,	qNum: 1
filename/no: 2016-08-04-01-56-08.txt/1,	qLen: 36,	qNum: 1
filename/no: 2016-08-04-01-56-08.txt/5,	qLen: 50,	qNum: 1
filename/no: 2016-08-03-19-43-20.txt/1,	qLen: 187,	qNum: 1
filename/no: 2016-08-03-19-43-20.txt/3,	qLen: 183,	qNum: 1
filename/no: 2016-08-03-18-41-11.txt/2,	qLen: 145,	qNum: 1
filename/no: 2016-08-03-18-41-11.txt/4,	qLen: 35,	qNum: 1
filename/no: 2016-08-05-03-49-29.txt/5,	qLen: 39,	qNum: 1
filename/no: 2016-08-05-16-15-09.txt/3,	qLen: 40,	qNum: 1
filename/no: 2016-08-01-13-49-09.txt/1,	qLen: 105,	qNum: 1
filename/no: 2016-08-05-17-17-17.txt/1,	qLen: 95,	qNum: 1
filename/no: 2016-08-07-06-34-01.txt/1,	qLen: 187,	qNum: 1
filename/no: 2016-08-07-06-34-01.txt/3,	qLen: 183,	qNum: 1
filename/no: 2016-08-05-06-55-54.txt/3,	qLen: 106,	qNum: 1
filen

In [66]:
# Directory of the sample files, and the single sample (file name plus
# question number) that the rest of this cell inspects.
path = 'question_samples/'
filename = '2016-07-31-15-02-02.txt'
no = 4
def get_sample(file_name, no_sample):
    """Return the first line of `file_name` that holds sample `no_sample`.

    The file is read as UTF-8, line by line. A line qualifies when it contains
    a "[n]...### a:.., b:.., c:.., d:.., e:..[END]" sample whose number n
    equals int(no_sample). The line is returned as read (line ending
    included); None is returned when no line qualifies.
    """
    sample_re = r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[END\]'
    with codecs.open(file_name, 'r', 'utf-8') as handle:
        for text in handle:
            # Group 1 of each hit is the sample number.
            if any(int(hit.group(1)) == int(no_sample) for hit in re.finditer(sample_re, text)):
                return text
    return None
            
# Parse the selected sample, show its query and options on separate lines,
# then run the fast lookup (its result is the cell's displayed value).
crawler = GoogleCrawler(get_sample(path + filename, no))
print(crawler.raw_query, crawler.options, sep='\n')
crawler.search_fast_answer()

因為地處*的高點--號稱"全*最高的溫泉",又有"*101"的別
['烏來', '熊肉', '農會', '新港鴨肉羹', '夜釣小管']


In [28]:
# Number of characters in the parsed query of the crawler currently in scope.
len(crawler.raw_query)

167

In [85]:
import difflib
# NOTE(review): `src` and `tar` are not assigned in any cell visible in this
# notebook; they must already exist in the kernel for this cell to run.
# Compare `src` (a) with `tar` (b) and list the opcodes describing how to
# turn a into b.
sm = difflib.SequenceMatcher(None, a=src, b=tar )
sm.get_opcodes()

[('insert', 0, 0, 0, 16401),
 ('equal', 0, 36, 16401, 16437),
 ('insert', 36, 36, 16437, 47730)]

In [4]:
# NOTE(review): jieba is imported here but nothing in this cell uses it.
import jieba
# Query text containing a single '*' blank, kept as input for the
# commented-out experiments below.
a = '他說 然後擦*前有刮鬍子  他覺得這款保濕露對刮鬍子後的刺激感有舒緩功'
# a.find('*')
# m = list(re.finditer(r'\*',a))
# m

In [70]:
# Character count of the query text up to and including its third '*' blank.
len('因為地處*的高點--號稱"全*最高的溫泉",又有"*')

26