In [27]:
from urllib.parse import quote
from urllib import request
import codecs
import re, html
from os import listdir
from os.path import isfile, join

class GoogleCrawler(object):
    """Look up a fill-in-the-blank multiple-choice question on Google.

    A sample line has the form
    ``[<no>]<question> ### a:<..>, b:<..>, c:<..>, d:<..>, e:<..>[END]``
    where the blank inside the question is written as ``BLANK_MARKER``.

    Attributes set by the constructor:
        no: question number parsed from the sample.
        raw_query: question text with the blank replaced by ``*``.
        raw_pattern: question text turned into a regex source; the blank
            becomes the capture group ``(.*?)`` and every whitespace run
            becomes ``\\s?``.
        options: the five answer options, stripped.
        max_padding: characters of context kept on each side of the blank.
        short_query / short_pattern: ``raw_query`` / ``raw_pattern`` cut down
            to a window around the blank (see ``make_short_string``).
        link: URL of the most recently built search, or ``None``.

    NOTE(review): the question text is inserted into ``raw_pattern`` without
    escaping and ``short_pattern`` is cut by character position, so
    ``short_pattern`` is not guaranteed to be a valid regex; ``search_answer``
    lets the resulting ``re.error`` propagate.
    """

    # Placeholder that marks the blank inside a question.
    BLANK_MARKER = '︽⊙＿⊙︽'
    _QUERY_WILDCARD = '*'
    _PATTERN_WILDCARD = '(.*?)'
    _SAMPLE_RE = re.compile(
        r'\[(\d+)\](.*)### a:(.*), b:(.*), c:(.*), d:(.*), e:(.*)\[END\]')
    _NOISE_RE = re.compile(r'(<br?>)|(</br?>)|\n|\r|\s')
    _SEARCH_PREFIX = "https://www.google.com.tw/search?q="
    _SEARCH_SUFFIX = '&lr=lang_zh-TW'

    def __init__(self, sample, max_padding=18):
        """Parse one sample line.

        Args:
            sample: text containing one question in the format described in
                the class docstring; only the first match is used.
            max_padding: context characters kept on each side of the blank.

        Raises:
            ValueError: if ``sample`` does not contain a question in the
                expected format.
        """
        match = self._SAMPLE_RE.search(sample)
        if match is None:
            raise ValueError(
                'sample is not in the expected question format: {!r}'.format(sample))
        no, content, *choices = match.groups()

        self.no = int(no)
        # Drop every double quote that is followed by another character.
        content = re.sub('(")(.)', r'\2', content.strip())
        self.raw_query = content.replace(self.BLANK_MARKER, self._QUERY_WILDCARD)
        # A callable replacement is used because a template containing "\s"
        # is rejected as a bad escape by re.sub on Python 3.7+.
        self.raw_pattern = re.sub(
            r'\s+',
            lambda _match: r'\s?',
            content.replace(self.BLANK_MARKER, self._PATTERN_WILDCARD))
        self.options = [choice.strip() for choice in choices]

        self.link = None
        self.max_padding = max_padding
        self._refresh_short_forms()

    def _refresh_short_forms(self):
        """Recompute the shortened query and pattern for the current padding."""
        self.short_query = self.make_short_string(
            self.raw_query, self._QUERY_WILDCARD)
        self.short_pattern = self.make_short_string(
            self.raw_pattern, self._PATTERN_WILDCARD)

    def _build_link(self, query):
        """Return the Google search URL for ``query``."""
        return self._SEARCH_PREFIX + quote(query) + self._SEARCH_SUFFIX

    def set_padding(self, padding):
        """Change the context width and rebuild the shortened query/pattern."""
        self.max_padding = padding
        self._refresh_short_forms()
        # Any cached link was built from the previous short query.
        self.link = None

    def get_link(self, query=None):
        """Return the search URL.

        With an explicit ``query`` the URL for that query is built and
        remembered. Without one, the remembered URL (e.g. from the last
        ``google_crawl``) is returned, or one is built from ``short_query``.
        """
        if query is not None:
            self.link = self._build_link(query)
        elif self.link is None:
            self.link = self._build_link(self.short_query)
        return self.link

    def make_short_string(self, string, pattern):
        """Return the window of ``string`` around the first ``pattern``.

        Up to ``max_padding`` characters are kept on each side. A side whose
        context would be a single character is dropped entirely. If
        ``pattern`` does not occur, ``string`` is returned unchanged because
        there is nothing to centre the window on.
        """
        idx = string.find(pattern)
        if idx < 0:
            return string

        end = idx + len(pattern)
        left_bound = max(idx - self.max_padding, 0)
        right_bound = min(end + self.max_padding, len(string))

        if idx - left_bound <= 1:
            left_bound = idx
        if right_bound - end == 1:
            right_bound = end
        return string[left_bound:right_bound]

    def search_answer(self):
        """Search Google and match ``short_pattern`` against the cleaned page.

        Returns:
            ``(index, option)`` for the first captured text that equals one
            of the options (compared case-insensitively, since the cleaned
            page is lower-cased), or ``None`` if nothing matches.
        """
        cleaned_content = self.clean_html(self.google_crawl(self.short_query))
        lowered_options = [option.lower() for option in self.options]
        # IGNORECASE: the page text is lower-cased but the pattern is not.
        for candidate in re.findall(self.short_pattern, cleaned_content, re.IGNORECASE):
            ans = candidate.strip().lower()
            if ans in lowered_options:
                index = lowered_options.index(ans)
                return (index, self.options[index])
        return None

    def search_fast_answer(self):
        """Search Google and return the first option that occurs in the page.

        Returns:
            ``(index, option)`` for the first option whose lower-cased text
            appears anywhere in the lower-cased raw HTML, or ``None``.
        """
        raw_html = self.google_crawl(self.short_query).lower()
        for index, option in enumerate(self.options):
            # An empty option would be "found" in any page, so skip it.
            if option and option.lower() in raw_html:
                return (index, option)
        return None

    def google_crawl(self, query=None):
        """Fetch the Google result page for ``query`` and return it as text.

        ``query`` defaults to ``short_query``. The URL used is stored in
        ``link``. The body is read once and decoded as cp950; if that fails
        it is decoded as big5 with undecodable bytes replaced. Network errors
        raised by ``urlopen`` propagate to the caller.
        """
        if query is None:
            query = self.short_query
        self.link = self._build_link(query)
        req = request.Request(self.link, headers={'User-Agent': "Magic Browser"})
        with request.urlopen(req) as response:
            payload = response.read()
        try:
            return payload.decode('cp950')
        except UnicodeDecodeError:
            return payload.decode('big5', errors='replace')

    def clean_html(self, raw_html=None):
        """Return ``raw_html`` normalised for pattern matching.

        The short query is removed, HTML entities are unescaped, the text is
        lower-cased, and ``<b>``/``<br>`` tags (opening and closing) and all
        whitespace are stripped. When ``raw_html`` is omitted the page for
        ``short_query`` is fetched first.
        """
        if raw_html is None:
            raw_html = self.google_crawl(self.short_query)
        without_query = raw_html.replace(self.short_query, '')
        unescaped_html = html.unescape(without_query)
        return self._NOISE_RE.sub('', unescaped_html.lower())

In [30]:
# Run the fast search over every question found under `path` and print the
# fraction of questions for which some option was found in the result page.
path = 'question_samples/'
sample_files = [f for f in listdir(path) if isfile(join(path, f))]
# i counts the questions attempted; j counts those with no answer found
# (either the search returned None or it raised).
i, j = 0, 0
for file in sample_files:
    with codecs.open(join(path, file), encoding='utf-8') as f:
        for line in f:
            # Only lines that carry the end-of-question marker are questions.
            if '[END]' not in line:
                continue
            crawler = GoogleCrawler(line)
            try:
                ans = crawler.search_fast_answer()
            except Exception:
                # Network/decoding failures count as a miss. Exception (not a
                # bare except) so Ctrl-C still stops the crawl.
                ans = None
            if ans is None:
                j += 1
            i += 1
if i:
    print(1. - float(j) / float(i))
else:
    # Avoid dividing by zero when no question lines were found.
    print('No questions found under {!r}'.format(path))
                
# for i in range(20):
#     print(i)
#     question = samples[i]
#     crawler = GoogleCrawler(question)
#     print('{}\n{}'.format(crawler.short_query, crawler.short_pattern))    
#     print(crawler.search_fast_answer())
#     print('----------')

0.8210526315789474


2
