In [183]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import glob 
import json
import re
import string
import operator
import collections
import numpy as np
import string
import math
from tqdm import tqdm
from array import array
import pandas as pd
import nltk
import nltk.data
from nltk.corpus import treebank
from textblob import TextBlob
from nltk import Tree
from nltk.chunk.regexp import *

In [55]:
def splitSentence(paragraph):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(paragraph)
    return sentences

In [15]:
NONE_STR = '<s>'

In [16]:
def delete_other_letter(line):
    punctuation ="""!"#$%&\()*+,./:;<=>?@[\\]^_`{|}~\\n"""
    re_punctuation ="[{}]+".format(punctuation)
    line =re.sub(re_punctuation, "", line)
    return line.lower()

In [18]:
def read_articles(paths):
    alllines = []
    for path in paths:
        filenames = glob.glob(path+"/*txt")
        for filename in filenames:
            with open(filename, 'r', encoding='utf-8') as fpr:
                data_raw = json.load(fpr)
                article = data_raw['article']
                
#                 lines = article.split('.')
                lines = splitSentence(article)
                for line in lines:
                    line = delete_other_letter(line)
                    alllines.append(line)
                
    return alllines

In [56]:
lines = read_articles(['./RACE/train/high','./RACE/train/middle'])

In [57]:
print(len(lines))

443717


In [23]:
def word_dict(lines,n):
                
    pre_words = collections.defaultdict(int)
    be_words = collections.defaultdict(int)
    n = n-1
    for line in lines:
        for k in range(n):
            line = NONE_STR+' '+line+' '+NONE_STR
        line_words = line.split()
        for i in range(len(line_words)-(n-1)):
            ngramTemp = ' '.join(line_words[i:i+n])

            pre_words[ngramTemp] += 1
            be_words[line_words[i]] += 1
            
    return pre_words,be_words

In [24]:
def replace_punctuation(word):
    for puc in punctuation_list:
        word = word.replace(puc,'')
    return word

In [47]:
def replace_Chinese_punctuation(content):
    content = content.replace('’','\'')
    content = content.replace('”','"')
    content = content.replace('“','"')
    return content

In [26]:
def get_options(options_str):
    options_str = replace_Chinese_punctuation(options_str)
    options = []
    opt_rows = options_str.split('#')
    for opt in opt_rows:
        indexA = opt.find('A.')
        indexB = opt.find('B.')
        indexC = opt.find('C.')
        indexD = opt.find('D.')
        As = opt[indexA+2:indexB].strip().lower().split(' ')
        Bs = opt[indexB+2:indexC].strip().lower().split(' ')
        Cs = opt[indexC+2:indexD].strip().lower().split(' ')
        Ds = opt[indexD+2:].strip().lower().split(' ')
        options.append([As,Bs,Cs,Ds])
    return options

In [323]:
def read_train_data(ngram=3):
    path = './data'
    filenames = glob.glob(path+"/*txt")
    
    answers_list =[]
    options_list = []
    content_list = [] #问题空的前后缀
    sentence_list = [] #问题所在句子
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8') as fpr:
            data_raw = json.load(fpr)
            article = data_raw['article']
            content = get_pre_and_be_words(article,ngram)
            sentence = get_question_sentences(article)
            options_str = data_raw['options']
            options = get_options(options_str)
            answers = list(data_raw['answers'])
            answers_list.append(answers)
            options_list.append(options)
            content_list.append(content)
            sentence_list.append(sentence)
    return content_list,sentence_list,options_list,answers_list

In [343]:
def is_options_are_the_same_tag(options):
    tag = ''
    for opt in options:
        sen_pos = nltk.pos_tag(nltk.word_tokenize(opt[0]))
        opt_tag = sen_pos[0][1]
        if tag == '':
            tag = opt_tag
        elif tag != opt_tag:
            return False
    return True

In [346]:
def get_pre_and_be_words(article,ngram):
    article = replace_Chinese_punctuation(article)
    art_words = article.split()
    n = len(art_words)
    index = 1
    content = []
    for i in range(n):
        word = art_words[i]
        if str(index) == word:
            pre_word_list = []
            be_word_list = []
            if i < ngram-1:
                for k in range(ngram-1-i):
                    pre_word_list.append(NONE_STR)
            else:
                pre_word_list = art_words[i-1-(ngram-2):i]
            pre_word_list = check_punctuation_and_replace_with_none(pre_word_list,True)

            if i > n-ngram:
                for k in range(ngram-1-(n-ngram)):
                    be_word_list.append(NONE_STR)
            else:
                be_word_list = art_words[i+1:i+1+ngram-1]
            be_word_list = check_punctuation_and_replace_with_none(be_word_list,False)
            
            content.append([pre_word_list,be_word_list])
            index += 1
    return content

自动做出选择的代码，逻辑如下


1.先判断4个选项的词性，相同则转2，不同则转3

2.如果能找到有效分块则采用模型2.0

3.如果不能则采用trigram模型2.0

In [373]:
def get_probability_of_block(block,option,opt_pos):
        
    if block is None:
        return 0
    
    n = len(option)
    for i in range(n):
        if i == 0:
            block[opt_pos]=option[i]
            continue
        block.insert(opt_pos+i,option[i])
    
    ngram = len(block)
    if ngram > 4:
        return 0
    
    pre_word = ' '.join(block[0:ngram-1])
    prob = math.log(probability(pre_word,block[ngram-1],ngram))
    
    return prob

In [368]:
def auto_select_model(question,options,ngram=3,block=None,opt_pos=-1,rate=0):
    index = 0
    maxprob = -10000000000
    n = len(options)
    for i in range(n):
        
        option = options[i]
        block_prob = get_probability_of_block(block,option,opt_pos)
        
        word_list = []
        word_list = question[0].copy()
        word_list.append(options[i][0])
        word_list.extend(question[1])
        wln = len(word_list)
        prob = 0
        for k in range(wln-ngram+1):
            pre_word = ' '.join(word_list[k:k+ngram-1])
            prob += math.log(probability(pre_word,word_list[k+ngram-1],ngram))
        
        if block_prob > 0:
            prob = block_prob*rate+prob*(1-rate)
        if prob > maxprob:
            maxprob = prob
            index = i
    return chr(ord('A')+index)

In [381]:
def auto_select(question,sentence,options,index):
    #对于模型2.0 ngram默认用3
    ngram = 3
    flag = is_options_are_the_same_tag(options)    
    if flag:
        print('--',sentence)
        print('---',index)
        print('----',options)
        block,opt_pos = get_sentence_option_block(sentence,index,options)
        print(block,opt_pos)
        if block != None:
            ngram = len(block)
            print('使用model3.0做出的预测')
            print('有用的分块',block)
            answer = auto_select_model(question,options,ngram=ngram,block=block,opt_pos=opt_pos)
            return answer
    print('使用model2.0做出的预测')
    answer = auto_select_model(question,options,ngram)

    return answer

In [386]:
test(content_list,sentence_list,options_list,answers_list,showAll=True)

---------1---------
before: [['since', "i'd"], ['seen', 'him']]
使用model2.0做出的预测
after: [['since', "i'd"], ['seen', 'him']]
predict: A ground truth: D
空前后的词 [['since', "i'd"], ['seen', 'him']]
填空所在的句子 He'd moved to England with his mum when he was three and it had been 13 years since I'd  1  seen him.
选项 [['also'], ['often'], ['even'], ['last']]
---------2---------
before: [['imagine', 'my'], ['when', 'he']]
-- So imagine my  2   when he emailed me saying he wanted to come to visit me.
--- 2
---- [['delight'], ['relief'], ['anger'], ['worry']]
##
['So', 'imagine', 'my', 'delight'] 3
使用model3.0做出的预测
有用的分块 ['So', 'imagine', 'my', 'delight']
after: [['imagine', 'my'], ['when', 'he']]
predict: A ground truth: A
空前后的词 [['imagine', 'my'], ['when', 'he']]
填空所在的句子 So imagine my  2   when he emailed me saying he wanted to come to visit me.
选项 [['delight'], ['relief'], ['anger'], ['worry']]
---------3---------
before: [['i', 'was'], ['<s>', '<s>']]
-- I was   3  !
--- 3
---- [['scared'], ['shocke

TypeError: 'NoneType' object is not iterable

In [375]:
def test(content_list,sentence_list,options_list,answers_list,showAll=False,predict=None):
    
    cn = len(content_list)
    right_n = 0
    total_n = 0
    for k in range(cn):
        content = content_list[k]
        answers = answers_list[k]
        options = options_list[k]
        sentence = sentence_list[k]
    
        n = len(answers)
        total_n += n
        for i in range(n):
            if showAll:
                print('---------'+str(i+1)+'---------')
            if predict is None:
                print('before:',content[i])
                choice = auto_select(content[i],sentence[i],options[i],i+1)
                print('after:',content[i])
            else:
                choice = predict
            if choice == answers[i]:
                right_n += 1
                if showAll == False:
                    print('---------'+str(i+1)+'---------')
                    print('predict:',choice,'ground truth:',answers[i])
                    print('空前后的词',content[i])
                    print('填空所在的句子',sentence[i])
                    print('选项',options[i])
            if showAll:
                print('predict:',choice,'ground truth:',answers[i])
                print('空前后的词',content[i])
                print('填空所在的句子',sentence[i])
                print('选项',options[i])
    accuracy = float(right_n)/total_n
    print('accuracy:',accuracy)

In [292]:
def get_words_matrix(pre_words,be_words,lines,n):  
    n = n-1
    for line in tqdm(lines):
        for k in range(n):
            line = NONE_STR+' '+line+' '+NONE_STR
        line_words = line.split()
        words_n = len(line_words)-n
        for k in range(words_n):
            pre_word = ' '.join(line_words[k:k+n])
                
            be_word = line_words[k+n]
            
            if pre_word in pre_words and be_word in be_words:
                i = pre_words.index(pre_word)
                j = be_words.index(be_word)
                words_matrix[i][j] += 1
                    
#             if n > 1:
#                 if pre_word in pre_words:
#                     i = pre_words.index(pre_word)
#                     j = be_words.index(be_word)
#                     words_matrix[i][j] += 1
#             else:
#                 i = pre_words.index(pre_word)
#                 j = be_words.index(be_word)
#                 words_matrix[i][j] += 1
            
    return words_matrix

In [32]:
def count_word_num(lines,n):
    pre_words = collections.defaultdict(int)
    be_words = collections.defaultdict(int)
    
    words = collections.defaultdict(int)
    n = n-1
    
    index = 0
    for line in lines:
        for k in range(n):
            line = NONE_STR+' '+line+' '+NONE_STR
        line_words = line.split()
        for i in range(len(line_words)-(n-1)):
            pre_word = ' '.join(line_words[i-1:i+n-1])
            be_word = line_words[i+n-1]
            
            pre_words[pre_word] += 1
            be_words[be_word] += 1
            
            words[pre_word+' '+be_word] += 1
    return pre_words,be_words,words

In [33]:
punctuation_list = ['.',',','?','!','\"','“']

In [34]:
def check_punctuation_and_replace_with_none(word_list,reverse = False):
    n = len(word_list)
    f_flag = False #第一个字符是否为标点
    b_flag = False #最后一个字符是否为标点
    if reverse == True:
        for i in range(n-1, -1, -1):
            word = word_list[i]
            if b_flag == False:
                for puc in punctuation_list:
                    if word[-1]==puc :
                        b_flag = True 
                        break
                    elif word[0] == puc:
                        f_flag = True
                        break
            if b_flag == True:
                word_list[i] = NONE_STR
            else:
                word_list[i] = replace_punctuation(word).lower()
            if f_flag == True:
                b_flag = True
    else:
        for i in range(n):
            word = word_list[i]
            if f_flag == False:
                for puc in punctuation_list:
                    if word[0] == puc:
                        f_flag = True
                        break
                    elif word[-1]==puc :
                        b_flag = True 
                        break
            if f_flag == True:
                word_list[i] = NONE_STR
            else:
                word_list[i] = replace_punctuation(word).lower()
            if b_flag == True:
                f_flag = True
    return word_list

In [184]:
def get_question_sentences(article):
    article = replace_Chinese_punctuation(article)
    sentences = splitSentence(article)
    
    content = []
    index = 1
    for sentence in sentences:
        art_words = sentence.split()
        n = len(art_words)
        for i in range(n):
            word = art_words[i]
            if str(index) == word:
                content.append(sentence)
                index += 1
    return content

In [75]:
def delete_sparse_words(words_dict,threshold=4):
    words_list = []
    for key, value in words_dict.items():
        if value > 4:
            words_list.append(key)
    return words_list

In [281]:
def probability(word1,word2,ngram):
    pre_words = pre_words_list[ngram-2]
    words = words_list[ngram-2]
    
    eclipse = 0.0000000000001
    all = pre_words[word1]
    single = words[word1+' '+word2]
    
    if all == 0:
        return eclipse
    
    prob = float(single)/all
    if prob == 0:
        prob = eclipse
    return prob

<font color=blue size=8 face=雅黑>使用NLTK将句子划分开</font>

计算bi、tri、four-gram的数据，准备之后用

In [273]:
pre_words_list = []
be_words_list = []
words_list = []
for i in range(2,5):
    pre_words,be_words,words = count_word_num(lines,i)
    pre_words_list.append(pre_words)
    be_words_list.append(be_words)
    words_list.append(words)

In [328]:
content_list,sentence_list,options_list,answers_list = read_train_data()

In [329]:
print(content_list[0][8])
print(sentence_list[0][8])

[['t-shirt', 'and'], ['into', 'the']]
I took off my T-shirt and  9   into the water.


In [295]:
print(options_list[0][8])

[['stared'], ['sank'], ['dived'], ['fell']]


In [296]:
def get_sentence_block(sentence):
    test_sent_pos = nltk.pos_tag(nltk.word_tokenize(sentence))
    sentence_tree = reg_parser.parse(test_sent_pos)
    # sentence_tree.draw()

    def get_words_from_tree(tree,word_list):
        if isinstance(tree, Tree): 
            for i in range(len(tree)):
                get_words_from_tree(tree[i],word_list)
        else:
            word_list.append(tree[0])


    temp_list = []
    for child in sentence_tree:
        if isinstance(child, Tree):               
            if child.label() == 'VERB_CON' or child.label() == 'NP':
                if len(child)>1 and len(child)<5:
                    item = []
                    get_words_from_tree(child,item)
                    temp_list.append(item)

    return temp_list

In [297]:
def does_list_exits_in_other_list(list1,list2):
    index = -1
    pos = -1
    n = len(list1)
    flag = False
    for opt in list2:
        for i in range(index+1,n):
            word = list1[i]
            if flag == True:
                if opt != word:
                    return False
                else:
                    index += 1
                    break
            else:
                if opt == word:
                    index = i
                    pos = index
                    flag = True
                    break
    return flag,pos

In [396]:
def get_sentence_option_block(sentence,index,options):
    #用A选项替换空中的标号
    A = options[0]
    sentence = sentence.replace(' '+str(index)+' ',' '.join(A))
    sentence = sentence.replace(str(index)+' ',' '.join(A))
    sentence = sentence.replace(' '+str(index),' '.join(A))
    #获得句子分块
    blocks = get_sentence_block(sentence)
    #找到分块中与选项相关的部分
    for block in blocks:
        flag,pos = does_list_exits_in_other_list(block,A)
        if flag:
            return block,pos

In [299]:
print(content_list[0])

["He'd moved to England with his mum when he was three and it had been 13 years since I'd  1  seen him.", 'So imagine my  2   when he emailed me saying he wanted to come to visit me.', 'I was   3  !', 'I arrived early at Byron Bay where we were supposed to   4  .', 'The bay was  5   in sunshine, and there was a group of kayakers around 150m off the shore.', 'Getting a little   6  , I realized one kayak  was in  7  .', 'Getting a little   6  , I realized one kayak  was in  7  .', '"Something\'s not   8  !"', 'I took off my T-shirt and  9   into the water.', 'He was  10   violently.', 'Linking arms with one of the instructors， I helped  11   the young man out of the water.', 'He was unconscious and as I looked at his face, something  12   to me.', 'Those brown eyes were very   13  .', '"Ben," he replied, and immediately I  14  .', '15 , after a brief stay in hospital, Ben was well enough to be allowed to  16   and later the family met up for dinner.', '15 , after a brief stay in hospital

In [300]:
print(options_list[0])

[[['also'], ['often'], ['even'], ['last']], [['delight'], ['relief'], ['anger'], ['worry']], [['scared'], ['shocked'], ['thrilled'], ['ashamed']], [['talk'], ['stay'], ['meet'], ['settle']], [['bathed'], ['clean'], ['deep'], ['formed']], [['faster'], ['closer'], ['heavier'], ['wiser']], [['trouble'], ['advance'], ['question'], ['battle']], [['real'], ['right'], ['fair'], ['fit']], [['stared'], ['sank'], ['dived'], ['fell']], [['arguing'], ['fighting'], ['shouting'], ['shaking']], [['lead'], ['persuade'], ['carry'], ['keep']], [['happened'], ['occurred'], ['applied'], ['appealed']], [['sharp'], ['pleasant'], ['attractive'], ['familiar']], [['agreed'], ['hesitated'], ['doubted'], ['knew']], [['fortunately'], ['frankly'], ['sadly'], ['suddenly']], [['return'], ['relax'], ['speak'], ['leave']], [['joked'], ['turned'], ['listened'], ['pointed']], [['created'], ['honored'], ['saved'], ['guided']], [['coincidence'], ['change'], ['pity'], ['pain']], [['on', 'board'], ['in', 'time'], ['for', 's

In [301]:
print(answers_list[0])

['D', 'A', 'C', 'C', 'A', 'B', 'A', 'B', 'C', 'D', 'C', 'B', 'D', 'D', 'A', 'D', 'B', 'C', 'A', 'B']


In [397]:
test(content_list,sentence_list,options_list,answers_list,showAll=True)

---------1---------
before: [['since', "i'd"], ['seen', 'him']]
使用model2.0做出的预测
after: [['since', "i'd"], ['seen', 'him']]
predict: A ground truth: D
空前后的词 [['since', "i'd"], ['seen', 'him']]
填空所在的句子 He'd moved to England with his mum when he was three and it had been 13 years since I'd  1  seen him.
选项 [['also'], ['often'], ['even'], ['last']]
---------2---------
before: [['imagine', 'my'], ['when', 'he']]
-- So imagine my  2   when he emailed me saying he wanted to come to visit me.
--- 2
---- [['delight'], ['relief'], ['anger'], ['worry']]
[['So', 'imagine', 'my', 'delight'], ['emailed', 'me'], ['saying', 'he'], ['wanted', 'to', 'come', 'to', 'visit', 'me']]
['So', 'imagine', 'my', 'delight'] 3
使用model3.0做出的预测
有用的分块 ['So', 'imagine', 'my', 'delight']
after: [['imagine', 'my'], ['when', 'he']]
predict: A ground truth: A
空前后的词 [['imagine', 'my'], ['when', 'he']]
填空所在的句子 So imagine my  2   when he emailed me saying he wanted to come to visit me.
选项 [['delight'], ['relief'], ['anger'], 

TypeError: 'NoneType' object is not iterable

功能测试代码。

In [232]:

sentence = 'I took off my T-shirt and  stared    into the water.'
# opt_word = 

# ‘<XX>’ 代表XX类型的词 ‘<XX>?’代表必须含有这个类型的词1或者0次 '<XX>*‘表示含有有这个类型的词0或者多次 '<XX>+'表示一定含有这个词

# NP DT:限定词 JJ:形容词 NNP:专有名词 NN:名词单数 NNS:名词复数
# Preposition
# Verb
# PP -> P NP
# VP -> V (NP|PP)*

reg_parser = RegexpParser('''
        NP: {<PRP.*>?<MD>?<PRP$>?<DT>?<JJ.*>*<NN.*>*}   
         P: {<IN>}    
      VERB: {<RB.*>?<V.*>?<RB.*>?<TO>*<RP>?}                
  VERB_CON: {<VERB>+<NP>?<P>?<NP>?}             
''')

test_sent = "Mr. Obama played a big role in the insurance bill"
# test_sent = "He'd moved to England with his mum"
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(sentence))
sentence_tree = reg_parser.parse(test_sent_pos)
print(sentence_tree)
# sentence_tree.draw()

def get_words_from_tree(tree,word_list):
    if isinstance(tree, Tree): 
        for i in range(len(tree)):
            get_words_from_tree(tree[i],word_list)
    else:
        word_list.append(tree[0])


temp_list = []
for child in sentence_tree:
    if isinstance(child, Tree):               
        if child.label() == 'VERB_CON' or child.label() == 'NP':
            if len(child)>1 and len(child)<5:
                item = []
                get_words_from_tree(child,item)
                temp_list.append(item)
                
print(temp_list)

(S
  (NP I/PRP)
  (VERB_CON (VERB took/VBD off/RP) (NP my/PRP$ T-shirt/NN))
  and/CC
  (VERB_CON (VERB stared/VBD) (P into/IN) (NP the/DT water/NN))
  ./.)
[['took', 'off', 'my', 'T-shirt'], ['stared', 'into', 'the', 'water']]


In [272]:
block = get_sentence_option_block(content_list[0][8],9,options_list[0][8])
print(block)

['stared', 'into', 'the', 'water']
