### Preparation

In [1]:
import re
from itertools import zip_longest

import spacy
import xlwt

In [2]:
# Rewrite the prediction file in place, replacing the modal "shall"/"Shall"
# with "will"/"Will".
# NOTE: this overwrites the input file; keep a copy if the raw predictions
# are needed again.
prediction_path = 'F:\\Y4\\NLP\\triage\\europarl\\result\\model\\prediction.txt'

# Read and write as UTF-8 explicitly: the annotation step later opens this
# same file with encoding='utf-8', so relying on the platform default here
# could fail or re-encode the text differently.
with open(prediction_path, 'r', encoding='utf-8') as source:
    original_lines = source.readlines()

with open(prediction_path, 'w', encoding='utf-8') as target:
    for original_line in original_lines:
        # \b anchors keep the substitution to whole words, so words that
        # merely contain "shall" (e.g. "shallow") are left untouched.
        rewritten = re.sub(r'\bshall\b', 'will', original_line)
        rewritten = re.sub(r'\bShall\b', 'Will', rewritten)
        target.write(rewritten)

In [3]:
class TenseSegmentModel:
    """A single token of a verb group, reduced to the fields used for matching.

    Attributes:
        token: the token's surface text.
        lemma: the token's lemma (base form).
        tag: the token's part-of-speech tag, compared against the tag lists
            in the tense patterns (e.g. 'VBZ', 'MD').
    """

    def __init__(self, token, lemma, tag):
        self.token, self.lemma, self.tag = token, lemma, tag

In [4]:
class TenseModel:
    """Classifies one verb group (a list of tense segments) as a named tense.

    After construction, ``self.tense`` holds the name of the first pattern
    that matched, or ``None`` when no pattern matched. Passive patterns are
    tried before the active ("normal") ones.

    A pattern is a list with one single-key dict per expected segment:

    * the key is the lemma required at that position, ``'_'`` for "any
      lemma" (only honoured at the last position), or one of the
      placeholders ``'_all_modalpre_'`` / ``'_all_modalpas_'`` standing for
      any modal from ``get_all_modalpre()`` / ``get_all_modalpas()``;
    * the value is the list of accepted tags. For the modal placeholders the
      tag list is not consulted by ``find_tense``.
    """

    def __init__(self, tense_segments: list):
        self.segments = tense_segments
        self.tense = None
        # Passive constructions take priority; the active table is consulted
        # only if the passive one left the tense unset.
        for pattern_table in (self.get_passive_pattern, self.get_normal_pattern):
            self.find_tense(pattern_table())
            if self.tense is not None:
                break

    @staticmethod
    def get_passive_pattern():
        """Return the passive-voice patterns, in matching priority order.

        Patterns are tried in dict order and the first match wins, so
        'PreSubjunctive' (same pattern as 'Modal2') and 'Infinitive'
        ('must' is also covered by 'Modal1') are shadowed by earlier entries.
        """
        passive_patterns = {
            'Present': [{'be': ['VBZ', 'VBP']}, {'_': ['VBN']}],
            'Modal1': [{'_all_modalpre_': ['MD']}, {'be': ['VB']}, {'_': ['VBN']}],
            'PresentContinuous': [{'be': ['VBZ', 'VBP']}, {'be': ['VBG']}, {'_': ['VBN']}],
            'Past': [{'be': ['VBD']}, {'_': ['VBN']}],
            'PastContinuous': [{'be': ['VBD']}, {'be': ['VBG']}, {'_': ['VBN']}],
            'Modal2': [{'_all_modalpas_': ['MD']}, {'be': ['VB']}, {'_': ['VBN']}],
            'PrePerfect': [{'have': ['VBZ', 'VBP']}, {'be': ['VBN']}, {'_': ['VBN']}],
            'PasPerfect': [{'have': ['VBD']}, {'be': ['VBN']}, {'_': ['VBN']}],
            'Future': [{'will': ['MD']}, {'be': ['VB']}, {'_': ['VBN']}],
            'FutureContinuous': [{'will': ['MD']}, {'be': ['VB']}, {'be': ['VBG']}, {'_': ['VBN']}],
            'PreSubjunctive': [{'_all_modalpas_': ['MD']}, {'be': ['VB']}, {'_': ['VBN']}],
            'PasSubjunctive': [{'_all_modalpas_': ['MD']}, {'have': ['VB']}, {'be': ['VBN']}, {'_': ['VBN']}],
            'Infinitive': [{'must': ['MD']}, {'be': ['VB']}, {'_': ['VBN']}],
        }
        return passive_patterns

    @staticmethod
    def get_normal_pattern():
        """Return the active-voice patterns, in matching priority order."""
        active_patterns = {
            'Present': [{'_': ['VBZ', 'VBP', 'VB']}],
            'Modal1': [{'_all_modalpre_': ['MD']}, {'_': ['VB']}],
            'Modal2': [{'_all_modalpas_': ['MD']}, {'_': ['VB']}],
            'PresentContinuous': [{'be': ['VBZ', 'VBP']}, {'_': ['VBG']}],
            'Past': [{'_': ['VBD']}],
            'PastContinuous': [{'be': ['VBD']}, {'_': ['VBG']}],
            'PrePerfect': [{'have': ['VBZ', 'VBP']}, {'_': ['VBN']}],
            'PrePerfectContinuous': [{'have': ['VBZ', 'VBP']}, {'be': ['VBN']}, {'_': ['VBG']}],
            'PasPerfect': [{'have': ['VBD']}, {'_': ['VBN']}],
            'PasPerfectContinuous': [{'have': ['VBD']}, {'be': ['VBN']}, {'_': ['VBG']}],
            'FutPerfect': [{'will': ['MD']}, {'have': ['VB']}, {'_': ['VBN']}],
            'FutPerfectContinuous': [{'will': ['MD']}, {'have': ['VB']}, {'be': ['VBN']}, {'_': ['VBG']}],
            'Future': [{'will': ['MD']}, {'_': ['VB']}],
            'FutureContinuous': [{'will': ['MD']}, {'be': ['VB']}, {'_': ['VBG']}],
        }
        return active_patterns

    @staticmethod
    def get_all_modalpre():
        """Lemmas accepted wherever a pattern uses '_all_modalpre_'."""
        return ['can', 'may', 'shall', 'must']

    @staticmethod
    def get_all_modalpas():
        """Lemmas accepted wherever a pattern uses '_all_modalpas_'."""
        return ['could','might','should','would']

    def find_tense(self, patterns):
        """Set ``self.tense`` to the first pattern in ``patterns`` that matches.

        Only patterns with exactly as many entries as ``self.segments`` are
        considered. Returns True when a tense was assigned, False otherwise
        (``self.tense`` is then left unchanged).
        """
        segment_count = len(self.segments)
        same_length = (
            (name, pattern)
            for name, pattern in patterns.items()
            if len(pattern) == segment_count
        )

        for name, pattern in same_length:
            for position, (segment, allowed) in enumerate(zip(self.segments, pattern)):
                if segment is None:
                    # A missing segment is not checked against the pattern.
                    continue

                # The wildcard '_' only counts at the final position, where a
                # tag match completes the pattern.
                at_last_position = position == segment_count - 1
                if at_last_position and '_' in allowed and segment.tag in allowed['_']:
                    self.tense = name
                    return True

                lemma = segment.lemma
                if lemma in allowed:
                    position_ok = segment.tag in allowed[lemma]
                elif lemma in self.get_all_modalpre():
                    position_ok = '_all_modalpre_' in allowed
                elif lemma in self.get_all_modalpas():
                    position_ok = '_all_modalpas_' in allowed
                elif lemma == 'have' and segment_count == 1:
                    # A lone form of "have" whose tag matched nothing is
                    # labelled with the current pattern's name.
                    self.tense = name
                    return True
                else:
                    position_ok = False

                if not position_ok:
                    # This pattern is ruled out; move on to the next one.
                    break
        return False


In [5]:
class SentenceTenseModel:
    """Splits a sentence into contiguous verb groups and classifies each one."""

    def __init__(self, tokens):
        # tokens: iterable of token objects exposing pos_, text, lemma_ and
        # tag_ (parse() reads exactly these attributes).
        self._tokens = tokens

    def parse(self):
        """Return one TenseModel per maximal run of AUX/VERB tokens, in order.

        A run ends at the first token whose ``pos_`` is neither 'AUX' nor
        'VERB', or at the end of the token sequence.
        """
        tense_list = []
        current_group = []

        for token in self._tokens:
            if token.pos_ in ('AUX', 'VERB'):
                current_group.append(
                    TenseSegmentModel(token.text, token.lemma_, token.tag_)
                )
            elif current_group:
                tense_list.append(TenseModel(current_group))
                current_group = []

        # Flush a verb group that runs up to the last token; without this a
        # sentence ending in a verb (no trailing punctuation) loses its final
        # group.
        if current_group:
            tense_list.append(TenseModel(current_group))

        return tense_list

In [6]:
class TenseParser:
    """Runs a spaCy pipeline over text and reports the tense of each verb group."""

    def __init__(self):
        # Loads the 'en_core_web_sm' pipeline on every construction.
        self.nlp = spacy.load('en_core_web_sm')

    def find_tenses(self, text):
        """Return, per sentence of ``text``, the list of parsed tense models.

        Sentences consisting of at most one token are skipped.
        """
        return [
            SentenceTenseModel(sentence).parse()
            for sentence in self.nlp(text).sents
            if len(sentence) > 1
        ]

    def find_tense_simple_form(self, text):
        """Return, per sentence, the list of tense names (``None`` if unmatched)."""
        names_per_sentence = []
        for sentence_models in self.find_tenses(text):
            names_per_sentence.append([model.tense for model in sentence_models])
        return names_per_sentence

    def find_tense_simple_form_str(self, text):
        """Return the tense names as one string.

        Names within a sentence are concatenated without a separator
        (unmatched groups appear as 'None'); sentences are joined with '. '.
        """
        sentence_strings = (
            ''.join(map(str, names))
            for names in self.find_tense_simple_form(text)
        )
        return '. '.join(sentence_strings)

### Annotation

In [7]:
# Annotate every line of the prediction file with its detected tense labels,
# writing exactly one output line per input line.
prediction_path = 'F:\\Y4\\NLP\\triage\\europarl\\result\\model\\prediction.txt'
tensep_path = 'F:\\Y4\\NLP\\triage\\europarl\\result\\tensep.txt'

# Build the parser once: its constructor loads the spaCy pipeline, so
# creating it inside the loop would reload the model for every line.
tense_parser = TenseParser()

with open(prediction_path, 'r', encoding='utf-8') as prediction:
    lines = prediction.readlines()

# The context manager closes the output file even if annotation fails midway.
with open(tensep_path, 'w', encoding='utf-8') as tensep:
    for line in lines:
        print(tense_parser.find_tense_simple_form_str(line), file=tensep)

In [8]:
# Collapse the fine-grained tense labels in tensep.txt into coarser classes,
# rewriting the file in place.
tensep_path = 'F:\\Y4\\NLP\\triage\\europarl\\result\\tensep.txt'

# Applied to every line in this exact order. All patterns are literal text,
# so plain str.replace gives the same result as the regex substitution.
LABEL_REPLACEMENTS = (
    ('PresentContinuous', 'Present'),
    ('PastContinuous', 'Past'),
    ('FutureContinuous', 'Future'),
    ('PrePerfectContinuous', 'PrePerfect'),
    ('PasPerfectContinuous', 'PasPerfect'),
    ('FutPerfectContinuous', 'FutPerfect'),
    ('Modal1', 'Modal'),
    ('Modal2', 'Modal'),
    # Two passes on purpose: together they collapse a run of up to four
    # consecutive 'Present' labels into a single one.
    ('PresentPresent', 'Present'),
    ('PresentPresent', 'Present'),
    # Verb groups that matched no pattern were written as 'None'; drop them.
    ('None', ''),
)


def _simplify_labels(line):
    """Return ``line`` with every replacement in LABEL_REPLACEMENTS applied in order."""
    for old_label, new_label in LABEL_REPLACEMENTS:
        line = line.replace(old_label, new_label)
    return line


with open(tensep_path, 'r', encoding='utf-8') as source:
    label_lines = source.readlines()

# Context managers guarantee the handle is closed even if a write fails.
with open(tensep_path, 'w', encoding='utf-8') as target:
    for label_line in label_lines:
        target.write(_simplify_labels(label_line))

### Comparison

In [15]:
# Collect the 1-based numbers of the lines whose predicted tense labels differ
# from the reference labels.
L = []
with open('F:\\Y4\\NLP\\triage\\europarl\\result\\tenseref.txt') as f1, open('F:\\Y4\\NLP\\triage\\europarl\\result\\tensep.txt') as f2:
    # zip_longest rather than zip: if one file is shorter, the lines it lacks
    # come back as None and are reported as mismatches instead of being
    # silently dropped from the comparison.
    for lineno, (line1, line2) in enumerate(zip_longest(f1, f2), 1):
        if line1 is None or line2 is None:
            L.append(lineno)
        # Ignore the line terminator so a missing newline at the end of one
        # file does not count as a difference.
        elif line1.rstrip('\n') != line2.rstrip('\n'):
            L.append(lineno)
print(L)

[4, 6, 9, 14, 17, 22, 24, 32, 33, 36, 37, 38, 40, 41, 43, 45, 49, 50, 51, 53, 56, 60, 70, 77, 85, 88, 90, 91, 94, 103, 108, 109, 115, 117, 119, 121, 122, 123, 124, 125, 126, 128, 129, 131, 132, 134, 137, 139, 143, 144, 146, 147, 149, 150, 153, 160, 163, 164, 166, 168, 169, 171, 173, 174, 176, 177, 182, 183, 187, 189, 196, 198, 200, 202, 203, 207, 211, 212, 213, 217, 218, 219, 222, 225, 228, 233, 239, 241, 242, 244, 245, 252, 253, 255, 258, 259, 266, 268, 279, 282, 283, 284, 288, 289, 290, 291, 293, 295, 297, 299, 302, 307, 308, 309, 310, 311, 313, 315, 316, 319, 322, 323, 324, 328, 329, 330, 331, 332, 333, 335, 336, 338, 339, 342, 343, 344, 346, 347, 348, 349, 352, 353, 355, 357, 360, 361, 362, 363, 364, 368, 369, 370, 371, 372, 374, 375, 377, 378, 379, 382, 383, 386, 388, 391, 393, 395, 396, 398, 400, 401, 404, 405, 410, 412, 413, 414, 415, 417, 418, 419, 420, 421, 422, 424, 426, 427, 428, 429, 430, 431, 432, 433, 435, 437, 438, 439, 440, 442, 443, 444, 447, 448, 455, 457, 459, 462, 4

In [16]:
# Fraction of lines whose tense labels agree with the reference.
# NOTE(review): 552 is presumably the number of lines in the compared files;
# confirm it still matches the data before trusting the figure.
TOTAL_LINES = 552
l = len(L)  # number of mismatching lines collected by the comparison cell
accuracy = 1 - l / TOTAL_LINES
print(accuracy)

0.552536231884058
