In [1]:
import json
import nltk
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [2]:
with open('data/Taxes & Duties annotations.json', 'r', encoding='utf-8') as f:
    lines = json.load(f)

In [3]:
contexts = []
for line in lines:
    annotation = line['annotations']
    for annot in annotation:
        result = annot['result']
        for res in result:
            labels = res['value']['labels']
            if labels[0] == "Who is responsible for paying taxes?":
                context = line['data']['text']
                contexts.append(context)

In [4]:
print(len(contexts))

86


In [10]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    if pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:
def rephrase(text):
    tokens = word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    synsets = []
    for token, pos in tagged:
        token_synsets = wordnet.synsets(token, pos=get_wordnet_pos(pos))
        if token_synsets:
            synsets.append(token_synsets[0])
    lemmas = [random.choice(synset.lemmas()).name() for synset in synsets]
    return ' '.join(lemmas)

In [27]:
def paraphrase(text):
    synonyms_dict = {}
    tokens = word_tokenize(text)
    for token in tokens:
        synsets = wordnet.synsets(token)
        if synsets:
            synonyms = [synset.lemmas()[0].name() for synset in synsets]
            if len(synonyms) > 1:
                 synonym_dict[token] = random.choice(synonyms)
    for token, replacement in synonym_dict.items():
        text = text.replace(token, replacement)
    return text     

In [6]:
print(contexts[3])

Article 17 Defects Liability 17.1 Defects Liability Period (i) The Contractor shall be responsible for all the Defects and deficiencies, except usual wear and tear in the Project Highway or any Section thereof, till the expiry of a period of commencing from the date of Completion Certificate (the “Defects Liability Period”) as specified below: (a) 5 (five) years from the date of completion in case of a road being constructed with flexible pavement; (b) 10 (ten) years from the date of completion in case of road being constructed with rigid pavement; (c) 10 (ten) years from the date of completion in case of road being constructed with flexible pavement using perpetual design; (d) 10 (ten) years from the date of completion in case of all stand-alone structures, e.g. Major Bridges/ and Tunnels; (e) 10 (ten) years from the date of completion for the stretches where new technology/ material has been/ is proposed to be used. (f) 3 (three) years from the date of completion for stretches requir

In [29]:
rephrased_text = rephrase(contexts[2])
print(rephrased_text)

hold neutralise redress arsenic emptor sole exclusive redress labor neglect attain labor pass_completion Oregon day_of_the_month seller volition pay buyer damages X arsenic vendee sole exclusive remedy labor fail attain completion Beaver_State day_of_the_month marketer will pay purchaser redress X


In [4]:
# from googletrans import Translator
# from nltk.tokenize import sent_tokenize
# def backtrans(text, lang):
#     translator = Translator()
#     sent_text = sent_tokenize(text)
#     full_trans_text = ""
#     for sent in sent_text:
#         translated = translator.translate(sent, dest = lang).text
#         back_translated = translator.translate(translated, src=lang, dest='en').text
#         full_trans_text += " "+back_translated
#     return full_trans_text

In [5]:
from os import environ
from google.cloud import translate_v2
from nltk.tokenize import sent_tokenize

environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"Backtrans_Aug_LD/google_translate_key.json"
translate_client = translate_v2.Client()
def backtrans(text, lang):
    sent_text = sent_tokenize(text)
    full_trans_text = ""
    for sent in sent_text:
        translated = translate_client.translate(sent, target_language = lang)
        back_translated = translate_client.translate(translated['translatedText'], target_language ='en')
        full_trans_text += " "+back_translated['translatedText']
    return full_trans_text

In [8]:
query = "What is the amount for retention money?"
back_trans_query = backtrans(query, 'it')
print(back_trans_query)

 What is the withholding amount?


In [6]:
final_context = []
i = 0 
for context in contexts:
    back_trans_text = backtrans(context, 'it')
    final_context.append(back_trans_text)
    i += 1
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


In [7]:
i = 0
for cont in final_context:
    f = open('taxes_duties/aug_data/ITALIAN/taxes_duties_it_backtrans '+str(i)+'.txt','w', encoding='utf-8')
    f.write(cont)
    i += 1

In [9]:
print(final_context[3])

 CLAUSE-3: PERFORMANCE BONUS: 3.1 The deposit shall consist of: (ii) Reservations to be recovered from the contractor&#39;s running account invoice. 3.2 The contractor must pay a performance bond equivalent to 2% of the contract amount within 30 days from the date of successful bidding, including additional deposits for prepaid bidding or bidding for undervalued items. Later in the term of the contract, it must be executed in one of the following formats: or (b) a bank guarantee from a nationalized/scheduled bank of India acceptable to the employer in the prescribed form. 3.3 Retention shall be deducted from the contractor&#39;s temporary invoice as a deposit of 3% (3%) of the total amount of each invoice for work performed (including price fluctuations). 3.4 If the Contractor expressly requests in writing, the Contractor may convert the reserve amount deducted from the provisional bill into a bank guarantee in the prescribed form. 3.5 Any compensation or other sums payable by the Cont

In [41]:
# from gingerit.gingerit import GingerIt
# corrected_contexts = []
# i = 0
# for context in final_context:
#     corrected = ""
#     cont = sent_tokenize(context)
#     for c in cont:
#         if len(c.split()) < 100:
#             text = GingerIt().parse(c)
#             corrected += " " + text['result']
#         else:
#             continue
#     corrected_contexts.append(corrected)
#     i += 1
#     print(i)
print(final_context[3])
print(corrected_contexts[3])

 Article 13 Delay Damages and Bonuses 13.1 Delay Damages. If it occurs after the date of substantial completion on which substantial completion was guaranteed, Contractor shall pay the amount set forth in Exhibit T for each day or portion of the delay until substantial completion occurs. shall be paid to the owner per day (“Late Damages”). 13.2 LNG Production Bonus and First Cargo Bonus A. LNG Production Bonus 1. If substantial completion occurs within *** (***) days from the date of assurance of substantial completion, the owner shall pay a bonus amount to the contractor. will do. (i) *** (US$** *) to one of our customers before the LNG production bonus date. The maximum LNG production bonus payable to a contractor pursuant to this Section 13.2A shall be *** US Dollar (US$***). 2. “LNG Production Bonus Date” is *** (***) days after NTP issuance. 3. For the avoidance of doubt, Contractor shall not be entitled to LNG Production Bonuses on LNG that is boiled off or vaporized and sold to 