# **Data augmentation: combined approach**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pickle
import numpy as np

from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

from nltk.corpus import wordnet

from lxml import html
import requests

# Install word tokenizer:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Install French POS-tagger:
!wget 'https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip'
!unzip stanford-tagger-4.2.0.zip

# Install Huggingface libraries:
!pip install transformers

from transformers import pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


--2023-05-23 21:59:50--  https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip [following]
--2023-05-23 21:59:50--  https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78034596 (74M) [application/zip]
Saving to: ‘stanford-tagger-4.2.0.zip’


2023-05-23 22:00:01 (7.14 MB/s) - ‘stanford-tagger-4.2.0.zip’ saved [78034596/78034596]

Archive:  stanford-tagger-4.2.0.zip
   creating: stanford-postagger-full-2020-11-17/
  inflating: stanford-postagger-full-2020-1

## **Load the data**

Classification task:

In [3]:
task = '3'

Load train set:

In [4]:
f_in = open("drive/MyDrive/train_set_"+task+"_orig.pkl","rb")

data_train = pickle.load(f_in)
 
f_in.close()

Extract positive examples:

In [5]:
data_train_positive = [data_train[i][0] for i in range(len(data_train)) if data_train[i][2]]

In [6]:
data_train_positive[0:5]

['Article 1 : Occupations ou utilisations du sol interdites\n \n1) Dans l’ensemble de la zone sont interdits :\n \nLes terrains de camping ou de caravanage permanents visés à l’article L.443-1 et L.444-1 du \ncode de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol interdites\n \n1) Dans l’ensemble de la zone sont interdits :\n \nLes habitations légères de loisirs.',
 'Article 1 : Occupations ou utilisations du sol interdites\n \n1) Dans l’ensemble de la zone sont interdits :\n \nLes constructions destinées à l’habitation ne dépendant pas d’une exploitation agricole autres \nque celles visées à l’article 2 paragraphe 1).',
 'Article 1 : Occupations ou utilisations du sol interdites\n \n1) Dans l’ensemble de la zone sont interdits :\n \nLes constructions destinées à l’hébergement hôtelier autres que celles visées à l’article 2 \nparagraphe 1).',
 'Article 1 : Occupations ou utilisations du sol interdites\n \n1) Dans l’ensemble de la zone sont interdits :\n \nLes construct

## **Perform augmentation**

### **Combined approach (adj+adv, DES): replace all adjectives and adverbs in segments which contain concepts from expert nomenclature enriched by DES**

Experiment (augmentation) name:

In [7]:
experiment = '7'

How many times repeat augmentation:

In [8]:
k = 1

Load nomenclature concepts from expert nomenclature:

In [9]:
f = open("drive/MyDrive/nomenclature_expert", "r")

hierarchy = {} # nomenclature hierarchy
nomenclature_expert = [] # list of all concepts

for line in f:
    try:
        textLine = line.strip()
        if textLine != '':
            data = textLine.split(':')
            parent_node = data[0].strip()
            if parent_node.lower() not in nomenclature_expert and parent_node != 'objet':
                nomenclature_expert.append(parent_node.lower())
            child_nodes = [name.strip() for name in data[1].split(',')]
            for name in child_nodes:
                if name.lower() not in nomenclature_expert:
                    nomenclature_expert.append(name.lower())
            hierarchy[parent_node] = child_nodes
    except ValueError:
        print('Invalid input:',line)

f.close()

Control sum:

In [10]:
len(nomenclature_expert)

207

Enrich expert nomenclature by using synonyms from the DES dictionary:


In [11]:
def getSynonymsDes(word,k):
    synonyms = []
    
    request_str = 'https://crisco4.unicaen.fr/des/synonymes/'+word.lower().replace(" ","+")
    page = requests.get(request_str)
    tree = html.fromstring(page.content)
    rows = tree.xpath('//table/tr')
    
    for row in rows:
        text = row.xpath('./td/a/text()')[0].strip()
        synonyms.append(text)
        
    return synonyms[0:k]

In [12]:
s = 5 # top synonyms for each concept

new_nomenclature = []

for name in nomenclature_expert:
    list_synonyms = getSynonymsDes(name,s)
    if list_synonyms != []:
        for concept in [item.replace('_',' ') for item in list_synonyms]:
            if concept.lower() not in nomenclature_expert and concept.lower() not in new_nomenclature:
                new_nomenclature.append(concept.lower())

nomenclature_extended = nomenclature_expert + new_nomenclature

Control sum:

In [13]:
len(nomenclature_extended)

487

Define POS-tagger:

In [14]:
st = StanfordPOSTagger('/content/stanford-postagger-full-2020-11-17/models/french-ud.tagger',
                       '/content/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar',
                       encoding='utf-8')

Mask all adjectives and adverbs in segments which contain concepts from enriched expert nomenclature:

In [15]:
def getMaskedSegmentsADJADV(input_data,nomenclature_concepts):
  classified_list = []
  masked_list = []
  for i in range(len(input_data)):
    phrase = input_data[i]
    flag = False
    for concept in nomenclature_concepts:
      if (concept+" " in phrase) or (concept+"," in phrase) or (concept+"." in phrase):
        flag = True
    if flag:
      tokenized_text = word_tokenize(phrase, language='french')
      classified_text = st.tag(tokenized_text)
      masked_text = ""
      for word,tag in classified_text:
        if tag != 'ADJ' and tag != 'ADV':
          if word == "’":
            masked_text = masked_text[:-1] + word
          elif word == "," or word == "." or word == ")":
            masked_text = masked_text[:-1] + word + " "
          elif word == "(":
            masked_text += word
          else:
            masked_text += word + " "
        else:
          masked_text += '<mask> '
      classified_list.append(classified_text)
      masked_list.append(masked_text.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return classified_list, masked_list

Perform masking:

In [16]:
classified_segments, masked_segments = getMaskedSegmentsADJADV(data_train_positive,nomenclature_extended)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment
Process 80 segment
Process 90 segment
Process 100 segment
Process 110 segment


Masked segments:

In [17]:
masked_segments[0:5]

['Article 1 : Occupations ou utilisations du sol interdites 1) Dans l’<mask> de la zone sont interdits : Les terrains de camping ou de caravanage <mask> visés à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol interdites 1) Dans l’<mask> de la zone sont interdits : Les habitations <mask> de loisirs.',
 'Article 1 : Occupations ou utilisations du sol interdites 1) Dans l’<mask> de la zone sont interdits : Les constructions destinées à l’habitation <mask> dépendant <mask> d’une exploitation <mask> <mask> que celles visées à l’article 2 paragraphe 1).',
 'Article 1 : Occupations ou utilisations du sol interdites 1) Dans l’<mask> de la zone sont interdits : Les constructions destinées à l’hébergement <mask> <mask> que celles visées à l’article 2 paragraphe 1).',
 'Article 1 : Occupations ou utilisations du sol interdites 1) Dans l’<mask> de la zone sont interdits : Les constructions destinées <mask> bureaux, au commerce et activités de

Predict masked words in the resulting sentences:

In [18]:
model_name = "camembert-base" 

camembert_unmasker = pipeline("fill-mask", model=model_name, tokenizer=model_name) # define the model

def generateCombinations(masked_text):
  # predict masked words:
  predicted_words = camembert_unmasker(masked_text)

  # extract predicted words:
  generated_words = []
  if masked_text.count("<mask>") == 1:
    generated_candidates = []
    for word in predicted_words[:k]:
      generated_candidates.append(word['token_str'])
    generated_words.append(generated_candidates)
  else:
    for word_candidates in predicted_words:
      generated_candidates = []
      for word in word_candidates[:k]:
        generated_candidates.append(word['token_str'])
      generated_words.append(generated_candidates)

  return generated_words

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Define functions for new phrase generation:

In [19]:
def maskedPhrase2NewTextADJADV(generated_combinations,combination_id,classified_phrase):
  predicted_text = ""
  word_id = 0
  for word,tag in classified_phrase:
    if tag != 'ADJ' and tag != 'ADV':
      if word == "’" or word == ".":
        predicted_text = predicted_text[:-1] + word
      else:
        predicted_text += word + " "
    else:
      predicted_text += generated_combinations[word_id][combination_id] + " "
      word_id += 1

  return predicted_text

def generateNewSegmentsADJADV(input_data,classified_list,masked_list,k):
  # k - how many times to repeat genration with each phrase
  generated_segments = []
  for i in range(len(masked_list)):
    nb_combinations = masked_list[i].count("<mask>")
    if nb_combinations > 0:
      generated_combinations = generateCombinations(masked_list[i])
      for j in range(k):
        new_phrase = maskedPhrase2NewTextADJADV(generated_combinations,j,classified_list[i])
        if new_phrase.strip() != input_data[i].replace("\n","").strip():
          generated_segments.append(new_phrase.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return generated_segments

Perform generation:

In [20]:
new_segments = generateNewSegmentsADJADV(data_train_positive,classified_segments,masked_segments,1)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment


Generated segments:

In [21]:
new_segments[0:5]

['Article 1 : Occupations ou utilisations du sol interdites 1 ) Dans l’ensemble de la zone sont interdits : Les terrains de camping ou de caravanage sont visés à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol interdites 1 ) Dans l’ensemble de la zone sont interdits : Les habitations principales de loisirs.',
 'Article 1 : Occupations ou utilisations du sol interdites 1 ) Dans l’ensemble de la zone sont interdits : Les constructions destinées à l’habitation ne dépendant pas d’une exploitation ( autre que celles visées à l’article 2 paragraphe 1 ).',
 'Article 1 : Occupations ou utilisations du sol interdites 1 ) Dans l’ensemble de la zone sont interdits : Les constructions destinées à l’hébergement ( autres que celles visées à l’article 2 paragraphe 1 ).',
 'Article 1 : Occupations ou utilisations du sol interdites 1 ) Dans l’ensemble de la zone sont interdits : Les constructions destinées aux bureaux , au commerce et activités de

Create new segments:

In [22]:
data_new = [(i,-1,True) for i in new_segments]

data_augmented = data_train + data_new

Some stats:

In [23]:
len(data_augmented)

540

In [24]:
print("Positive examples:", len([i for i in range(len(data_augmented)) if data_augmented[i][2]]))
print("Negative examples:", len([i for i in range(len(data_augmented)) if not data_augmented[i][2]]))

Positive examples: 188
Negative examples: 352


Save results:

In [25]:
f_out = open("drive/MyDrive/train_set_"+task+"_augm-"+experiment+".pkl","wb")

pickle.dump(data_augmented,f_out)

f_out.close()

### **Combined approach (nouns, DES): replace all nouns in segments which contain concepts from expert nomenclature enriched by DES**

Experiment (augmentation) name:

In [26]:
experiment = '8'

How many times repeat augmentation:

In [27]:
k = 2

Mask all nouns in segments which contain concepts from enriched expert nomenclature:

In [28]:
def getMaskedSegmentsNOUN(input_data,nomenclature_concepts):
  classified_list = []
  masked_list = []
  for i in range(len(input_data)):
    phrase = input_data[i]
    flag = False
    for concept in nomenclature_concepts:
      if (concept+" " in phrase) or (concept+"," in phrase) or (concept+"." in phrase):
        flag = True
    if flag:
      tokenized_text = word_tokenize(phrase, language='french')
      classified_text = st.tag(tokenized_text)
      masked_text = ""
      for word,tag in classified_text:
        if tag != 'NOUN':
          if word == "’":
            masked_text = masked_text[:-1] + word
          elif word == "," or word == "." or word == ")":
            masked_text = masked_text[:-1] + word + " "
          elif word == "(":
            masked_text += word
          else:
            masked_text += word + " "
        else:
          masked_text += '<mask> '
      classified_list.append(classified_text)
      masked_list.append(masked_text.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return classified_list, masked_list

Perform masking:

In [29]:
classified_segments, masked_segments = getMaskedSegmentsNOUN(data_train_positive,nomenclature_extended)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment
Process 80 segment
Process 90 segment
Process 100 segment
Process 110 segment


Masked segments:

In [30]:
masked_segments[0:5]

['<mask> 1 : <mask> ou <mask> du <mask> interdites 1) Dans l <mask> ensemble de la <mask> sont interdits : Les <mask> de <mask> ou de <mask> permanents visés à l’<mask> L.443-1 et L.444-1 du <mask> de l’<mask>.',
 '<mask> 1 : <mask> ou <mask> du <mask> interdites 1) Dans l <mask> ensemble de la <mask> sont interdits : Les <mask> légères de <mask>.',
 '<mask> 1 : <mask> ou <mask> du <mask> interdites 1) Dans l <mask> ensemble de la <mask> sont interdits : Les <mask> destinées à l’<mask> ne dépendant pas d’une <mask> agricole autres que celles <mask> à l’<mask> 2 <mask> 1).',
 '<mask> 1 : <mask> ou <mask> du <mask> interdites 1) Dans l <mask> ensemble de la <mask> sont interdits : Les <mask> destinées à l’<mask> hôtelier autres que celles <mask> à l’<mask> 2 <mask> 1).',
 '<mask> 1 : <mask> ou <mask> du <mask> interdites 1) Dans l <mask> ensemble de la <mask> sont interdits : Les <mask> destinées aux <mask>, au <mask> et <mask> de <mask>, à l’<mask>, à l’<mask> autres que celles <mask> à

Define functions for new phrase generation:

In [31]:
def generateCombinations(masked_text):
  # predict masked words:
  predicted_words = camembert_unmasker(masked_text)

  # extract predicted words:
  generated_words = []
  if masked_text.count("<mask>") == 1:
    generated_candidates = []
    for word in predicted_words[:k]:
      generated_candidates.append(word['token_str'])
    generated_words.append(generated_candidates)
  else:
    for word_candidates in predicted_words:
      generated_candidates = []
      for word in word_candidates[:k]:
        generated_candidates.append(word['token_str'])
      generated_words.append(generated_candidates)

  return generated_words

def maskedPhrase2NewTextNOUN(generated_combinations,combination_id,classified_phrase):
  predicted_text = ""
  word_id = 0
  for word,tag in classified_phrase:
    if tag != 'NOUN':
      if word == "’" or word == ".":
        predicted_text = predicted_text[:-1] + word
      else:
        predicted_text += word + " "
    else:
      predicted_text += generated_combinations[word_id][combination_id] + " "
      word_id += 1

  return predicted_text

def generateNewSegmentsNOUN(input_data,classified_list,masked_list,k):
  # k - how many times to repeat genration with each phrase
  generated_segments = []
  for i in range(len(masked_list)):
    nb_combinations = masked_list[i].count("<mask>")
    if nb_combinations > 0:
      generated_combinations = generateCombinations(masked_list[i])
      for j in range(k):
        new_phrase = maskedPhrase2NewTextNOUN(generated_combinations,j,classified_list[i])
        if new_phrase.strip() != input_data[i].replace("\n","").strip():
          generated_segments.append(new_phrase.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return generated_segments

Perform generation:

In [32]:
new_segments = generateNewSegmentsNOUN(data_train_positive,classified_segments,masked_segments,k)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment


Generated segments:

In [33]:
new_segments[0:5]

['1 1 : les ou de du des interdites 1 ) Dans l l ensemble de la de sont interdits : Les de de le ou de les permanents visés à l’article L.443-1 et L.444-1 du de de l’de.',
 'de 1 : le ou du du site interdites 1 ) Dans l ‘ ensemble de la qui sont interdits : Les et de de ou de de permanents visés à l’Article L.443-1 et L.444-1 du et de l’».',
 'Page 1 : Les ou et du sexe interdites 1 ) Dans l L ensemble de la qui sont interdits : Les armes légères de la.',
 'Partie 1 : les ou parties du corps interdites 1 ) Dans l l ensemble de la société sont interdits : Les chaussures légères de et.',
 ': 1 : les ou de du sont interdites 1 ) Dans l l ensemble de la qui sont interdits : Les activités destinées à l’et ne dépendant pas d’une exploitation agricole autres que celles de à l’( 2 ( 1 ).']

Create new segments:

In [34]:
data_new = [(i,-1,True) for i in new_segments]

data_augmented = data_train + data_new

Some stats:

In [35]:
len(data_augmented)

628

In [36]:
print("Positive examples:", len([i for i in range(len(data_augmented)) if data_augmented[i][2]]))
print("Negative examples:", len([i for i in range(len(data_augmented)) if not data_augmented[i][2]]))

Positive examples: 276
Negative examples: 352


Save results:

In [37]:
f_out = open("drive/MyDrive/train_set_"+task+"_augm-"+experiment+".pkl","wb")

pickle.dump(data_augmented,f_out)

f_out.close()

### **Combined approach (verbs, DES): replace all verbs in segments which contain concepts from expert nomenclature enriched by DES**

Experiment (augmentation) name:

In [38]:
experiment = '9'

How many times repeat augmentation:

In [39]:
k = 4

Mask all verbs in segments which contain concepts from enriched expert nomenclature:

In [40]:
def getMaskedSegmentsVERB(input_data,nomenclature_concepts):
  classified_list = []
  masked_list = []
  for i in range(len(input_data)):
    phrase = input_data[i]
    flag = False
    for concept in nomenclature_concepts:
      if (concept+" " in phrase) or (concept+"," in phrase) or (concept+"." in phrase):
        flag = True
    if flag:
      tokenized_text = word_tokenize(phrase, language='french')
      classified_text = st.tag(tokenized_text)
      masked_text = ""
      for word,tag in classified_text:
        if tag != 'VERB':
          if word == "’":
            masked_text = masked_text[:-1] + word
          elif word == "," or word == "." or word == ")":
            masked_text = masked_text[:-1] + word + " "
          elif word == "(":
            masked_text += word
          else:
            masked_text += word + " "
        else:
          masked_text += '<mask> '
      classified_list.append(classified_text)
      masked_list.append(masked_text.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return classified_list, masked_list

Perform masking:

In [41]:
classified_segments, masked_segments = getMaskedSegmentsVERB(data_train_positive,nomenclature_extended)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment
Process 80 segment
Process 90 segment
Process 100 segment
Process 110 segment


Masked segments:

In [42]:
masked_segments[0:5]

['Article 1 : Occupations ou utilisations du sol <mask> 1) Dans l’ensemble de la zone sont <mask> : Les terrains de camping ou de caravanage permanents <mask> à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol <mask> 1) Dans l’ensemble de la zone sont <mask> : Les habitations légères de loisirs.',
 'Article 1 : Occupations ou utilisations du sol <mask> 1) Dans l’ensemble de la zone sont <mask> : Les constructions <mask> à l’habitation ne <mask> pas d’une exploitation agricole autres que celles visées à l’article 2 paragraphe 1).',
 'Article 1 : Occupations ou utilisations du sol <mask> 1) Dans l’ensemble de la zone sont <mask> : Les constructions <mask> à l’hébergement hôtelier autres que celles visées à l’article 2 paragraphe 1).',
 'Article 1 : Occupations ou utilisations du sol <mask> 1) Dans l’ensemble de la zone sont <mask> : Les constructions <mask> aux bureaux, au commerce et activités de service, à l’artisanat, à l’industri

Define functions for new phrase generation:

In [43]:
def generateCombinations(masked_text):
  # predict masked words:
  predicted_words = camembert_unmasker(masked_text)

  # extract predicted words:
  generated_words = []
  if masked_text.count("<mask>") == 1:
    generated_candidates = []
    for word in predicted_words[:k]:
      generated_candidates.append(word['token_str'])
    generated_words.append(generated_candidates)
  else:
    for word_candidates in predicted_words:
      generated_candidates = []
      for word in word_candidates[:k]:
        generated_candidates.append(word['token_str'])
      generated_words.append(generated_candidates)

  return generated_words

def maskedPhrase2NewTextVERB(generated_combinations,combination_id,classified_phrase):
  predicted_text = ""
  word_id = 0
  for word,tag in classified_phrase:
    if tag != 'VERB':
      if word == "’" or word == ".":
        predicted_text = predicted_text[:-1] + word
      else:
        predicted_text += word + " "
    else:
      predicted_text += generated_combinations[word_id][combination_id] + " "
      word_id += 1

  return predicted_text

def generateNewSegmentsVERB(input_data,classified_list,masked_list,k):
  # k - how many times to repeat genration with each phrase
  generated_segments = []
  for i in range(len(masked_list)):
    nb_combinations = masked_list[i].count("<mask>")
    if nb_combinations > 0:
      generated_combinations = generateCombinations(masked_list[i])
      for j in range(k):
        new_phrase = maskedPhrase2NewTextVERB(generated_combinations,j,classified_list[i])
        if new_phrase.strip() != input_data[i].replace("\n","").strip():
          generated_segments.append(new_phrase.strip())
    if i % 10 == 0:
      print("Process",i,"segment")

  return generated_segments

Perform generation:

In [44]:
new_segments = generateNewSegmentsVERB(data_train_positive,classified_segments,masked_segments,k)

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment


Generated segments:

In [45]:
new_segments[0:5]

['Article 1 : Occupations ou utilisations du sol : 1 ) Dans l’ensemble de la zone sont concernés : Les terrains de camping ou de caravanage permanents mentionnés à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol ( 1 ) Dans l’ensemble de la zone sont autorisés : Les terrains de camping ou de caravanage permanents définis à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol – 1 ) Dans l’ensemble de la zone sont situés : Les terrains de camping ou de caravanage permanents visés à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol - 1 ) Dans l’ensemble de la zone sont définis : Les terrains de camping ou de caravanage permanents prévus à l’article L.443-1 et L.444-1 du code de l’urbanisme.',
 'Article 1 : Occupations ou utilisations du sol : 1 ) Dans l’ensemble de la zone sont autorisées : Les habitations légères de loisirs.']

Create new segments:

In [46]:
data_new = [(i,-1,True) for i in new_segments]

data_augmented = data_train + data_new

Some stats:

In [47]:
len(data_augmented)

786

In [48]:
print("Positive examples:", len([i for i in range(len(data_augmented)) if data_augmented[i][2]]))
print("Negative examples:", len([i for i in range(len(data_augmented)) if not data_augmented[i][2]]))

Positive examples: 434
Negative examples: 352


Save results:

In [49]:
f_out = open("drive/MyDrive/train_set_"+task+"_augm-"+experiment+".pkl","wb")

pickle.dump(data_augmented,f_out)

f_out.close()