<a href="https://colab.research.google.com/github/sophiamaria05/IC_MDA/blob/main/Tokenize_xml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries and installing dependencies

In [1]:
import os

# Install the notebook's dependencies at most once per process: the
# TOKENIZE_PIP_ALREADY_RUN environment variable acts as the "already done" flag,
# so re-running this cell in the same kernel skips the (slow) pip calls.
if os.environ.get("TOKENIZE_PIP_ALREADY_RUN") == "1":
  print("\n\nPip commands were already executed!\n\n")
else:
  print("\n\nRunning Pip commands...\n\n")

  import subprocess
  import sys

  # Arguments passed to `<current interpreter> -m ...`, executed in this order.
  # Using sys.executable targets the interpreter that runs this kernel.
  module_commands = (
    ("pip", "install", "-U", "h5py"),
    ("pip", "install", "typing-extensions"),
    ("pip", "install", "wheel"),
    ("pip", "install", "-U", "spacy"),
    ("spacy", "download", "pt_core_news_lg"),
  )
  for module_args in module_commands:
    # check_call raises CalledProcessError on a non-zero exit status, so the
    # flag below is only set when every command succeeded.
    subprocess.check_call([sys.executable, "-m", *module_args])

  os.environ["TOKENIZE_PIP_ALREADY_RUN"] = "1"

# !pip install h5py
# !pip install typing-extensions
# !pip install wheel
# !pip install -U spacy
# !python -m spacy download pt_core_news_lg



Running Pip commands...




In [2]:
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString

In [3]:
import spacy
# Load the pt_core_news_lg pipeline (downloaded by the setup cell) once at module
# level; tokenize_phrase reuses this `nlp` object for every phrase it analyses.
nlp = spacy.load("pt_core_news_lg")

In [4]:
from spacy import displacy

## Defining the functions

In [5]:
def count_nmod(dep):
  """
    Counts how many entries of `dep` are exactly the string "nmod".

    dep: sequence of dependency labels (one per token).
    Returns the number of "nmod" labels as an int (0 for an empty sequence).
  """
  return sum(1 for label in dep if label == "nmod")

In [6]:
def _mark_untokenized(phrase, doc, missing):
  """Reports which required part is missing, flags the element with type="0" and renders the parse."""
  print('\nThe phrase "'+ phrase.text +'"\'s ' + missing + ' was not found')
  phrase.set('type', '0')
  displacy.render(doc, style="dep", jupyter=True)
  return False


def tokenize_phrase(phrase):
  """
    Analyses the text of an XML element with spaCy and appends the extracted
    parts to it as child elements.

    phrase: an ElementTree element whose `.text` is the sentence to analyse.
            The element is modified in place.

    On success the element gains these children (lemmas, title-cased):
      <noun>         first token whose POS tag is NOUN (not necessarily the subject)
      <verb>         first token whose dependency label is ROOT
      <object>       first token whose dependency label is "obj"
      <complement>   first "nmod" token, when at least one exists
      <complement_n> second "nmod" token, only when there are exactly two

    When the noun, verb or object cannot be found (checked in that order), a
    message is printed, the attribute type="0" is set on the element, the
    dependency parse is rendered with displacy, and nothing is appended.
    An element without any text is also flagged with type="0".

    Returns True when the element was tokenized, False otherwise.
  """
  # An element with no text has `.text is None`; passing that to the pipeline
  # would raise and abort the whole run, so flag it and move on instead.
  if phrase.text is None:
    print('\nThe element <' + str(phrase.tag) + '> has no text to tokenize')
    phrase.set('type', '0')
    return False

  doc = nlp(phrase.text)
  text = [token.lemma_ for token in doc]
  pos = [token.pos_ for token in doc]
  dep = [token.dep_ for token in doc]

  # (name shown in the message, tag list to search, tag to look for).
  # The order matters: the first missing part is the one that gets reported.
  required_parts = (
    ("NOUN", pos, "NOUN"),
    ("VERB", dep, "ROOT"),
    ("OBJECT", dep, "obj"),
  )
  positions = []
  for part_name, tags, wanted in required_parts:
    try:
      positions.append(tags.index(wanted))
    except ValueError:  # list.index raises ValueError when the tag is absent
      return _mark_untokenized(phrase, doc, part_name)
  noun_at, verb_at, object_at = positions

  ET.SubElement(phrase, "noun").text = text[noun_at].title()
  ET.SubElement(phrase, "verb").text = text[verb_at].title()
  ET.SubElement(phrase, "object").text = text[object_at].title()

  nmod_amount = count_nmod(dep)
  if nmod_amount >= 1:
    first_nmod_at = dep.index("nmod")
    ET.SubElement(phrase, "complement").text = text[first_nmod_at].title()
    # NOTE(review): with three or more "nmod" tokens no <complement_n> is
    # written; kept as-is because the intended behaviour is unclear - confirm.
    if nmod_amount == 2:
      second_nmod_at = dep.index("nmod", first_nmod_at + 1)
      ET.SubElement(phrase, "complement_n").text = text[second_nmod_at].title()

  return True

In [7]:
def get_xml_files(xml_file_folder):
  """
    Lists the XML files stored directly inside a folder.

    xml_file_folder: path of the folder holding the XML files with the
                     predicted phrases.
    Returns a list with the names (not full paths) of the entries whose name
    ends with '.xml', in the order given by os.listdir.
  """
  xml_names = []
  for entry in os.listdir(xml_file_folder):
    if entry.endswith('.xml'):
      xml_names.append(entry)
  return xml_names

In [8]:
def tokenize(file_path, write_name):
  """
    Tokenizes every phrase of one XML file and saves the annotated result.

    file_path:  path of the XML file to read. It is parsed with an explicit
                UTF-8 parser.
    write_name: path of the XML file to write.

    The document is walked three levels deep (root -> use case -> flow ->
    phrase) and tokenize_phrase is called on each element of the third level,
    which annotates it in place. The resulting tree is written to
    `write_name`, then a confirmation line and a pretty-printed copy of the
    document are printed.
  """
  # Work directly on the parsed tree: serialising it to a string and parsing
  # it again (as well as building an unused minidom document) changed nothing.
  tree = ET.parse(file_path, parser=ET.XMLParser(encoding='utf-8'))
  root = tree.getroot()

  for use_case in root:
    for flow in use_case:
      for phrase in flow:
        tokenize_phrase(phrase)

  tree.write(write_name)
  print(">>", file_path, "<< TOKENIZED AND SAVED ON >>", write_name, "<<")
  print(parseString(ET.tostring(root)).toprettyxml(indent='\t', newl='\n'))

## Running the tokenization

In [9]:
# Input and output folders come from environment variables; an unset or empty
# variable falls back to the default location.
folder = os.environ.get("TO_BE_TOKENIZED_DIR") or "/content/"
save_folder = os.environ.get("TOKENIZED_XML_DIR") or "/content/tokenized/"
os.makedirs(save_folder, exist_ok=True)

files = get_xml_files(folder)
for file_name in files:
  # os.path.join inserts exactly one separator, so the paths are correct
  # whether or not the configured folders end with a slash.
  # splitext removes only the final ".xml", so names such as "a.b.xml" and
  # "a.c.xml" no longer map to the same output file.
  base_name = os.path.splitext(file_name)[0]
  tokenize(
    os.path.join(folder, file_name),
    os.path.join(save_folder, "tokenized_phrases (" + base_name + ").xml"),
  )


The phrase "Sistema informa ..."'s OBJECT was not found



The phrase "Sistema lista cliente"'s OBJECT was not found



The phrase "Usu�rio clica salvar"'s NOUN was not found



The phrase "Sistema cansela o processo"'s OBJECT was not found


>> /content//arquivo (3).xml << TOKENIZED AND SAVED ON >> /content/tokenized/tokenized_phrases (arquivo (3)).xml <<
<?xml version="1.0" ?>
<root>
	<use_case id="0">
		Vizualizar Cliente
		<main_flow>
			<phase id="0">
				Sistema exibe clientes
				<noun>Sistema</noun>
				<verb>Exibir</verb>
				<object>Cliente</object>
			</phase>
			<phase id="1">
				Usu�rio seleciona cliente
				<noun>Cliente</noun>
				<verb>Selecionar</verb>
				<object>Cliente</object>
			</phase>
		</main_flow>
		<alternative_flow id="0">
			Usu�rio n�o existente
			<flow id="0" type="0">Sistema informa ...</flow>
		</alternative_flow>
	</use_case>
	<use_case id="1">
		Cadastrar Cliente
		<main_flow>
			<phase id="0" type="0">Sistema lista cliente</phase>
			<phase id="1">
				Usu�rio seleciona cliente
				<noun>Cliente</noun>
				<verb>Selecionar</verb>
				<object>Cliente</object>
			</phase>
			<phase id="2">
				Usu�rio insere nome, endere�o e telefone
				<noun>Nome</noun>
				<verb>Inserir</verb>
				<ob