<a href="https://colab.research.google.com/github/sophiamaria05/IC_MDA/blob/main/predicting_type_with_trained_decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Preparing

##Importing libraries

In [None]:
#reading xml files
import xml.etree.cElementTree as ET
from xml.dom.minidom import parseString
#deal with trees
from sklearn import tree
import joblib
#deal with tables
import pandas as pd
import numpy as np
#importing files
import os

##Defining methods

In [None]:
#@title ***get_xml_files(xml_file_folder):*** Getting the xml files (of the phrases)
def get_xml_files(xml_file_folder):
  """Return the names of the ``.xml`` entries found directly in a folder.

  Args:
    xml_file_folder: Path of the directory to scan (not recursive).

  Returns:
    Alphabetically sorted list of entry names (not full paths) that end
    with ``.xml``.

  Raises:
    FileNotFoundError: If the folder holds no entry ending with ``.xml``.
  """
  # Scan the directory a single time so the emptiness check and the returned
  # list cannot disagree; sorting makes the processing order reproducible.
  xml_files = sorted(file for file in os.listdir(xml_file_folder) if file.endswith('.xml'))
  if not xml_files:
    raise FileNotFoundError("No xml file was found! Please upload the file with the tokenized phrases.")
  return xml_files

In [None]:
#@title ***get_tokens(xml_file):*** Gets the tokens from the xml
def get_tokens(xml_file):
  """Parse an XML file and return its tree together with its root element.

  Args:
    xml_file: Path (or file object) handed to ``ET.parse``.

  Returns:
    A ``(tree, root)`` tuple, where ``root`` is ``tree.getroot()``.
  """
  # NOTE(review): the file imports ET from xml.etree.cElementTree, which was
  # removed in Python 3.9 -- confirm the runtime this notebook targets.
  parsed = ET.parse(xml_file)
  return parsed, parsed.getroot()

In [None]:
#@title ***get_table(tree_xml, root_xml, reserved_words):*** Gets table from xml
def get_table(tree_xml, root_xml, reserved_words):
  """Build a table with one row per untyped phrase that has noun, verb and object.

  The XML is walked as root -> use case -> flow -> phrase. Phrases that
  already have a ``type`` attribute are skipped. Phrases missing a ``noun``,
  ``verb`` or ``object`` child are skipped too, and, as a side effect, get
  ``type="0"`` set on the XML element.

  Args:
    tree_xml: Unused; kept so existing callers keep working.
    root_xml: Root element whose grandchildren's children are the phrases.
    reserved_words: Container tested with ``verb.text in reserved_words``.

  Returns:
    A ``pandas.DataFrame`` whose columns are, in order: ``phrase``,
    ``id_use_case``, ``flow``, ``id_phrase``, ``noun``, ``verb``, ``object``,
    ``complement``, ``complementN``, ``noun_is_system``, ``reserved_word``,
    ``noun_check``, ``verb_check``, ``object_check``, ``complement_check``,
    ``complementN_check``. The last seven columns are 0/1 flags.
  """
  # Column order matters: predict_types() slices the last seven columns.
  columns = ['phrase', 'id_use_case', 'flow', 'id_phrase', 'noun', 'verb', 'object', 'complement', 'complementN',
             'noun_is_system', 'reserved_word', 'noun_check', 'verb_check', 'object_check', 'complement_check', 'complementN_check']
  phrases = {column: [] for column in columns}

  for use_case in root_xml:
    for flow in use_case:
      for phrase in flow:
        # Already classified: leave the element untouched.
        if phrase.get('type') is not None:
          continue

        # Look each mandatory child up once and reuse the result below.
        noun = phrase.find("noun")
        verb = phrase.find("verb")
        obj = phrase.find("object")
        if noun is None or verb is None or obj is None:
          # Incomplete phrases are tagged '0' directly and never reach the table.
          phrase.set('type', '0')
          continue

        # Phrase text and the ids needed to find the phrase again in the XML.
        phrases['phrase'].append(phrase.text if phrase.text is not None else np.nan)
        phrases['id_use_case'].append(use_case.get("id"))
        phrases['flow'].append(flow.tag)
        phrases['id_phrase'].append(phrase.get("id"))

        # Mandatory tokens: always present here, so their *_check flag is 1.
        phrases['noun'].append(noun.text)
        phrases['noun_check'].append(1)
        phrases['noun_is_system'].append(1 if noun.text == "Sistema" else 0)

        phrases['verb'].append(verb.text)
        phrases['verb_check'].append(1)
        phrases['reserved_word'].append(1 if verb.text in reserved_words else 0)

        phrases['object'].append(obj.text)
        phrases['object_check'].append(1)

        # Optional tokens: record presence and fall back to NaN when absent.
        for optional_tag in ('complement', 'complementN'):
          element = phrase.find(optional_tag)
          if element is not None:
            phrases[optional_tag + '_check'].append(1)
            phrases[optional_tag].append(element.text)
          else:
            phrases[optional_tag + '_check'].append(0)
            phrases[optional_tag].append(np.nan)

  return pd.DataFrame.from_dict(phrases)

In [None]:
#@title ***get_trained_tree(tree_file_folder):*** Gets trained decision tree from the joblib file
def get_trained_tree(tree_file_folder):
  """Load the trained model stored as a ``.joblib`` file in a folder.

  Args:
    tree_file_folder: Directory that holds the ``.joblib`` file.

  Returns:
    Whatever object ``joblib.load`` returns for the selected file.

  Raises:
    FileNotFoundError: If the folder holds no entry ending with ``.joblib``.
  """
  # Sort so that, when several models are present, the same one (first in
  # alphabetical order) is always picked instead of an arbitrary one.
  tree_files = sorted(file for file in os.listdir(tree_file_folder) if file.endswith('.joblib'))
  if not tree_files:
    raise FileNotFoundError("No decision tree model was found! Please upload the joblib file.")
  # SECURITY: joblib.load unpickles the file, which can execute arbitrary
  # code -- only point this at model files from a trusted source.
  return joblib.load(os.path.join(tree_file_folder, tree_files[0]))

In [None]:
#@title ***int_to_tag(tag_int):*** Converts the ***type*** back to tag
def int_to_tag(tag_int):
  """Convert an integer type code back to its dotted tag string.

  The tag is the binary representation of ``tag_int`` written from the least
  significant bit to the most significant one, with the digits joined by
  dots (for example ``6`` -> ``"0.1.1"``). ``0`` maps to ``"0"``.

  Args:
    tag_int: Non-negative ``int`` code.

  Returns:
    The tag string, or ``""`` when ``tag_int`` is not an ``int`` (booleans
    included) or is negative.
  """
  # bool is a subclass of int; keep rejecting it as a non-integer input.
  if isinstance(tag_int, bool) or not isinstance(tag_int, int):
    return ""
  # Negative codes have no tag; halving them never reaches 1, so the
  # original loop would not terminate.
  if tag_int < 0:
    return ""
  # bin() yields most-significant-bit first; the tag wants the reverse.
  # This also covers 0 ("0") and 1 ("1").
  return ".".join(reversed(bin(tag_int)[2:]))

In [None]:
#@title ***predict_types(dtc, df):*** Predicts the type of the phrases using the trained tree
def predict_types(dtc, df):
  """Run the model on the last seven columns of the phrases table.

  Args:
    dtc: Object exposing a ``predict`` method.
    df: Table whose last seven columns are passed to ``dtc.predict``.

  Returns:
    Whatever ``dtc.predict`` returns for those columns.
  """
  feature_columns = df.iloc[:, -7:]
  return dtc.predict(feature_columns)

In [None]:
#@title ***concat_types_to_df(df, types):*** Adds the predicted types to the data frame
def concat_types_to_df(df, types):
  """Append the predicted types, converted to tag strings, as a ``types`` column.

  Args:
    df: Phrases table.
    types: Iterable of predicted codes; each is passed through ``int`` and
      then ``int_to_tag``.

  Returns:
    A new DataFrame: ``df`` with a ``types`` column concatenated on the right.
  """
  tag_column = pd.DataFrame([int_to_tag(int(raw_type)) for raw_type in types], columns=['types'])
  return pd.concat([df, tag_column], axis=1)

In [None]:
#@title ***name_file(save_folder, xml_file):*** Defines the name of the written file
def name_file(save_folder, xml_file):
  """Build the output path for the predicted version of an input XML file.

  A trailing ``.xml`` is dropped from ``xml_file``; if what remains has the
  form ``tokenized_phrases (<name>)``, only ``<name>`` is kept. The result is
  ``predicted_phrases_types (<name>).xml`` inside ``save_folder``.

  Args:
    save_folder: Destination directory, with or without a trailing separator.
    xml_file: Name of the input XML file.

  Returns:
    The output file path as a string.
  """
  prefix = "tokenized_phrases ("
  if xml_file.endswith('.xml'):
    xml_file = xml_file[:-4]
  if xml_file.startswith(prefix) and xml_file.endswith(")"):
    xml_file = xml_file[len(prefix):-1]
  # os.path.join inserts the separator only when save_folder lacks one, so a
  # folder given without a trailing slash no longer fuses with the file name.
  return os.path.join(save_folder, 'predicted_phrases_types (' + xml_file + ').xml')


#EXAMPLES:
# save_folder = "/content/predicted_files/"
# xml_file = "tokenized_phrases (arquivo (3)).xml"
# name_file(save_folder, xml_file) #/content/predicted_files/predicted_phrases_types (arquivo (3)).xml

# save_folder = "/content/predicted_files/"
# xml_file = "arquivo (3).xml"
# name_file(save_folder, xml_file) #/content/predicted_files/predicted_phrases_types (arquivo (3)).xml

##Main method

In [None]:
#@title ***predict(tree_file_folder, reserved_words, tokenized_folder, save_folder):***
def predict(tree_file_folder, reserved_words, tokenized_folder, save_folder):
  """Predict the type of every untyped phrase in each XML file of a folder.

  For each ``.xml`` file in ``tokenized_folder`` the phrases are tabulated,
  classified with the trained model, written back into the XML as ``type``
  attributes, and the updated XML is saved under the name given by
  ``name_file(save_folder, xml_file)``.

  Args:
    tree_file_folder: Folder holding the ``.joblib`` model.
    reserved_words: Reserved verbs, forwarded to ``get_table``.
    tokenized_folder: Folder holding the input XML files.
    save_folder: Folder where the predicted XML files are written.

  Returns:
    List with the updated element tree of every processed file.

  Raises:
    FileNotFoundError: Propagated from ``get_xml_files`` /
      ``get_trained_tree`` when no XML file or no model is found.
  """
  xml_files = get_xml_files(tokenized_folder)
  # The model does not depend on the file being processed: load it once.
  dtc = get_trained_tree(tree_file_folder)

  predicted_xmls = []
  for xml_file in xml_files:
    tree_xml, root_xml = get_tokens(os.path.join(tokenized_folder, xml_file))
    df = get_table(tree_xml, root_xml, reserved_words)

    # Map (use case id, flow tag, phrase id) -> predicted tag. Built once per
    # file so each phrase is a dictionary lookup instead of a table scan.
    predicted = {}
    if not df.empty:  # nothing to classify when every phrase was already typed
      types = predict_types(dtc, df)
      df = concat_types_to_df(df, types)
      for id_use_case, flow_tag, id_phrase, tag in zip(df['id_use_case'], df['flow'], df['id_phrase'], df['types']):
        # setdefault keeps the first row when ids repeat, like ``.iloc[0]`` did.
        predicted.setdefault((id_use_case, flow_tag, id_phrase), tag)

    # Write the predictions back into the XML.
    for use_case in root_xml:
      for flow in use_case:
        for phrase in flow:
          if phrase.get('type') is not None:
            continue
          key = (use_case.get("id"), flow.tag, phrase.get("id"))
          # A missing id never matched a table row before; keep it unmatched.
          if None in key or key not in predicted:
            print(f"No predicted type found for phrase {key} in {xml_file}")
            continue
          phrase.set('type', predicted[key])

    tree_xml.write(name_file(save_folder, xml_file))
    predicted_xmls.append(tree_xml)
  return predicted_xmls

#Predicting

In [None]:
#@title Reserved words
# Reserved verbs come from RESERVED_WORDS (';'-separated); when the variable
# is unset or empty, fall back to the built-in defaults.
reserved_words = os.environ.get("RESERVED_WORDS")
reserved_words = reserved_words.split(";") if reserved_words else ['Informar', 'Inserir']

In [None]:
#@title Folder where the **decision tree model** was uploaded
# DECISION_TREE_DIR overrides the model folder; unset or empty means "/content/".
tree_file_folder = os.environ.get("DECISION_TREE_DIR") or "/content/"

In [None]:
#@title Folder where the **xml files** of the phrases were uploaded
# TOKENIZED_XML_DIR overrides the input folder; unset or empty means "/content/".
tokenized_folder = os.environ.get("TOKENIZED_XML_DIR") or "/content/"

In [None]:
#@title Folder where the **predicted xml files** are going to be saved
# PREDICTED_XML_DIR overrides the output folder; unset or empty means the
# default below. The folder is created if it does not exist yet.
save_folder = os.environ.get("PREDICTED_XML_DIR") or "/content/predicted_files/"
os.makedirs(save_folder, exist_ok=True)

In [None]:
#@title Run ***predict()***
# Run the whole pipeline with the folders and reserved words configured above.
predicted_xmls = predict(
  tree_file_folder=tree_file_folder,
  reserved_words=reserved_words,
  tokenized_folder=tokenized_folder,
  save_folder=save_folder,
)

# for xml in predicted_xmls:
#   print(parseString(ET.tostring(xml.getroot())).toprettyxml(indent='\t', newl='\n'))