In [1]:
from pydantic import BaseModel
from typing import List
import xml.etree.ElementTree as ET
import re
import os
import json
from typing import Union

In [2]:
class Clause(BaseModel):
    """Attribute set of a ``Clause`` XML element.

    Instances are built in this notebook from an element's attribute dict
    (``element.attrib``). No field declares a default, so each one must be
    present in that dict for validation to succeed.
    """
    AlternativeAnalysis: str
    DiscourseGenre: str
    IllocutionaryForce: str
    ImplicitInformation: str
    Listener: str
    Location: str
    NotionalStructureSchema: str
    RhetoricalQuestion: str
    SalienceBand: str
    Sequence: str
    Speaker: str
    SpeakerListenerAge: str  # spelled "Speaker-ListenerAge" in the raw XML; renamed by clean_text
    SpeakersAge: str  # spelled "Speaker`sAge" in the raw XML; renamed by clean_text
    SpeakersAttitude: str  # spelled "Speaker`sAttitude" in the raw XML; renamed by clean_text
    SpeechStyle: str
    TopicNP: str
    Type: str
    VocabularyAlternate: str

class NP(BaseModel):
    """Attribute set of an ``NP`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Implicit: str
    Relativized: str
    SemanticRole: str
    Sequence: str
    ThingThingRelationship: str  # spelled "Thing-ThingRelationship" in the raw XML; renamed by clean_text

class Noun(BaseModel):
    """Attribute set of a ``Noun`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    FutureExpansion: str
    LexicalSense: str
    NounListIndex: Union[str, int]  # accepts either a string or an integer value
    Number: str
    ParticipantStatus: str
    ParticipantTracking: str
    Person: str
    Polarity: str
    Proximity: str
    # XML attribute values arrive as strings, so this relies on pydantic
    # converting numeric strings to int.
    # NOTE(review): Verb/Adjective declare this field as Union[str, int] -- confirm which is intended.
    SemanticComplexityLevel: int
    SurfaceRealization: str

class VP(BaseModel):
    """Attribute set of a ``VP`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Implicit: str
    Sequence: str

class Verb(BaseModel):
    """Attribute set of a ``Verb`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    AdjectiveDegree: str
    Aspect: str
    LexicalSense: str
    Mood: str
    Polarity: str
    Reflexivity: str
    # NOTE(review): Noun/Adverb/Adposition/Conjunction declare this field as plain int -- confirm which is intended.
    SemanticComplexityLevel: Union[str, int]
    TargetAspect: str
    TargetMood: str
    TargetTenseForm: str  # spelled "TargetTense&Form" in the raw XML; renamed by clean_text
    Time: str

class AdjP(BaseModel):
    """Attribute set of an ``AdjP`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Implicit: str
    Sequence: str
    Usage: str
    
class Adjective(BaseModel):
    """Attribute set of an ``Adjective`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Degree: str
    LexicalSense: str
    SemanticComplexityLevel: Union[str, int]  # accepts either a string or an integer value
    
class AdvP(BaseModel):
    """Attribute set of an ``AdvP`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Implicit: str
    Sequence: str
    
class Adverb(BaseModel):
    """Attribute set of an ``Adverb`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Degree: str
    LexicalSense: str
    # XML attribute values arrive as strings, so this relies on pydantic
    # converting numeric strings to int.
    SemanticComplexityLevel: int
    
class Adposition(BaseModel):
    """Attribute set of an ``Adposition`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    LexicalSense: str
    # XML attribute values arrive as strings, so this relies on pydantic
    # converting numeric strings to int.
    SemanticComplexityLevel: int

class Conjunction(BaseModel):
    """Attribute set of a ``Conjunction`` XML element.

    Built from the element's attribute dict; all fields are required.
    """
    Implicit: str
    LexicalSense: str
    # XML attribute values arrive as strings, so this relies on pydantic
    # converting numeric strings to int.
    SemanticComplexityLevel: int

In [3]:
class ClauseXMLData(BaseModel):
    """Container for a list of parsed ``Clause`` models."""
    # The field shares its name with the element model it holds.
    Clause: List[Clause]

class NPXMLData(BaseModel):
    """Container for a list of parsed ``NP`` models."""
    # The field shares its name with the element model it holds.
    NP: List[NP]

class NounXMLData(BaseModel):
    """Container for a list of parsed ``Noun`` models."""
    # The field shares its name with the element model it holds.
    Noun: List[Noun]

class VPXMLData(BaseModel):
    """Container for a list of parsed ``VP`` models."""
    # The field shares its name with the element model it holds.
    VP: List[VP]

class VerbXMLData(BaseModel):
    """Container for a list of parsed ``Verb`` models."""
    # The field shares its name with the element model it holds.
    Verb: List[Verb]
###    
class AdjPXMLData(BaseModel):
    """Container for a list of parsed ``AdjP`` models."""
    # The field shares its name with the element model it holds.
    AdjP: List[AdjP]

class AdjectiveXMLData(BaseModel):
    """Container for a list of parsed ``Adjective`` models."""
    # The field shares its name with the element model it holds.
    Adjective: List[Adjective]

class AdvPXMLData(BaseModel):
    """Container for a list of parsed ``AdvP`` models."""
    # The field shares its name with the element model it holds.
    AdvP: List[AdvP]

class AdverbXMLData(BaseModel):
    """Container for a list of parsed ``Adverb`` models."""
    # The field shares its name with the element model it holds.
    Adverb: List[Adverb]

class AdpositionXMLData(BaseModel):
    """Container for a list of parsed ``Adposition`` models."""
    # The field shares its name with the element model it holds.
    Adposition: List[Adposition]

class ConjXMLData(BaseModel):
    """Container for a list of parsed ``Conjunction`` models."""
    # The field shares its name with the element model it holds.
    Conjunction: List[Conjunction]

In [4]:
# Directory holding the XML input files, relative to the working directory.
folder_path = 'xml'
# os.listdir returns entries in arbitrary order; sort them so that every run
# processes the files in the same sequence.
file_list = sorted(os.listdir(folder_path))

In [5]:
# Read the full text of every ``.xml`` entry in the input folder into memory.
xml_files = []

for entry in file_list:
    # Anything that is not an XML file is ignored.
    if not entry.endswith('.xml'):
        continue

    with open(os.path.join(folder_path, entry), 'r') as handle:
        xml_files.append(handle.read())

In [7]:
def clean_text(xml_data):
    """Prepare the raw text of an XML file for ``ElementTree`` parsing.

    Steps, in order:

    1. Remove the ``<!DOCTYPE  SYSTEM "TBTA">`` line.
    2. Rename attribute spellings containing a backtick, ``&`` or ``-`` to
       the plain names used as field names by the models in this notebook.
       The replacement is a plain text substitution over the whole document.
    3. Wrap the document body in a synthetic ``<root>`` element, placed right
       after the XML declaration when one is present.

    Args:
        xml_data: Raw document text.

    Returns:
        The cleaned text, always ending in ``</root>``.
    """
    xml_data = xml_data.replace("<!DOCTYPE  SYSTEM \"TBTA\">", "")

    renames = (
        ("Speaker`sAge", "SpeakersAge"),
        ("Speaker`sAttitude", "SpeakersAttitude"),
        ("TargetTense&Form", "TargetTenseForm"),
        ("Speaker-ListenerAge", "SpeakerListenerAge"),
        ("Thing-ThingRelationship", "ThingThingRelationship"),
    )
    for raw_name, clean_name in renames:
        xml_data = xml_data.replace(raw_name, clean_name)

    # Open <root> immediately after the (first) XML declaration.
    wrapped, declarations_found = re.subn(
        r"(<\?xml[^>]+\?>)", r"\1<root>", xml_data, count=1
    )
    if declarations_found == 0:
        # Without a declaration nothing was inserted above; open <root>
        # explicitly so the closing tag appended below is always matched.
        wrapped = "<root>" + xml_data
    return wrapped + "</root>"

In [8]:
def parse_xml_data(root, tag, model):
    """Validate every ``tag`` element found anywhere below ``root``.

    Args:
        root: Element whose subtree is searched.
        tag: Element name to look for (matched at any depth).
        model: Class exposing ``parse_obj``; it receives each element's
            attribute dict.

    Returns:
        List of the successfully parsed objects, in the order the elements
        were found. Elements whose parsing raises are reported with
        ``print`` and left out of the result.
    """
    results = []
    search_path = f'.//{tag}'
    for node in root.findall(search_path):
        try:
            results.append(model.parse_obj(node.attrib))
        except Exception as err:
            print(f"Error parsing {tag}: {err}")
    return results

Adverbial phrases and adverbs were not being parsed correctly for some reason, so they are parsed separately below, accumulating the results across all files.

In [9]:
# Collect AdvP and Adverb elements from every file. Unlike parse_xml_data,
# nothing is caught here, so a validation failure stops the cell.
parsed_advps = []
parsed_advs = []

for raw_xml in xml_files:
    tree_root = ET.fromstring(clean_text(raw_xml).strip())

    parsed_advps.extend(
        AdvP.parse_obj(node.attrib) for node in tree_root.findall('.//AdvP')
    )
    parsed_advs.extend(
        Adverb.parse_obj(node.attrib) for node in tree_root.findall('.//Adverb')
    )



In [10]:
# One accumulator per element type. They are created before the loop and
# extended per file so the results cover every file, not only the last one
# processed, and so the names exist even when no XML file was loaded.
parsed_clauses = []
parsed_nps = []
parsed_vps = []
parsed_conjs = []
parsed_verbs = []
parsed_nouns = []
parsed_adjps = []
parsed_adjs = []
parsed_adpos = []

for xml_data in xml_files:
    xml_data = clean_text(xml_data)

    root = ET.fromstring(xml_data.strip())

    parsed_clauses.extend(parse_xml_data(root, 'Clause', Clause))
    parsed_nps.extend(parse_xml_data(root, 'NP', NP))
    parsed_vps.extend(parse_xml_data(root, 'VP', VP))
    parsed_conjs.extend(parse_xml_data(root, 'Conjunction', Conjunction))
    parsed_verbs.extend(parse_xml_data(root, 'Verb', Verb))
    parsed_nouns.extend(parse_xml_data(root, 'Noun', Noun))
    parsed_adjps.extend(parse_xml_data(root, 'AdjP', AdjP))
    parsed_adjs.extend(parse_xml_data(root, 'Adjective', Adjective))
    parsed_adpos.extend(parse_xml_data(root, 'Adposition', Adposition))

In [11]:
# Wrap each list of parsed models in its container model.
# parsed_advps / parsed_advs come from the dedicated adverb cell; the other
# lists come from the generic parsing loop.
clauses = ClauseXMLData(Clause=parsed_clauses)
nps = NPXMLData(NP=parsed_nps)
vps = VPXMLData(VP=parsed_vps)
conjs = ConjXMLData(Conjunction=parsed_conjs)
verbs = VerbXMLData(Verb=parsed_verbs)
nouns = NounXMLData(Noun=parsed_nouns)
advps = AdvPXMLData(AdvP=parsed_advps)
advs = AdverbXMLData(Adverb=parsed_advs)
adjps = AdjPXMLData(AdjP=parsed_adjps)
adjs = AdjectiveXMLData(Adjective=parsed_adjs)
adpos = AdpositionXMLData(Adposition=parsed_adpos)

In [12]:
def store_unique_values(data):
    """Collect the distinct values seen for each field across ``data``.

    Args:
        data: Iterable of objects exposing ``dict()`` (the models parsed in
            this notebook).

    Returns:
        Dict mapping each field name to the set of values it took. Keys
        appear in the order the fields are first encountered. Empty input
        yields an empty dict.
    """
    unique_values = {}
    for item in data:
        for label, value in item.dict().items():
            # setdefault creates the set the first time a label is seen, so
            # no first element is needed to seed the keys.
            unique_values.setdefault(label, set()).add(value)

    return unique_values

In [13]:
def _ordered_values(values):
    """Return ``values`` as a list in a reproducible order."""
    try:
        return sorted(values)
    except TypeError:
        # Values of different types (e.g. str and int) cannot be compared
        # directly; order them by type name, then by their text form.
        return sorted(values, key=lambda v: (type(v).__name__, str(v)))


def write_unique_values_to_file(data, filename):
    """Write a mapping of value collections to ``filename`` as JSON.

    Each collection is written as a sorted list. Sets have no stable
    iteration order, so sorting keeps the file content identical from one
    run to the next.

    Args:
        data: Dict mapping a label to a collection (e.g. a set) of values.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as file:
        json.dump({k: _ordered_values(v) for k, v in data.items()}, file)
In [14]:
# Write the distinct attribute values of each element type to its own file.
# The files contain JSON even though they carry a .txt extension.
exports = [
    (clauses.Clause, 'unique_clause_values.txt'),
    (nps.NP, 'unique_np_values.txt'),
    (vps.VP, 'unique_vp_values.txt'),
    (conjs.Conjunction, 'unique_conj_values.txt'),
    (verbs.Verb, 'unique_verb_values.txt'),
    (nouns.Noun, 'unique_noun_values.txt'),
    (advps.AdvP, 'unique_advp_values.txt'),
    (advs.Adverb, 'unique_adv_values.txt'),
    (adjps.AdjP, 'unique_adjp_values.txt'),
    (adjs.Adjective, 'unique_adj_values.txt'),
    (adpos.Adposition, 'unique_adpos_values.txt'),
]

for parsed_models, out_name in exports:
    write_unique_values_to_file(store_unique_values(parsed_models), out_name)