In [72]:
import os
import pandas as pd
from typing import List
from cassis import load_typesystem, TypeSystem, load_cas_from_xmi, Cas





In [73]:
def process_xmi_file(xmi_file_name: str, type_system: TypeSystem) -> List[dict]:
    with open(xmi_file_name, 'rb') as f:
        cas: Cas = load_cas_from_xmi(f, type_system)

    data = []
    argument_components = cas.select('de.tudarmstadt.ukp.dkpro.argumentation.types.ArgumentComponent')
    claims = cas.select('de.tudarmstadt.ukp.dkpro.argumentation.types.Claim')
    premises = 'de.tudarmstadt.ukp.dkpro.argumentation.types.Premise'
    rebuttals = 'de.tudarmstadt.ukp.dkpro.argumentation.types.Rebuttal'
    refutations = 'de.tudarmstadt.ukp.dkpro.argumentation.types.Refutation'
    backings = 'de.tudarmstadt.ukp.dkpro.argumentation.types.Backing'
    for claim in claims:
        claim_text = claim.get_covered_text()
        data.append({
            'claim': claim_text
        })

    return data

In [74]:
if __name__ == '__main__':
    data_path: str = 'habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/'

    type_system_file: str = os.path.join(data_path, 'TypeSystem.xml')

    with open(type_system_file, 'rb') as f:
        type_system: TypeSystem = load_typesystem(f)

    # List all XMI files
    xmi_files = [f for f in os.listdir(data_path) if f.endswith('.xmi')]
    print("XMI files found:", xmi_files)

    # Process each XMI file and collect data
    all_data = []
    for xmi_file in xmi_files:
        xmi_file_path = os.path.join(data_path, xmi_file)
        print(f"Processing file: {xmi_file_path}")
        file_data = process_xmi_file(xmi_file_path, type_system)
        all_data.extend(file_data)

    # Load data into a pandas DataFrame
    claims = pd.DataFrame(all_data)
    print(claims.head())

XMI files found: ['1021.xmi', '1037.xmi', '1045.xmi', '1064.xmi', '1084.xmi', '1133.xmi', '1189.xmi', '1196.xmi', '1197.xmi', '1198.xmi', '1199.xmi', '1220.xmi', '1233.xmi', '1260.xmi', '1289.xmi', '1346.xmi', '1403.xmi', '1412.xmi', '1414.xmi', '1430.xmi', '1444.xmi', '1465.xmi', '1467.xmi', '1483.xmi', '1487.xmi', '1533.xmi', '1544.xmi', '1546.xmi', '1562.xmi', '1568.xmi', '1576.xmi', '1594.xmi', '1604.xmi', '1619.xmi', '1621.xmi', '1625.xmi', '1627.xmi', '163.xmi', '1640.xmi', '1658.xmi', '166.xmi', '1666.xmi', '1667.xmi', '1668.xmi', '1669.xmi', '1671.xmi', '1672.xmi', '1675.xmi', '1683.xmi', '1692.xmi', '1698.xmi', '17.xmi', '1701.xmi', '1713.xmi', '1715.xmi', '1727.xmi', '1734.xmi', '1793.xmi', '1827.xmi', '1846.xmi', '1848.xmi', '1849.xmi', '1850.xmi', '1854.xmi', '1863.xmi', '1886.xmi', '1909.xmi', '1930.xmi', '1966.xmi', '2036.xmi', '2050.xmi', '2086.xmi', '2096.xmi', '2143.xmi', '2145.xmi', '2159.xmi', '2162.xmi', '2163.xmi', '2165.xmi', '2166.xmi', '2171.xmi', '2173.xmi', '2

In [75]:
claims

Unnamed: 0,claim
0,"But don't assume that one size fits everybody,..."
1,
2,Education at home is nothing like education at...
3,
4,School can be very bad for you.
...,...
360,
361,
362,I wouldn't trust a state school to train a dog...
363,


In [66]:
def process_xmi_file(xmi_file_name: str, type_system: TypeSystem):
    with open(xmi_file_name, 'rb') as f:
        cas: Cas = load_cas_from_xmi(f, type_system)

    # Define the types of elements to extract
    element_types = [
        'de.tudarmstadt.ukp.dkpro.argumentation.types.Claim',
        'de.tudarmstadt.ukp.dkpro.argumentation.types.Premise',
        'de.tudarmstadt.ukp.dkpro.argumentation.types.Rebuttal',
        'de.tudarmstadt.ukp.dkpro.argumentation.types.Refutation',
        'de.tudarmstadt.ukp.dkpro.argumentation.types.Backing'
    ]

    # Create a list to store the extracted elements
    elements = []

    # Extract elements of each type
    for element_type in element_types:
        for element in cas.select(element_type):
            element_data = {
                'type': element_type,
                'text': element.get_covered_text()
            }
            elements.append(element_data)

    # Create a pandas DataFrame from the extracted elements
    df = pd.DataFrame(elements)

    # Create separate columns for each element type
    for element_type in element_types:
        element_df = df[df['type'] == element_type]
        element_df = element_df.rename(columns={'text': element_type})
        element_df = element_df.drop(columns=['type'])
        element_df = element_df.reset_index(drop=True)
        df = pd.concat([df, element_df], axis=1)

    return df
   


In [67]:
# Process all XMI files and concatenate the results
dfs = []
for xmi_file_name in xmi_file_names:
    df = process_xmi_file(xmi_file_name, type_system)
    dfs.append(df)


In [68]:
all_elements_df = pd.concat(dfs)

In [69]:
all_elements_df

Unnamed: 0,type,text,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim,de.tudarmstadt.ukp.dkpro.argumentation.types.Premise,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.1,de.tudarmstadt.ukp.dkpro.argumentation.types.Rebuttal,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.2,de.tudarmstadt.ukp.dkpro.argumentation.types.Premise.1,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.3,de.tudarmstadt.ukp.dkpro.argumentation.types.Refutation,...,de.tudarmstadt.ukp.dkpro.argumentation.types.Premise.2,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.4,de.tudarmstadt.ukp.dkpro.argumentation.types.Refutation.1,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.5,de.tudarmstadt.ukp.dkpro.argumentation.types.Premise.3,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.6,de.tudarmstadt.ukp.dkpro.argumentation.types.Rebuttal.1,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.7,de.tudarmstadt.ukp.dkpro.argumentation.types.Premise.4,de.tudarmstadt.ukp.dkpro.argumentation.types.Claim.8
0,de.tudarmstadt.ukp.dkpro.argumentation.types.C...,"But don't assume that one size fits everybody,...","But don't assume that one size fits everybody,...","It was boring, tedious, slow and frustrating. ...",,,,,,,...,,,,,,,,,,
1,de.tudarmstadt.ukp.dkpro.argumentation.types.P...,"It was boring, tedious, slow and frustrating. ...",,All it needs to do is to fail to provide a tot...,,,,,,,...,,,,,,,,,,
2,de.tudarmstadt.ukp.dkpro.argumentation.types.P...,All it needs to do is to fail to provide a tot...,,,,,,,,,...,,,,,,,,,,
3,de.tudarmstadt.ukp.dkpro.argumentation.types.B...,"It was boring, tedious, slow and frustrating. ...",,,,,,,,,...,,,,,,,,,,
0,de.tudarmstadt.ukp.dkpro.argumentation.types.C...,,,Lesson plans (and the national curriculum) are...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,de.tudarmstadt.ukp.dkpro.argumentation.types.P...,These same children go to school where they co...,,If they survive all this lot they can be brain...,,,,,,,...,,,,,,,,,,
3,de.tudarmstadt.ukp.dkpro.argumentation.types.P...,If they survive all this lot they can be brain...,,,,,,,,,...,,,,,,,,,,
4,de.tudarmstadt.ukp.dkpro.argumentation.types.B...,One at a local school to me was nearly killed ...,,,,,,,,,...,,,,,,,,,,
0,de.tudarmstadt.ukp.dkpro.argumentation.types.C...,,,"I don't want to be the hate figure here, but h...",,,,,,,...,,,,,,,,,,


In [64]:
import os
import pandas as pd
from lxml import etree

def extract_sofa_strings(xmi_file_path: str) -> List[dict]:
    tree = etree.parse(xmi_file_path)
    root = tree.getroot()

    data = []
    for sofa in root.findall('.//cas:Sofa', namespaces={'cas': 'http:///uima/cas.ecore'}):
        sofa_string = sofa.get('sofaString')
        data.append({
            'sofa_string': sofa_string
        })

    return data

if __name__ == '__main__':
    data_path: str = 'habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/'

    # List all XMI files
    xmi_files = [f for f in os.listdir(data_path) if f.endswith('.xmi')]
    print("XMI files found:", xmi_files)

    # Process each XMI file and collect data
    all_data = []
    for xmi_file in xmi_files:
        xmi_file_path = os.path.join(data_path, xmi_file)
        print(f"Processing file: {xmi_file_path}")
        file_data = extract_sofa_strings(xmi_file_path)
        all_data.extend(file_data)

    # Load data into a pandas DataFrame
    sofastring = pd.DataFrame(all_data)
    print(sofastring.head())

XMI files found: ['1021.xmi', '1037.xmi', '1045.xmi', '1064.xmi', '1084.xmi', '1133.xmi', '1189.xmi', '1196.xmi', '1197.xmi', '1198.xmi', '1199.xmi', '1220.xmi', '1233.xmi', '1260.xmi', '1289.xmi', '1346.xmi', '1403.xmi', '1412.xmi', '1414.xmi', '1430.xmi', '1444.xmi', '1465.xmi', '1467.xmi', '1483.xmi', '1487.xmi', '1533.xmi', '1544.xmi', '1546.xmi', '1562.xmi', '1568.xmi', '1576.xmi', '1594.xmi', '1604.xmi', '1619.xmi', '1621.xmi', '1625.xmi', '1627.xmi', '163.xmi', '1640.xmi', '1658.xmi', '166.xmi', '1666.xmi', '1667.xmi', '1668.xmi', '1669.xmi', '1671.xmi', '1672.xmi', '1675.xmi', '1683.xmi', '1692.xmi', '1698.xmi', '17.xmi', '1701.xmi', '1713.xmi', '1715.xmi', '1727.xmi', '1734.xmi', '1793.xmi', '1827.xmi', '1846.xmi', '1848.xmi', '1849.xmi', '1850.xmi', '1854.xmi', '1863.xmi', '1886.xmi', '1909.xmi', '1930.xmi', '1966.xmi', '2036.xmi', '2050.xmi', '2086.xmi', '2096.xmi', '2143.xmi', '2145.xmi', '2159.xmi', '2162.xmi', '2163.xmi', '2165.xmi', '2166.xmi', '2171.xmi', '2173.xmi', '2

In [65]:
sofastring



Unnamed: 0,sofa_string
0,cannibaldave\nNot particularly unusual among t...
1,I trust that was tongue in cheek? Lesson plans...
2,The idea that child abuse can be thwarted by t...
3,@oommph - here you get to a sensitive subject ...
4,@eileenphoto\n.\nSchool can be very bad for yo...
...,...
335,what like making sure you are fit to teach a c...
336,"I'm not a fan of the nanny state, but I can se..."
337,Ours will be that the State Education system i...
338,"Back on topic, , ,\nOf course the government a..."


In [77]:
def extract_data(xmi_file_path: str) -> pd.DataFrame:
    tree = etree.parse(xmi_file_path)
    root = tree.getroot()

    data = []
    for sofa in root.findall('.//cas:Sofa', namespaces={'cas': 'http:///uima/cas.ecore'}):
        sofa_string = sofa.get('sofaString')
        argument_components = []
        claims = []
        premises = []
        rebuttals = []
        refutations = []
        backings = []

        for element in root.findall('.//cas:ArgumentComponent', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                argument_components.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        for element in root.findall('.//cas:Claim', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                claims.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        for element in root.findall('.//cas:Premise', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                premises.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        for element in root.findall('.//cas:Rebuttal', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                rebuttals.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        for element in root.findall('.//cas:Refutation', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                refutations.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        for element in root.findall('.//cas:Backing', namespaces={'cas': 'http:///uima/cas.ecore'}):
            if element.get('sofa') == sofa_string:
                backings.append({
                    'id': element.get('id'),
                    'text': element.get('text')
                })

        file_number = int(os.path.basename(xmi_file_path).split('.')[0])
        data.append({
            'file_number': file_number,
           'sofa_string': sofa_string,
            'argument_components': argument_components,
            'claims': claims,
            'premises': premises,
           'rebuttals': rebuttals,
           'refutations': refutations,
            'backings': backings
        })

    df = pd.DataFrame(data).set_index('file_number')
    return df

if __name__ == '__main__':
    data_path: str = 'habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/'

    # List all XMI files
    xmi_files = [f for f in os.listdir(data_path) if f.endswith('.xmi')]

    # Process each XMI file and collect data
    dfs = []
    for xmi_file in xmi_files:
        xmi_file_path = os.path.join(data_path, xmi_file)
        print(f"Processing file: {xmi_file_path}")
        df = extract_data(xmi_file_path)
        dfs.append(df)

Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1021.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1037.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1045.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1064.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1084.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1133.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1189.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1196.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin/1197.xmi
Processing file: habernal.gurevych.2017.argumentation.mining.CL.data/data/gold.data.toulmin

In [78]:
dfs

[                                                   sofa_string  \
 file_number                                                      
 1021         cannibaldave\nNot particularly unusual among t...   
 
             argument_components claims premises rebuttals refutations backings  
 file_number                                                                     
 1021                         []     []       []        []          []       []  ,
                                                    sofa_string  \
 file_number                                                      
 1037         I trust that was tongue in cheek? Lesson plans...   
 
             argument_components claims premises rebuttals refutations backings  
 file_number                                                                     
 1037                         []     []       []        []          []       []  ,
                                                    sofa_string  \
 file_number                     