Claes Pauline. Master Digital Text Analysis. Student ID: 20163274

# Script for handling FRANTEXT XML DATA



# Step 1. Correct XML

In [35]:
import sys
import glob, os
original_stdout = sys.stdout

In [35]:
def XML_corrector(source_file_path, encoding='utf-8'):
    '''Preview an XML file with a line break after every '/>'.

    The file is read completely, its length in characters is printed, and
    the first 200 characters of the reformatted text are printed. Nothing
    is written to disk and nothing is returned.

    Parameters
    ----------
    source_file_path : str
        Path of the XML file to read.
    encoding : str, optional
        Text encoding used to read the file. It is passed explicitly so the
        result does not depend on the platform default, which would garble
        accented characters on systems whose default is not UTF-8.
    '''
    with open(source_file_path, 'r', encoding=encoding) as f:
        data = f.read()
    print(len(data))

    # '/>' ends a self-closing element; put each of them on its own line
    corrected = data.replace(r'/>', '/>' + '\n')
    print(corrected[:200])

def write_corr_XML(source_file_path, folder_name='', encoding='utf-8'):
    '''Write a copy of an XML file that is easier to query.

    In the copy every '>' is followed by a line break and every occurrence
    of 'x:wf' is replaced by 'line' (the parsing step used later on was not
    able to query the original 'x:wf' tag). A short report with the text
    length before and after the change is printed.

    Parameters
    ----------
    source_file_path : str
        Path of the XML file to read. The output name is derived from it by
        removing '.xml'.
    folder_name : str, optional
        Text placed in front of the output file name. It is concatenated
        as-is (no path separator is added), so the output path is
        ``<folder_name>temp_<name>.xml``. Defaults to the empty string.
    encoding : str, optional
        Text encoding used for both reading and writing, so the result does
        not depend on the platform default encoding.
    '''
    file_name = source_file_path.replace('.xml', r'')  # get file name

    with open(source_file_path, 'r', encoding=encoding) as f:
        data = f.read()  # read in data

    corrected = data.replace(r'>', '>\n')  # line break after every '>'
    corrected = corrected.replace('x:wf', 'line')  # rename the 'x:wf' tag

    target_path = str(folder_name) + "temp_" + file_name + '.xml'
    with open(target_path, 'w', encoding=encoding) as f:
        # Print straight into the file instead of swapping sys.stdout, so
        # standard output can never be left redirected if writing fails.
        print(corrected, file=f)

    print(f""" {'-'*10} {file_name} {'-'*10} \n *Original XML length:{len(data)} \n *Corrected XML length: {len(corrected)}""") # obtain text length

In [157]:
# EXAMPLE EXECUTION OF CODE
# The output prefix is passed explicitly as the second argument.
# NOTE(review): "WF" is inferred from the "WFtemp_..." file names that show up
# in the parsed output of step 2 -- confirm that this is the intended prefix.
for file in glob.glob("*.xml"):
    write_corr_XML(file, "WF")

 ---------- FONTENELLE_Entretiens_Pluralite_des_mondes ---------- 
 *Original XML length:1547129 
 *Corrected XML length: 1588252
 ---------- VOITURE_VINCENT_Lettres ---------- 
 *Original XML length:6809593 
 *Corrected XML length: 6985423


# Step 2. Frantext XML parsing

In [1]:
import pandas as pd
import glob, os
import xml.etree.ElementTree as et 

In [3]:

# Column order of the token table built from the XML files.
col_list = [
    "file_name",
    "word",
    "lemma",
    "POS",
]

# Namespace-qualified name of the elements to extract. 'line' is the name
# that replaced the original 'x:wf' tag in step 1.
queried_tag = '{http://www.tei-c.org/ns/1.0}line'


def write_attr_df(path, col_list, queried_tag):
    '''Turn one XML file into a table with one row per matching element.

    The file at `path` is parsed and every element whose tag equals
    `queried_tag` contributes one row holding the file name (`path` with
    '.xml' removed) and the element's 'word', 'lemma' and 'pos' attributes.
    An attribute that is absent yields None. The columns of the returned
    data frame are taken from `col_list`.
    '''
    root = et.parse(path).getroot()
    source_name = path.replace('.xml', r'')

    # Walk the whole tree in document order and keep the matching elements.
    records = [
        {
            "file_name": source_name,
            "lemma": node.attrib.get('lemma'),
            "POS": node.attrib.get('pos'),
            "word": node.attrib.get('word'),
        }
        for node in root.iter()
        if node.tag == queried_tag
    ]

    return pd.DataFrame(records, columns=col_list)

In [None]:
## EXAMPLE EXECUTION OF CODE:
import glob, os

# Parse every XML file in the working directory into its own token table.
df_list = [
    write_attr_df(xml_path, col_list, queried_tag)
    for xml_path in glob.glob("*.xml")
]

# Stack the per-file tables and store the result as a single CSV file.
df_result = pd.concat(df_list, axis=0)
df_result.to_csv(
    "frantext_XML.csv",
    sep=",",
    header=True,
    index=False,
    encoding='utf-8',
)

In [179]:
# Reload the exported token table from disk and preview its first rows.
df = pd.read_csv("frantext_XML.csv")
df.head()

Unnamed: 0,file_name,word,lemma,POS
0,WFtemp_FONTENELLE_Entretiens_Pluralite_des_mondes,DÉDICACE,dédicace,NC
1,WFtemp_FONTENELLE_Entretiens_Pluralite_des_mondes,à,à,P
2,WFtemp_FONTENELLE_Entretiens_Pluralite_des_mondes,Monsieur,Monsieur,NC
3,WFtemp_FONTENELLE_Entretiens_Pluralite_des_mondes,L.,L.,NP
4,WFtemp_FONTENELLE_Entretiens_Pluralite_des_mondes,vous,vous,CLS


# Step 3. Frantext concordancer

I will now use the data frame built above to construct a data frame that contains only the concordances of ALLER + INF. This is the same function as the one used for the EPUB data. When an instance of 'aller + INF' is found, the preceding 50 words are joined and the following 50 words are joined, which yields a concordance data frame.

In [4]:
def frantext_concordancer_to_df(df, window=50):
    """
        Build a concordance data frame of 'aller' + infinitive.

        Parameters:
            df: data frame constructed from the Frantext XML, with the columns
                'file_name', 'word', 'lemma' and 'POS' (one row per token).
            window: size of the context on either side (default 50); it sets
                the outer bounds of the 'previous50' and 'next50' slices.

        A hit is a row whose lemma is 'aller' and whose following row has the
        POS-tag 'VINF'. Rows are addressed by position throughout, so the
        index labels of `df` do not matter. For every hit the result has one
        row with the columns:
            a) filename:   file name of the instance of 'aller'
            b) previous50: the words from `window` positions before 'aller'
                           up to (not including) the word right before it
            c) prev1:      the word immediately preceding 'aller'
            d) aller:      the instance of 'aller' itself
            e) aller_POS:  the POS-tag of the instance of 'aller'
            f) INF:        the infinitive following 'aller'
            g) next1:      the word immediately following the infinitive
            h) next1_POS:  the POS-tag of that word
            i) next50:     the words after 'next1', up to (not including)
                           `window` positions after 'aller'

        Context that would fall outside the data is left out: the joined
        columns are shortened (possibly to an empty string) and 'prev1',
        'next1' and 'next1_POS' are None. Missing words are skipped when the
        context is joined. The context is taken from the token sequence as it
        is, so it can run across a file boundary when `df` holds several
        files.

        Returns a data frame with the nine columns above; it has no rows when
        there is no hit.
    """
    columns = ["filename", "previous50", "prev1", "aller", "aller_POS",
               "INF", "next1", "next1_POS", "next50"]

    # Plain lists give cheap positional access that ignores index labels.
    words = df["word"].tolist()
    lemmas = df["lemma"].tolist()
    pos_tags = df["POS"].tolist()
    file_names = df["file_name"].tolist()
    n_tokens = len(words)

    def join_words(start, stop):
        # Clamp at 0: a negative bound would otherwise count from the end of
        # the data and pull in unrelated words (or nothing at all).
        start, stop = max(start, 0), max(stop, 0)
        return " ".join(str(w) for w in words[start:stop] if not pd.isna(w))

    rows = []
    # The last row can never be a hit because nothing follows it.
    for index in range(n_tokens - 1):
        if lemmas[index] != "aller" or pos_tags[index + 1] != "VINF":
            continue

        has_next = index + 2 < n_tokens  # is there a word after the infinitive?
        rows.append({
            "filename": file_names[index],
            "previous50": join_words(index - window, index - 1),
            "prev1": words[index - 1] if index > 0 else None,
            "aller": words[index],
            "aller_POS": pos_tags[index],
            "INF": words[index + 1],
            "next1": words[index + 2] if has_next else None,
            "next1_POS": pos_tags[index + 2] if has_next else None,
            "next50": join_words(index + 3, index + window),
        })

    # One construction from all rows; also yields an empty, correctly
    # labelled frame when there are no hits.
    concordance_df = pd.DataFrame(rows, columns=columns)

    return concordance_df
    

In [38]:
### EXAMPLE
# NOTE(review): df_early is not defined in the part of the file shown here;
# presumably it is a concordance table returned by
# frantext_concordancer_to_df -- confirm where it is created.
df_early.head()

Unnamed: 0,filename,previous50,prev1,aller,aller_POS,INF,next1,next1_POS,next50
0,WFtemp_LABE_Debat_de_folie_et_amour,"à gouverner les Viles , sans que lon l' apelle...",d',aller,VINF,planter,des,P+D,"chous . Le fol ira tant et viendra , en donner..."
1,WFtemp_LABE_Debat_de_folie_et_amour,ay dit . Quand Mercure ut fini la defense de F...,",",và,V,prononcer,un,DET,arrest interlocutoire en cette maniere : Pour ...
2,WFtemp_Montaigne_Essais1-2,"on peut faire aux ennemis en guerre , cela est...",les,alla,V,charger,tous,PRO,"endormis et les défict , alleguant qu' en sa t..."
3,WFtemp_Montaigne_Essais1-2,"meint un , qui pour avoir ou haussé la main , ...",estant,allé,VPP,recognoistre,la,DET,"Ville d' Arle , et s' estant jetté hors du cou..."
4,WFtemp_Montaigne_Essais1-2,"tels Princes , que le plus grand soit avant le...",moindres,vont,V,trouver,",",PONCT,"et le recherchent , non pas luy eux . Non seul..."
