Claes Pauline. Master Digital Text Analysis. Student ID: 20163274

# Script for handling all EPUB data (coming from Google Books corpus)

In [None]:
import glob, os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
pwd

# Step 1. Epub to txt
An EPUB file contains multiple folders, including one named 'content'. This folder in turn contains multiple XML files, each holding a piece of the text of the work in question.

Therefore, we open all xml files in each of the folders, and write them together to 1 txt file that can be queried and edited.

### Script

In [None]:
def get_one_txt(path_to_epub, new_path_to_txt):
    '''Merge the XML files of every unpacked EPUB folder into one txt file.

    Every sub-directory of ``path_to_epub`` is treated as one work. All
    ``*.xml`` files directly inside it are read in sorted file-name order and
    concatenated without a separator. The merged text is written twice: to
    ``<folder>/<folder>.txt`` inside the original folder and to
    ``<new_path_to_txt>/<folder>.txt``. Progress is printed per folder.

    Args:
        path_to_epub: directory containing one sub-directory per EPUB.
        new_path_to_txt: directory that collects all merged txt files;
            it is created when it does not exist yet.
    '''
    os.makedirs(new_path_to_txt, exist_ok=True)

    # Keep sub-directories only: this skips ".DS_Store" and any other stray
    # file, which would otherwise fail when used as a folder name below.
    folders = sorted(
        entry for entry in os.listdir(path_to_epub)
        if os.path.isdir(os.path.join(path_to_epub, entry))
    )

    for folder in folders:
        folder_path = os.path.join(path_to_epub, folder)

        # glob returns files in arbitrary order; sort so the pieces of the
        # text are always merged in the same (file-name) order. The folder
        # path is escaped so characters such as "[" are matched literally.
        xml_pattern = os.path.join(glob.escape(folder_path), "*.xml")
        text_list = []
        for file in sorted(glob.glob(xml_pattern)):
            with open(file, encoding="utf-8") as f:
                text_list.append(f.read())

        merged_text = "".join(text_list)

        # first write a txt file in original folder
        with open(os.path.join(folder_path, f"{folder}.txt"), "w",
                  encoding="utf-8") as t:
            t.write(merged_text)
        print(f"-----Done writing {folder}.txt in original folder-----")

        # then write same file to one folder containing all txts
        with open(os.path.join(new_path_to_txt, f"{folder}.txt"), "w",
                  encoding="utf-8") as t:
            t.write(merged_text)
        print(f"-----Done writing {folder}.txt in new txt folder-----")

### First selection epub to 1 txt

#### early

In [None]:
cd /Users/paulineclaes/Documents/dta/Thesis/Data/XML/EarlyTexts

In [None]:
# First selection, early texts: merge the XML files of each EPUB folder
# into one txt file per work (machine-specific absolute paths).
path_to_epub = "/Users/paulineclaes/Documents/dta/Thesis/Data/XML/EarlyTexts/EarlyTexts_Epub"
new_path_to_txt = "/Users/paulineclaes/Documents/dta/Thesis/Data/XML/EarlyTexts/EarlyTexts_txt"
get_one_txt(path_to_epub, new_path_to_txt)

#### later

In [None]:
# First selection, later texts: same merge step for the "KandidaatTeksten"
# folders (machine-specific absolute paths).
path_to_epub = "/Users/paulineclaes/Documents/dta/Thesis/Data/XML/KandidaatTeksten/KandidaatTeksten_Epub"
new_path_to_txt = "/Users/paulineclaes/Documents/dta/Thesis/Data/XML/KandidaatTeksten/KandidaatTeksten_txt"

get_one_txt(path_to_epub, new_path_to_txt)

### Second selection epub to 1 txt

#### early

In [None]:
# Second selection, early texts: input folder with the EPUB folders and
# output folder for the merged txt files (machine-specific absolute paths).
path_to_epub = "/Users/paulineclaes/Documents/dta/thesis/Data/XML/EarlyTexts/EarlyTexts_SecondSelection/EarlyTexts_SecondSelection_Epub"
new_path_to_txt = "/Users/paulineclaes/Documents/dta/thesis/Data/XML/EarlyTexts/EarlyTexts_SecondSelection/EarlyTexts_SecondSelection_txt"

In [None]:
get_one_txt(path_to_epub, new_path_to_txt)

#### later

In [None]:
# Second selection, later texts: input folder with the EPUB folders and
# output folder for the merged txt files (machine-specific absolute paths).
path_to_epub = "/Users/paulineclaes/Documents/dta/thesis/Data/XML/LaterTexts/LaterTexts_SecondSelection/LaterTexts_SecondSelection_Epub"
new_path_to_txt = "/Users/paulineclaes/Documents/dta/thesis/Data/XML/LaterTexts/LaterTexts_SecondSelection/LaterTexts_SecondSelection_txt"

In [None]:
get_one_txt(path_to_epub, new_path_to_txt)

# Step 2. Epub corrector

In the previous step, the many separate XML files inside each EPUB were read by the script and merged into a single txt document per work. In the current step, these merged files (txt files that still contain XML markup and a lot of other junk) are cleaned, so that only clean text remains.

In [11]:
# Function to remove tags
def remove_tags(doc):
    '''Strip all markup from ``doc`` and return the remaining plain text.

    The document is parsed with BeautifulSoup's "html.parser". ``<style>``
    and ``<script>`` elements are dropped together with their content; the
    text fragments that remain are stripped of surrounding whitespace and
    joined with single spaces.
    '''
    parsed = BeautifulSoup(doc, "html.parser")

    # Throw away elements whose content is not part of the running text.
    for unwanted in parsed.find_all(['style', 'script']):
        unwanted.decompose()

    # Collect every non-empty, whitespace-stripped text fragment.
    fragments = list(parsed.stripped_strings)
    return ' '.join(fragments)

def clean_text_to_txt(path_to_cleaned_txt, path_to_uncleaned_txt="."):
    """Strip the tags from every txt file in a directory and save the result.

    Every ``*.txt`` file in ``path_to_uncleaned_txt`` is read, passed through
    ``remove_tags`` and written to
    ``<path_to_cleaned_txt>/<name>_cleaned.txt``.

    Args:
        path_to_cleaned_txt: directory that receives the cleaned txt files;
            it is created when it does not exist yet.
        path_to_uncleaned_txt: directory containing the tagged, unclean txt
            files. Defaults to the current working directory, which is where
            the files were always read from before this parameter existed.
    """
    os.makedirs(path_to_cleaned_txt, exist_ok=True)

    # Escape the directory so characters such as "[" are matched literally.
    pattern = os.path.join(glob.escape(path_to_uncleaned_txt), "*.txt")

    for file in sorted(glob.glob(pattern)):

        with open(file, encoding="utf-8") as f:
            text = f.read()

        clean_text = remove_tags(text)

        # Drop only the trailing extension; str.replace would also remove a
        # ".txt" that happens to occur in the middle of the file name.
        filename = os.path.splitext(os.path.basename(file))[0]

        cleaned_path = os.path.join(path_to_cleaned_txt, f"{filename}_cleaned.txt")
        with open(cleaned_path, 'w', encoding="utf-8") as t:
            t.write(clean_text)

# Step 3. EPUB parsing

In this script, I will parse the texts and make a data frame identical to the one of the texts coming from Frantext. More specifically, I will manipulate the data in such a way that it contains a row per word in the entire corpus, and four columns: word, lemma, POS, and filename. In the very last step, this data frame will be used to query the data for concordances of ALLER + INF. 



In [3]:
import spacy
from spacy.lang.fr.examples import sentences 
#from spacy_lefff import LefffLemmatizer, POSTagger
#from spacy.language import Language
#from spacy.morphology import Morphology
nlp = spacy.load("fr_core_news_sm")
#nlp.add_pipe("melt_tagger", after="parser")
#nlp.add_pipe("french_lemmatizer", after="melt_tagger")


In [4]:
doc = nlp("aller")

token = doc[0]

print(token.morph)

VerbForm=Inf


In [5]:
nlp.max_length = 5500000 # raise the pipeline's max_length to an arbitrary high value so long texts do not trigger length errors

In [6]:
def get_word_lemma_POS_df(extension):
    """
    Build a word-lemma-POS data frame in the same form as the one constructed for Frantext data.

    Every file matching the glob pattern ``extension`` is read and parsed with the
    module-level spaCy pipeline ``nlp``. The result has one row per token and four columns:
    word (token text as it appears in the corpus), lemma (lemma of the word),
    POS (part-of-speech tag) and filename (path of the file without its ".txt" suffix).

    Files are processed in sorted path order, so the row order is reproducible.
    When no file matches, an empty data frame with the four columns is returned.
    """
    columns = ["word", "lemma", "POS", "filename"]

    data = [] # one data frame per file

    for file in sorted(glob.glob(extension)): # iterate over matching files
        with open(file, 'r', encoding="utf-8") as f: # open each file
            text = f.read() # read it in

        # strip only a trailing ".txt"; str.replace would also remove one inside the name
        filename = file[:-len(".txt")] if file.endswith(".txt") else file

        # Parse once and reuse the Doc: running the pipeline is by far the
        # most expensive step, and it used to be executed once per column.
        doc = nlp(text)

        df = pd.DataFrame({ # construct data frame for each separate text. This is done so texts will not get mixed
            # Store plain strings rather than Token objects, so the column can
            # be joined into context strings and written to disk later on.
            "word" : [token.text for token in doc], # word as it appears in corpus (tokenized)
            "lemma" : [token.lemma_ for token in doc], # lemma of that word
            "POS" : [token.pos_ for token in doc] # part of speech tag of that word
        })

        df["filename"] = filename # add file name

        data.append(df) # add dataframe per file to list

    if not data: # pd.concat raises on an empty list
        return pd.DataFrame(columns=columns)

    word_lemma_POS_df = pd.concat(data, axis="rows", ignore_index=True) # concatenate all data frames in list to one dataframe

    return word_lemma_POS_df

# Step 4. EPUB concordancer

After building a WLP (Word-Lemma-POS) dataframe for the EPUB files, this dataframe is now queried for occurrences of 'aller + INF'. For each occurrence found, the preceding 50 words are joined and the following 50 words are joined, in order to obtain a concordance dataframe.

In [3]:
def concordancer_to_df(df):
    """
    Build a concordance dataframe of 'aller' + verb from a word-lemma-POS dataframe.

    ``df`` must have the columns "word", "lemma", "POS" and "filename", one row per
    token. Rows are addressed by position, so the index of ``df`` does not matter.

    1. A row is a hit when its lemma is 'aller' and the POS tag of the next row is
       'VERB'. The tag 'VERB' stands in for "infinitive": verb form is not checked.
       An 'aller' in the very last row has no following row and is skipped.

    2. For every hit at position ``i`` one output row is built with the columns:
        a) filename:   file name of the hit
        b) previous50: words at positions i-50 .. i-2, joined by spaces
                       (at most 49 words; fewer near the start of the dataframe)
        c) prev1:      the word immediately preceding 'aller' (None at position 0)
        d) aller:      the instance of 'aller' itself (not lemmatized)
        e) aller_POS:  the POS tag of the instance of 'aller'
        f) INF:        the verb following 'aller' (not lemmatized)
        g) next1:      the word immediately following that verb (None if absent)
        h) next1_POS:  the POS tag of that word (None if absent)
        i) next50:     words at positions i+3 .. i+49, joined by spaces
                       (at most 47 words; fewer near the end of the dataframe)

    3. All rows are collected into one dataframe. Without any hit, an empty
       dataframe with the same columns is returned.

    NOTE: context windows are taken from the whole dataframe and are not cut off
    at file boundaries.
    """
    columns = ["filename", "previous50", "prev1", "aller", "aller_POS",
               "INF", "next1", "next1_POS", "next50"]

    # Plain lists give cheap positional access, independent of df's index.
    words = df["word"].tolist()
    lemmas = df["lemma"].tolist()
    pos_tags = df["POS"].tolist()
    filenames = df["filename"].tolist()
    n_rows = len(words)

    def _value_at(values, position):
        """Return values[position], or None when position is out of range."""
        return values[position] if 0 <= position < n_rows else None

    def _joined_words(start, stop):
        """Join words[start:stop], clamping negative bounds to 0.

        Without clamping, a negative bound would count from the end of the
        list and yield an empty or wrong context for hits near the start.
        """
        window = words[max(start, 0):max(stop, 0)]
        return " ".join(str(word) for word in window)

    rows = []
    for index, lemma in enumerate(lemmas):
        if lemma != "aller":
            continue
        # the following row must exist and be tagged as a verb
        if index + 1 >= n_rows or pos_tags[index + 1] != "VERB":
            continue

        rows.append({
            "filename": filenames[index], # get file name
            "previous50": _joined_words(index - 50, index - 1), # left context
            "prev1": _value_at(words, index - 1), # get word right before
            "aller": words[index], # get instance of aller itself (not lemmatized)
            "aller_POS": pos_tags[index], # POS tag of 'aller', kept as a sanity check
            "INF": words[index + 1], # get verb right after 'aller' (not lemmatized)
            "next1": _value_at(words, index + 2), # get word right after verb following 'aller'
            "next1_POS": _value_at(pos_tags, index + 2), # get pos-tag of that word
            "next50": _joined_words(index + 3, index + 50), # right context
        })

    # One DataFrame from all rows; also yields the right columns when empty.
    concordance_df = pd.DataFrame(rows, columns=columns)

    return concordance_df