In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
import csv

### Generating CSV file from XML

In [17]:
xml_file_path = "/Users/ridhipurohit/Documents/French Revolution Data/frc"

# CSV file to write the data
csv_file_path = os.path.join("/Users/ridhipurohit/Documents/French Revolution Data/", 'data_frc.csv')

# Set to hold all possible headers
headers = set(['file_name', 'text_content'])  # Initial known fields
    
    # Initialize the CSV file and writer
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = None

        # Iterate over all files in the directory
        for file_name in os.listdir(xml_file_path):
            if file_name.endswith('.xml'):
                full_path = os.path.join(xml_file_path, file_name)

                # Read the XML content from the file
                with open(full_path, 'r', encoding='utf-8') as file:
                    xml_content = file.read()

                    # Create a BeautifulSoup object for XML parsing
                    soup = BeautifulSoup(xml_content, 'xml')

                    # Extract metadata
                    metadata = soup.find('metadata')
                    metadata_dict = {}
                    if metadata:
                        for element in metadata.find_all(recursive=False):
                            metadata_dict[element.name] = element.text.strip()

                    # Extract text content using regex to capture everything within <text>...</text>
                    text_match = re.search(r'<text>(.*?)</text>', xml_content, re.DOTALL)
                    full_text = ""
                    if text_match:
                        # Extract and parse HTML content within text tags
                        text_html = text_match.group(1)
                        text_soup = BeautifulSoup(text_html, 'html.parser')
                        paragraphs = text_soup.find_all('p')
                        full_text = " ".join(p.get_text(strip=True) for p in paragraphs)

                    # Prepare the row to be written to CSV
                    row_data = {'file_name': file_name, **metadata_dict, 'text_content': full_text}

                    # Check if new fields have appeared and update writer if necessary
                    new_fields = set(row_data.keys()) - headers
                    if new_fields:
                        headers.update(new_fields)
                        writer = csv.DictWriter(csvfile, fieldnames=list(headers))
                        csvfile.seek(0)
                        writer.writeheader()

                        # Write the data row
                    writer.writerow(row_data)

In [28]:
df = pd.read_csv("/Users/ridhipurohit/Documents/French Revolution Data/data_frc.csv", low_memory=False)

In [36]:
len(df)

11443

In [15]:
df.columns

Index(['subject', 'search_id', 'dfate', 'physical-description', 'rerepublish',
       'volume', 'text_content', 'creator', 'sponsor', 'ocr', 'language',
       'bwocr', 'repub_state', 'languate', 'page-progression', 'scanfee',
       'file_name', 'contributor', 'year', 'invoice', 'link_to_catalog',
       'foldoutcount', 'scanner', 'uploader', 'republisher_date', 'collection',
       'physical_description', 'curation', 'item_number', 'scanningcenter',
       'openlibrary_edition', 'shiptracking', 'scandate', 'openlibrary_work',
       'sponsordate', 'date', 'repub_seconds', 'identifier', 'identifier-ark',
       'mediatype', 'notes', 'identifier-access', 'topicmodel', 'publisher',
       'republisher_operator', 'republisher_time', 'operator', 'ppi',
       'description', 'bookplateleaf', 'republisher', 'title', 'imagecount',
       'allsubjects', 'foldout_seconds', 'bookreader-defaults', 'lccn',
       'backup_location', 'call_number', 'coverage', 'foldout-operator',
       'camera', '

In [37]:
df['date'].unique()

array(['1791', '1796', '1797', '1790', '1792', '1789', '1795', '1794',
       '1793', '1798', '1799', '1788', '1787', '1797?', '1789-1791',
       '1831', '1797-1798', '1798/1799', '1789?', '1793-1794', '1793?',
       '1800', '1799-1800', '1791-1792', '1790-1799?', '1814',
       '1793/1794', '1790?', '1795-1796', '1796/1797', '1790-1791',
       '1801', '1794-1795', '1792/1793', '1797/1798', '1795/1799',
       '1796-1797', '1820'], dtype=object)

In [10]:
df.head()

Unnamed: 0,subject,search_id,dfate,physical-description,rerepublish,volume,text_content,creator,sponsor,ocr,...,coverage,foldout-operator,camera,updatedate,publicdate,citation,addeddate,link-to-catalog,updater,source
202,Credit,,,,,,ffcfrff - ' MÉMOIRE SUR LES finances ET SUR LE...,"Lauraguais, Louis-Léon-Félicité, comte de, ...",The Newberry Library,ABBYY FineReader 11.0,...,,,Canon EOS 5D Mark II,2016-08-11 12:29:11,2016-08-11 11:30:03,"Martin & Walter. Révolution française, I, ...",2016-08-11 12:29:13,,chrissy-robinson@archive.org,
440,"Finance, Public",,,,,,O (je 0/L-li^ O- t L' \nyyV (. '^iy -c- ■1:^^ ...,France. Assemblée des notables (1787-1788).,The Newberry Library,ABBYY FineReader 11.0,...,,,Canon EOS 5D Mark II,2017-02-14 14:17:39,2017-02-14 13:00:36,"Martin & Walter. Révolution française, I, 5810",2017-02-14 14:17:41,,chrissy-robinson@archive.org,
460,Letters patent,,,,,,"DIFFÉRENS DISCOURS AU ROI, Par M. de Valentin,...",France. Cour des aides (Paris).,The Newberry Library,ABBYY FineReader 11.0,...,,,Canon EOS 5D Mark II,2016-04-18 20:47:14,2016-04-18 19:36:56,,2016-04-18 20:47:16,,chrissy-robinson@archive.org,
676,Adultery,,,,,,Cmc. m lettre D E M. baudet DE jordan. BffiNEV...,"Bergasse, Nicolas, 1750-1832.",The Newberry Library,ABBYY FineReader 11.0,...,,,Canon EOS 5D Mark II,2016-08-16 14:29:17,2016-08-16 12:59:09,,2016-08-16 14:29:19,,chrissy-robinson@archive.org,
683,Botanical literature,,,p. ;cm.,,,"I7^y P RO s PE CT U s, NOUVEAU TRAITÉ Lu \ -M ...","Valade, veuve, fl. 1784-1799, printer.",The Newberry Library,ABBYY FineReader 11.0,...,,,,2017-06-28 19:59:26,2017-06-28 18:55:26,,2017-06-28 19:59:28,http://vufind.carli.illinois.edu/vf-nby/Record...,chrissy-robinson@archive.org,


In [89]:
df['text_content'].head()

114    LES VÉRITABLES INTÉRÊTS DES VftC \n% ^ TROIS O...
123    V e* %CK° ‘AïfVflfi'' EXTRAIT DES REGISTRES D ...
184    PROCÈS-VERBAL DE V INSTALLATION DU GRAND BAILL...
220    |5 0 ÜTT» oit +^-ér§ V ( I )  RÉFLEXIONS Sur V...
230    ■ LES voeux DE LA PATRIE FORMÉS ET SATISFAITS»...
Name: text_content, dtype: object

### Generating Yearly CSV Files

In [45]:
df1 = df.loc[df['date'] == '1791']

In [46]:
len(df1)

1320

In [47]:
df1.to_csv("/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1791.csv")

In [48]:
df2 = pd.read_csv("/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1791.csv")
len(df2)

1320

### PRE_PROCESSING YEARLY DATA

In [49]:
def remove_single_letters(text):
    return ' '.join([word for word in str(text).split() if len(word) > 1])


In [50]:
# Pre-processing text

def clean_text(df):
    df['text_content'] = df['text_content'].str.lower()
    df['text_content'] = df['text_content'].str.replace(r'[^\w\s]', '', regex=True)
    df['text_content'] = df['text_content'].str.replace(r'\d+', '', regex=True)
    df['text_content'] = df['text_content'].str.strip()
    df['text_content'] = df['text_content'].str.replace(r'\s+', ' ', regex=True)
    df['text_content'] = df['text_content'].apply(remove_single_letters)

    return df

In [52]:
fp1 = "/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1787.csv"
fp2 = "/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1789.csv"
fp3 = "/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1790.csv"
fp4 = "/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Yearly Data/data_frc_1791.csv"
output_dir = "/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Pre-Processed Yearly Data"


In [53]:
for fp in [fp1, fp2, fp3, fp4]:
    df = pd.read_csv(fp)
    df = clean_text(df)
    
    # save cleaned data
    file_name = os.path.basename(fp)
    output_path = os.path.join(output_dir, file_name)
    
    df.to_csv(output_path)

In [62]:
df3 = pd.read_csv("/Users/ridhipurohit/Documents/French Revolution Data/Rep Learning/Pre-Processed Yearly Data/frc_1788_norm_text.csv")

In [55]:
df3['text_content'].head()

0    ffcfrff mémoire sur les finances et sur le cré...
1    je lli yyv iy sc prononcé de lordre du roi et ...
2    différens discours au roi par de valentin des ...
3    cmc lettre baudet de jordan bffinevb mm übrary...
4    iy ro pe ct nouveau traité lu physique et écon...
Name: text_content, dtype: object

In [57]:
df3['text_content'].head()

0    fy _ji bref mémoire le lieutenantgénéral au ba...
1    relation de tout ce qui sest passé de relatif ...
2    jj délibération commission intermédiaire de ha...
3    curé de picardie un évêque sur le droit des cu...
4    prononcé par rômantribuths feur aix procureur ...
Name: text_content, dtype: object

In [59]:
df3['text_content'].head()

0    arrêté de la chambre des comptes da ix jv octo...
1    adresse de plusieurs membres du clergé de sain...
2    de notre honneur nous obligent fous prémunir c...
3    apologie oej ftyïv du serment par un prêtée de...
4    observations sur le chapitre vih un imprimé ay...
Name: text_content, dtype: object

In [61]:
df3['text_content'].head()

0    va de dépense qve rend le directoire du départ...
1    éloge funebre de mirabeau prononcé le mai la c...
2    nouvelle instruction en forme de conférence ou...
3    triomphe de rité un ay san the ri yertissement...
4    compte rendu ses commettans par de bonneville ...
Name: text_content, dtype: object

In [63]:
df3['text_content'].head()

0    les véritables intérêts des vftc trois ordres ...
1    ck aïfvflfi extrait des registres sénéchal et ...
2    procèsverbal de installation du grand bailliag...
3    ütt oit ér réflexions sur administration de la...
4    les voeux de la patrie formés et satisfaits le...
Name: text_content, dtype: object

In [94]:
def count_words(text):
    return len(str(text).split())

df3 = df['text_content'].apply(count_words)

# Calculate the total number of words
total_words = df3.sum()
print("Total words:", total_words)

Total words: 2658366


In [95]:
def count_single_letters(text):
    return sum(len(word) == 1 for word in str(text).split())

df4 = df['text_content'].apply(count_single_letters)

# Calculate the total number of single-letter words
total_single_letters = df4.sum()
print("Total single-letter words:", total_single_letters)

Total single-letter words: 154332


In [97]:

df5 = df['text_content'].apply(count_words)

# Calculate the total number of words
total_words = df5.sum()
print("Total words:", total_words)

Total words: 2504034
