Claes Pauline. Master Digital Text Analysis. Student ID: 20163274

# Metadata
This script contains all code used for adding metadata to data frames. 

In [1]:
import pandas as pd

## Add wordcounts to French data
While the English text data already come with word counts from the EEBO and EMMA metadata, I do not have these for the French data, so they need to be counted. For the French texts from both Frantext and Google Books, all text data were parsed into a data frame of four columns (Word, Lemma, POS, file_name) containing one row per token. It therefore makes sense to group that data frame per file name and count the number of rows for each file (one word = one row). The rows do need to be filtered first, since punctuation and spaces must be excluded from the count. 


In [8]:
def read(path):
    """Load the CSV file at ``path`` into a pandas data frame and return it."""
    frame = pd.read_csv(path)
    return frame

In [78]:
# Load the four Word/Lemma/POS (WLP) tables: Frantext and EPUB texts, each in an
# "early" and a "later" file. The paths are absolute and machine-specific.
wlp_frantext_early = read("/Users/paulineclaes/Documents/dta/Thesis/Data/Dataframes/WLP/frantext_WLP_early.csv")
wlp_epub_early = read("/Users/paulineclaes/Documents/dta/Thesis/Data/Dataframes/WLP/epub_WLP_early.csv")
wlp_frantext_later = read("/Users/paulineclaes/Documents/dta/Thesis/Data/Dataframes/WLP/frantext_WLP_later.csv")
wlp_epub_later = read("/Users/paulineclaes/Documents/dta/Thesis/Data/Dataframes/WLP/epub_WLP_later.csv")

In [12]:
def get_separate_df_per_filename(df):
    """Print the number of word tokens of every file in a WLP data frame.

    Each row of ``df`` is one token, so the word count of a file is the number
    of its rows whose POS tag is not punctuation or whitespace.

    Args:
        df: WLP data frame with at least the columns ``file_name`` and ``POS``.

    Returns:
        None. One line per file is printed, in order of first appearance.
    """
    # Keep rows with a boolean mask instead of dropping them by index label:
    # label-based dropping also removes word rows that share an index label
    # with a punctuation row (e.g. after concatenating frames), which makes
    # the count too low.
    is_word = ~df["POS"].isin(["PUNCT", "PONCT", "SPACE"]).to_numpy()

    # Count all files in one pass instead of filtering the frame once per file.
    word_counts = df["file_name"][is_word].value_counts(sort=False).to_dict()

    for filename in df["file_name"].unique():
        # a file that consists only of punctuation/spaces is reported with 0
        tokens = int(word_counts.get(filename, 0))
        print(f"filename: {filename} \tTokens: {tokens}\n")

In [13]:
def get_dict_per_filename(df):
    """Count the word tokens of every file in a WLP data frame.

    Each row of ``df`` is one token, so the word count of a file is the number
    of its rows whose POS tag is not punctuation or whitespace.

    Args:
        df: WLP data frame with at least the columns ``file_name`` and ``POS``.

    Returns:
        Dictionary with key = file name and value = number of words (int),
        in order of first appearance of the file names. Files that contain
        only punctuation/spaces are included with a count of 0.
    """
    # Keep rows with a boolean mask instead of dropping them by index label:
    # label-based dropping also removes word rows that share an index label
    # with a punctuation row (e.g. after concatenating frames), which makes
    # the count too low.
    is_word = ~df["POS"].isin(["PUNCT", "PONCT", "SPACE"]).to_numpy()

    # Count all files in one pass instead of filtering the frame once per file.
    word_counts = df["file_name"][is_word].value_counts(sort=False).to_dict()

    return {
        filename: int(word_counts.get(filename, 0))
        for filename in df["file_name"].unique()
    }

In [14]:
def merge_dicts(dict1, dict2):
    """Add all entries of ``dict2`` to ``dict1`` and return ``dict1``.

    ``dict1`` is modified in place (no copy is made); for keys present in
    both dictionaries the value from ``dict2`` wins. ``dict2`` is unchanged.
    """
    merged = dict1  # same object, not a copy
    merged.update(dict2)
    return merged

In [None]:
## EXAMPLE 
# Build the dictionary that holds the word count of every French file (early files).

author_dict_frantext = get_dict_per_filename(wlp_frantext_early) # wordcounts for frantext WLP
author_dict_epub = get_dict_per_filename(wlp_epub_early) # wordcounts for EPUB WLP
# merge_dicts updates its first argument in place and returns it, so afterwards
# author_dict and author_dict_frantext are the same, combined dictionary
author_dict = merge_dicts(author_dict_frantext, author_dict_epub) # add them to one dictionary

In [110]:
### EXAMPLE
# Read the concordance data frame that the word counts will be mapped onto
# (the mapping is done per file name).
df = pd.read_csv("/Users/paulineclaes/Documents/dta/Thesis/Data/Dataframes/concordance/all_early_concordance.csv")

In [112]:
# look up each row's file name in the word-count dictionary and store the
# result as a new column "all_tokens" at column position 5
token_counts = df["filename"].map(author_dict)
df.insert(loc=5, column="all_tokens", value=token_counts)

# Merge metadata with concordance dataframe

## 1. Assigning a unique ID to each author

In [3]:
# read the metadata spreadsheet into a data frame (absolute, machine-specific path)
df = pd.read_excel("/Users/paulineclaes/Documents/dta/thesis/ClaesPauline_thesis_finaleversie/data/final_metadata.xlsx")

In [152]:
# number the authors 1, 2, 3, ... in order of first appearance (sort=False)
# and store that number as a new column "author_id" at column position 3
author_numbers = df.groupby(["author"], sort=False).ngroup() + 1
df.insert(loc=3, column="author_id", value=author_numbers)

In [154]:
def _number_texts_of_author(author_rows):
    """Return one author's rows with a running text number in column position 5."""
    numbered = author_rows.reset_index()  # fresh 0..n-1 index; the old row labels are kept as a column
    numbered.insert(5, "text_id_per_author", numbered.index + 1)  # count restarts at 1 for each author
    return numbered


# one numbered data frame per author_id, in the order groupby yields the groups
author_id_list = [
    _number_texts_of_author(author_df)
    for _, author_df in df.groupby(["author_id"])
]

new_df = pd.concat(author_id_list).reset_index(drop=True)  # stack them into one data frame


In [156]:
# combine author ID and per-author text ID into "<author_id>_<text_id_per_author>",
# so that each text effectively has a unique ID
id_columns = new_df.iloc[:, 4:6]  # column positions 4 and 5 must hold the two ID columns read below
text_ids = []
for _, row in id_columns.iterrows():
    text_ids.append(f"{row['author_id']}_{row['text_id_per_author']}")

new_df.insert(6, "authorId_textId", text_ids)




In [159]:
def _text_kind(identifier):
    """Classify one data identifier as translation, source text or reference text."""
    if "T" in identifier:  # checked first, so an identifier containing "T" is always "transl"
        return "transl"
    if "FS" in identifier:
        return "srcTxt"
    return "ref"


# insert a new column (position 7) that indicates whether a text is a
# translation ("transl"), a source text ("srcTxt") or a reference text ("ref")
new_df.insert(7, "transl_ref_srcTxt", [_text_kind(value) for value in new_df["data_identifier"]])

## 2. Mapping metadata to concordance dataframe 
(inserting unique identifiers per text, and other information)

In [106]:
#function to map metadata dictionary with key=filename to the dataframe

def map_filename_dict_to_df(source_df, target_df, target_index, target_colname, col1, col2):
    """Copy one metadata column from ``source_df`` into ``target_df``.

    A lookup table (``col1`` value -> ``col2`` value) is built from
    ``source_df`` and applied to ``target_df[col1]``; the looked-up values are
    inserted into ``target_df`` as a new column.

    Arguments:
    - source_df: metadata data frame that holds the information to transfer.
    - target_df: data frame that receives the new column (modified in place).
    - target_index: column position at which the new column is inserted.
    - target_colname: name of the new column.
    - col1: name of the key column; it is read from both data frames.
    - col2: name of the column in source_df whose values are transferred.

    Returns:
    The same target_df object, now containing the new column. Rows whose key
    does not occur in source_df get NaN. If a key occurs several times in
    source_df, the value of its last occurrence is used.
    """
    lookup = dict(zip(source_df[col1], source_df[col2]))
    new_column = target_df[col1].map(lookup)
    target_df.insert(target_index, target_colname, new_column)
    return target_df

In [None]:
# EXAMPLE of doing it for 1 column


# add unique author id based on the unique data identifier
# NOTE(review): `m` and `f` are not defined in the visible cells; presumably
# `m` is the metadata data frame and `f` the data frame that receives the new
# column - confirm before running this cell.
# The new column "author_id" is inserted at column position 3 of `f`.
f = map_filename_dict_to_df(source_df = m, 
                            target_df = f,
                            target_index=3,
                            target_colname="author_id",
                            col1 = "data_identifier", 
                            col2 = "author_id"
)

In [134]:
# EXAMPLE of doing it in bulk at once


# Metadata columns to copy. Each column is inserted at the position it has in
# this list, so the copied columns end up first, in this order.
col_list = ["period", 
            "data_identifier", 
            "author_id", 
            "text_id_per_author", 
            "authorId_textId", 
            "title", 
            "USTC_subject_classification", 
            "author", 
            "textDate", "wordcount"]
print(len(col_list))
index_list = list(range(len(col_list)))  # insertion positions 0 .. len(col_list) - 1
print(index_list, len(index_list))

# NOTE(review): `m` and `addData` are not defined in the visible cells;
# presumably `m` is the metadata data frame and `addData` the data frame that
# receives the columns - confirm before running this cell.
for col_ix, col_name in enumerate(col_list):
    # look the value up via the shared "filename" column and insert it
    addData = map_filename_dict_to_df(
        source_df=m,
        target_df=addData,
        target_index=col_ix,
        target_colname=col_name,
        col1="filename",
        col2=col_name,
    )


10
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 10


## 3. Add numeric period category

For the English data, we have a classification into decades: 1580-1589, 1590-1599, ...

We now want to turn this into a numeric variable. There are 5 decades in total (1580-1589, 1590-1599, 1600-1609, 1680-1689, 1690-1699). These will be assigned a number chronologically. 

In [2]:
# load the English (en) and French (fr) data frames (absolute, machine-specific paths)
en = pd.read_excel('/Users/paulineclaes/Documents/dta/thesis/finaldata/final_GoToInf.xlsx')
fr = pd.read_excel('/Users/paulineclaes/Documents/dta/thesis/finaldata/final_AllerINF.xlsx')

In [3]:
# preview the first rows of the English data
en.head()

Unnamed: 0,period,data_identifier,timeframe,author_id,text_id_per_author,authorId_textId,filename,title,author,textDate,...,position,position_fronting,argmt,adverbial,split,attention?,preceding_context,match,following_context,note
0,1600-1609,ET13,early,11,1,11_1,A05339.xml,Noua Francia : or The description of that part...,"Erondelle, P.",1609.0,...,,,,,,,", neere about the Açors, well fil led with ...",going,a fiſhing for New-found-land-fiſh. And they as...,
1,1600-1609,ET09,early,8,1,8_1,A01991.xml,Admirable and memorable histories containing t...,"Grimeston, Edward",1607.0,...,,,,,,,", by reason of the greatnesse and length. Thi...",going,"a iourney with his Wa gon, was be-nighted and ...",
2,1600-1609,ET13,early,11,1,11_1,A05339.xml,Noua Francia : or The description of that part...,"Erondelle, P.",1609.0,...,,,,,,,"hogſ heads of Meale, which were giuen to the...",going,a way. The eleuenth of Auguſt the ſaid Monſi...,
3,1600-1609,ET14,early,12,1,12_1,99850354.xml,"Fovvre bookes, of the institution, vse and doc...",anon10,1600.0,...,,,,,,,": that is, of a s...",going,"about it seauen times, ...",
4,1600-1609,ET14,early,12,1,12_1,99850354.xml,"Fovvre bookes, of the institution, vse and doc...",anon10,1600.0,...,,,,,,,impotencie weaknesse ...,going,"about to adore the Head in heauen, ...",


In [4]:
# number the decades 1, 2, 3, ... following the sorted order of the period
# labels and store the number as a new column at column position 1
period_numbers = en.groupby(["period"], sort=True).ngroup() + 1
en.insert(loc=1, column="period_category", value=period_numbers)

> However, for French data, this is less straightforward, since these were not first classified into decades, and have a wider range of text dates, since they precede the corresponding English translation.

> Therefore, the range of possible text dates for French (1502–1699) is divided into 5 chunks of roughly equal length: four spans of 41 years each and a final span of 34 years. 

In [41]:
# preview the first rows of the French data
fr.head()

Unnamed: 0,timeframe,kind,data_identifier,author_id,text_id_per_author,authorId_textId,fr_source_filename,fr_source_title,fr_source_author,fr_source_textDate,...,vpers,pos,previous50,prev1,aller,aller_POS,INF,next1,next1_POS,next50
0,early,french_original,EFS01,35,1,35_1,WFtemp_LABE_Debat_de_folie_et_amour,Débat de folie et d'amour,"Labé, Louise",1555,...,3sg,inf,"à gouverner les Viles , sans que lon l' apelle...",d',aller,VINF,planter,des,P+D,"chous . Le fol ira tant et viendra , en donner..."
1,early,french_original,EFS01,35,1,35_1,WFtemp_LABE_Debat_de_folie_et_amour,Débat de folie et d'amour,"Labé, Louise",1555,...,3sg,pres,ay dit . Quand Mercure ut fini la defense de F...,",",và,VP,prononcer,un,DET,arrest interlocutoire en cette maniere : Pour ...
2,early,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",1609,...,3sg,past,de Canada & Hochelaga au temps de post Tacques...,est,allé,VPP,rechercher,leurs,DET,"pelleteries , Canada que pour icelles ils ont ..."
3,early,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",1609,...,3pl,pres,qu' vn autre & n' en perdent point yn tour de ...,les,vont,VP,voir,de,ADP,plus grand chose : comme pardeça quand on pres...
4,early,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",1609,...,inf,pres,à l' vn des bours dudit lac ne nous apparoisso...,&,aller,VINF,chercher,passage,NOUN,tre ou cinq rivieres toutes sortantes dudit ff...


In [3]:
import numpy as np

In [58]:
# Inclusive first/last year of each period for the French source texts; the
# position in this list (+1) is the numeric period category.
period_bounds = [
    (1502, 1542),  # 1
    (1543, 1583),  # 2
    (1584, 1624),  # 3
    (1625, 1665),  # 4
    (1666, 1699),  # 5
]

fr_dates = fr['fr_source_textDate']
# one boolean mask per period, aligned with the rows of `fr`
conditions = [(fr_dates >= first) & (fr_dates <= last) for first, last in period_bounds]

# Integer labels: the category is meant to be a numeric variable and the
# English period_category (ngroup() + 1) is an integer as well. String labels
# would also clash with np.select's integer default, which recent NumPy
# releases refuse to promote to a common dtype (TypeError).
values = [1, 2, 3, 4, 5]

# rows whose date falls outside 1502-1699 (or is missing) get the default 0
fr.insert(1, 'period_category', np.select(conditions, values, default=0))

In [59]:
# preview the first rows again to check the newly inserted period_category column
fr.head()

Unnamed: 0,timeframe,period_category,kind,data_identifier,author_id,text_id_per_author,authorId_textId,fr_source_filename,fr_source_title,fr_source_author,...,vpers,pos,previous50,prev1,aller,aller_POS,INF,next1,next1_POS,next50
0,early,2,french_original,EFS01,35,1,35_1,WFtemp_LABE_Debat_de_folie_et_amour,Débat de folie et d'amour,"Labé, Louise",...,3sg,inf,"à gouverner les Viles , sans que lon l' apelle...",d',aller,VINF,planter,des,P+D,"chous . Le fol ira tant et viendra , en donner..."
1,early,2,french_original,EFS01,35,1,35_1,WFtemp_LABE_Debat_de_folie_et_amour,Débat de folie et d'amour,"Labé, Louise",...,3sg,pres,ay dit . Quand Mercure ut fini la defense de F...,",",và,VP,prononcer,un,DET,arrest interlocutoire en cette maniere : Pour ...
2,early,3,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",...,3sg,past,de Canada & Hochelaga au temps de post Tacques...,est,allé,VPP,rechercher,leurs,DET,"pelleteries , Canada que pour icelles ils ont ..."
3,early,3,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",...,3pl,pres,qu' vn autre & n' en perdent point yn tour de ...,les,vont,VP,voir,de,ADP,plus grand chose : comme pardeça quand on pres...
4,early,3,french_original,EFS13,47,1,47_1,LESCARBOT_cleaned,"Histoire de la Nouvelle France (Lescarbot, Marc)","Lescarbot, Marc",...,inf,pres,à l' vn des bours dudit lac ne nous apparoisso...,&,aller,VINF,chercher,passage,NOUN,tre ou cinq rivieres toutes sortantes dudit ff...


> Inserting the same period category in the metadata dataframe.

In [None]:
# load the metadata spreadsheet (absolute, machine-specific path)
meta = pd.read_excel('/Users/paulineclaes/Documents/dta/thesis/finaldata/final_metadata.xlsx')

In [63]:
# split the metadata on the text kind stored in 'transl_ref_srcTxt'
is_source_text = meta['transl_ref_srcTxt'] == 'srcTxt'
# french subset (which does not yet have a classification per decade)
meta_fr_subset = meta[is_source_text]
# english subset (which already has a classification per decade)
meta_en_subset = meta[~is_source_text]

In [64]:
# insert period category in english data: decades are numbered 1, 2, 3, ...
# following the sorted order of the period labels (new column at position 3)
meta_en_period_numbers = meta_en_subset.groupby(["period"], sort=True).ngroup() + 1
meta_en_subset.insert(loc=3, column="period_category", value=meta_en_period_numbers)

In [None]:
# insert period category in French data
# The boolean masks have to be computed on the metadata rows themselves: masks
# built from the `fr` concordance frame are aligned with that frame's rows and
# cannot be used to label the rows of meta_fr_subset.
# NOTE(review): assumes the metadata stores the text date in a column named
# 'textDate' (the name used when metadata columns are mapped onto the
# concordance) - confirm against final_metadata.xlsx.
meta_fr_dates = meta_fr_subset['textDate']
# same inclusive period boundaries as used for the French concordance data
meta_fr_bounds = [(1502, 1542), (1543, 1583), (1584, 1624), (1625, 1665), (1666, 1699)]
meta_fr_conditions = [
    (meta_fr_dates >= first) & (meta_fr_dates <= last)
    for first, last in meta_fr_bounds
]
# integer labels 1-5 (numeric, like the English period_category); dates outside
# 1502-1699 or missing dates get the default 0
meta_fr_subset.insert(3, 'period_category',
                      np.select(meta_fr_conditions, [1, 2, 3, 4, 5], default=0))

In [72]:
# concatenating two data frames row-wise: English rows first, then French rows;
# the original row labels are kept because the index is not reset here
meta_new = pd.concat([meta_en_subset, meta_fr_subset])

In [76]:
# write to excel file 

#meta_new.to_excel('/Users/paulineclaes/Documents/dta/thesis/finaldata/final_metadata.xlsx',
#                 index=False,
#                 na_rep='NA')