In [2]:
import pandas as pd
import numpy as np
import nltk
import requests
import re
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
tab_colors_list = list(mcolors.TABLEAU_COLORS.keys())
import matplotlib.lines as mlines
import seaborn as sns

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

ModuleNotFoundError: No module named 'pandas'

In [None]:
# To access the gsheet, you need a Google Service Account key JSON file.
# Mine is located in my personal space on sciencedata.dk, so it is read from there:
conf = sddk.configure()

# (1) read the file and parse its content
# conf[0] is used as an object with a .get() method and conf[1] as the prefix the file name
# is appended to -- presumably a requests session and the base URL; verify against sddk
file_data = conf[0].get(conf[1] + "ServiceAccountsKey.json").json()
# (2) transform the content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes used here (spreadsheets feed and Drive)
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) use the scoped credentials to authenticate the gspread client
gc = gspread.Client(auth=scoped_credentials)
# (5) open the spreadsheet identified by its URL; `terms` is the handle used for the export further down
terms = gc.open_by_url("https://docs.google.com/spreadsheets/d/1nONTEwp42CVnq3iCiONrFbJedIcYtBV-l4Bil5mU7Eo/edit?usp=sharing")

In [None]:
# read edh dataset
# the third argument of read_file is the id of the folder the file is read from;
# "df" presumably requests a pandas DataFrame (EDH is used as one below) -- verify against sddk docs
publicfolder = "b6b6afdb969d378b70929e86e58ad975"
EDH = sddk.read_file("EDH_text_cleaned_2021-01-21.json", "df", publicfolder)
# preview the first rows only
EDH.head(5)

In [None]:
# read edcs dataset
# NOTE: this rebinds `publicfolder`, so from here on it holds the EDCS folder id, not the EDH one;
# "df" presumably requests a pandas DataFrame (EDCS is used as one below) -- verify against sddk docs
publicfolder = "1f5f56d09903fe259c0906add8b3a55e"
EDCS = sddk.read_file("EDCS_text_cleaned_2021-01-21.json", "df", publicfolder)
# preview the first rows only
EDCS.head(5)

In [None]:
# load the dictionary of declined occupation terms (term -> list of its declined forms)
# from the JSON file in the local data folder
with open("../data/occups_declined_dict.json", "r") as terms_file:
    occups_declined_dict = json.loads(terms_file.read())

In [None]:
# the extraction walks the terms in dictionary order, so look at the first 20 keys
# to check that they are arranged as expected (from the longest...)
list(occups_declined_dict)[:20]

# Function to extract terms

In [None]:
def extract_occup_v1(inscription_text, occups_dict=None):
    """Find occupation terms in an inscription by plain substring counting (first version).

    The text is lower-cased once and every declined form is counted as a
    substring, i.e. without respecting word boundaries, so forms embedded in
    longer words are counted as well. After a form has been counted it is
    removed from the working text, so that forms checked later cannot match
    the same characters again.

    Parameters
    ----------
    inscription_text : str or any
        Text of one inscription. Anything that is not a string (e.g. None or
        NaN for a missing text) yields an empty list.
    occups_dict : dict, optional
        Mapping ``term -> list of declined forms``. Defaults to the
        notebook-level ``occups_declined_dict``. Terms are processed in
        dictionary order.

    Returns
    -------
    list of str
        The term (dictionary key) repeated once per occurrence found.
    """
    if occups_dict is None:
        occups_dict = occups_declined_dict
    if not isinstance(inscription_text, str):
        return []
    # count AND remove on the same lower-cased text; counting on a lower-cased copy
    # while removing from the original-case text left capitalised matches in place
    text = inscription_text.lower()
    occups_found = []
    for occup, occup_morphs in occups_dict.items():
        for occup_morph in occup_morphs:
            # an empty form would "occur" between every two characters; skip unusable entries
            if not isinstance(occup_morph, str) or not occup_morph:
                continue
            occup_morph_N = text.count(occup_morph)
            if occup_morph_N > 0:
                occups_found.extend([occup] * occup_morph_N)
                text = text.replace(occup_morph, "")
    return occups_found

In [None]:
def extract_occup(inscription_text, occups_dict=None):
    """Find occupation terms occurring as whole words in an inscription.

    The text is lower-cased and every declined form of every term is searched
    for as a whole word, i.e. not directly preceded or followed by a word
    character. Each whole-word occurrence is counted and then blanked out of
    the working text, so that a form checked later (e.g. a single word that is
    also part of an already matched multi-word term) cannot be counted on the
    same characters again. Occurrences that are only part of a longer word are
    neither counted nor removed.

    Parameters
    ----------
    inscription_text : str or any
        Text of one inscription. Anything that is not a string (e.g. None or
        NaN for a missing text) yields an empty list.
    occups_dict : dict, optional
        Mapping ``term -> list of declined forms``. Defaults to the
        notebook-level ``occups_declined_dict``. Terms are processed in
        dictionary order, so longer terms should come first.

    Returns
    -------
    list of str
        The term (dictionary key) repeated once per whole-word occurrence.
    """
    if occups_dict is None:
        occups_dict = occups_declined_dict
    if not isinstance(inscription_text, str):
        return []
    text = inscription_text.lower()
    occups_found = []
    for occup, occup_morphs in occups_dict.items():
        for occup_morph in occup_morphs:
            # skip unusable entries instead of hiding them behind a bare except
            if not isinstance(occup_morph, str) or not occup_morph:
                continue
            # cheap substring pre-check, so the regex is built only for candidates
            if occup_morph not in text:
                continue
            # lookarounds do not consume the neighbouring characters, so two occurrences
            # separated by a single space are both found; re.escape keeps any regex
            # metacharacter in a form literal
            whole_word = re.compile(r"(?<!\w)" + re.escape(occup_morph) + r"(?!\w)")
            # remove exactly the occurrences that were counted (and nothing else),
            # leaving a space behind so that the neighbouring tokens never merge
            text, occup_morph_N = whole_word.subn(" ", text)
            occups_found.extend([occup] * occup_morph_N)
    return occups_found

In [None]:
# quick manual check: the input is lower-cased before matching and the candidate word is
# delimited by non-word characters (a space before, a comma after);
# the result depends on whether "lotori" is listed in occups_declined_dict -- TODO confirm
extract_occup("fdfsdf. Lotori,")

# Test on a sample

In [None]:
# sample for testing
# .copy() makes the sample an independent frame: new columns are assigned to it in the
# following cells, which on a bare slice of EDH raises pandas' SettingWithCopyWarning
EDH_sample = EDH[:1000].copy()
len(EDH_sample)

In [None]:
%%time
EDH_sample["occups"] = EDH_sample["clean_text_interpretive_word"].apply(extract_occup)
EDH_occups_list = [el for sublist in EDH_sample["occups"].tolist() for el in sublist]
len(EDH_occups_list) 
# v1: 3.03s, 946 found
# once we remove after first match, then only 636
# but once we are sensitive to repeated occurences, we get 742
# when looking for within a lower case inscription, the number increases to 902

In [None]:
# number of terms found in each inscription of the sample
EDH_sample["occups_N"] = EDH_sample["occups"].map(len)

In [None]:
# how many inscriptions of the sample mention at least one term
int((EDH_sample["occups_N"] > 0).sum())

In [None]:
# create sample and export it to gsheet
# keep only the inscriptions with at least one term found and the columns needed for checking
EDH_occups_sample = EDH_sample.loc[
    EDH_sample["occups_N"] > 0, ["id", "clean_text_interpretive_word", "occups"]
]
# add_worksheet fails when a worksheet with this title already exists, so re-running the
# notebook used to crash here; the export is skipped in that case (rather than overwriting
# a sheet that may already have been edited by hand)
sample_sheet_title = "EDH_occups_sample"
if sample_sheet_title in [ws.title for ws in terms.worksheets()]:
    print("Worksheet '" + sample_sheet_title + "' already exists - export skipped")
else:
    set_with_dataframe(terms.add_worksheet(sample_sheet_title, 1, 1), EDH_occups_sample)

# Application on the whole dataset

In [None]:
%%time
EDH["occups"] = EDH["clean_text_interpretive_word"].apply(extract_occup)

In [None]:
# flatten the per-inscription lists into one list of all term occurrences in EDH
EDH_occups_list = [occup for found in EDH["occups"] for occup in found]
# total number of occurrences, followed by the 30 most frequent terms with their counts
print(len(EDH_occups_list))
print(nltk.FreqDist(EDH_occups_list).most_common(30))

first results (for comparison):

```[('curator', 2833), ('ustor', 2092), ('plastes', 1076), ('psaltes', 866), ('cerdo', 730), ('uctor', 592), ('faber', 458), ('promus', 320), ('sagittarius', 236), ('erarius', 217), ('scriba', 194), ('negotiator', 191), ('medicus', 137), ('tector', 136), ('centonarius', 132), ('emporus', 113), ('gladiator', 112), ('aerarius', 104), ('marmorarienses', 102), ('conditor', 76), ('vexillarius', 74), ('lanius', 72), ('scaenicus', 71), ('tesserarius', 70), ('argentarius', 65), ('mensor', 62), ('scriptor', 61), ('figlus', 56), ('nauta', 53), ('cellio', 52)]```

In [None]:
# number of terms found in each EDH inscription
EDH["occups_N"] = EDH["occups"].map(len)

In [None]:
%%time
EDCS["occups"] = EDCS["clean_text_interpretive_word"].apply(extract_occup)

In [None]:
# flatten the per-inscription lists into one list of all term occurrences in EDCS
EDCS_occups_list = [occup for found in EDCS["occups"] for occup in found]
# total number of occurrences, followed by the 30 most frequent terms with their counts
print(len(EDCS_occups_list))
print(nltk.FreqDist(EDCS_occups_list).most_common(30))

Results of the previous version (for comparison):
```40509
[('plastes', 5934), ('curator', 4368), ('figlus', 4251), ('cerdo', 2905), ('ustor', 2863), ('psaltes', 2205), ('faber', 2162), ('pollio', 907), ('medicus', 723), ('uctor', 671), ('scriba', 558), ('mercator', 546), ('aerarius', 517), ('promus', 485), ('cocus', 437), ('figulus', 419), ('sagittarius', 376), ('marmorarienses', 360), ('lanius', 334), ('emporus', 316), ('agricola', 310), ('fullo', 279), ('vexillarius', 273), ('gladiator', 248), ('restio', 246), ('aedifex', 245), ('negotiator', 244), ('vitor', 242), ('tector', 230), ('centonarius', 220)]
```

In [None]:
# number of terms found in each EDCS inscription
EDCS["occups_N"] = EDCS["occups"].map(len)

# TERMS - overview

In [None]:
# overview of the EDH results; an inscription counts as dated when "not_before" is filled in
print("\n".join([
    "EDH - number of term occurances: " + str(EDH["occups_N"].sum()),
    "EDH - number of inscriptions with at least one term mentioned: "
    + str(len(EDH.loc[EDH["occups_N"] > 0])),
    "EDH - number of dated inscriptions with at least one term mentioned: "
    + str(len(EDH.loc[EDH["not_before"].notnull() & (EDH["occups_N"] > 0)])),
    "EDH - number of term occurances in dated inscriptions: "
    + str(EDH.loc[EDH["not_before"].notnull(), "occups_N"].sum()),
]))

In [None]:
# overview of the EDCS results; an inscription counts as dated when "dating to" is filled in
print("\n".join([
    "EDCS - number of term occurances: " + str(EDCS["occups_N"].sum()),
    "EDCS - number of inscriptions with at least one term mentioned: "
    + str(len(EDCS.loc[EDCS["occups_N"] > 0])),
    "EDCS - number of dated inscriptions with at least one term mentioned: "
    + str(len(EDCS.loc[EDCS["dating to"].notnull() & (EDCS["occups_N"] > 0)])),
    "EDCS - number of term occurances in dated inscriptions: "
    + str(EDCS.loc[EDCS["dating to"].notnull(), "occups_N"].sum()),
]))

In [None]:
# login to our project folder, owned by my AU account 648597@au.dk
# NOTE: this rebinds `conf`, replacing the configuration created at the top of the notebook;
# the write calls that follow use this one
conf = sddk.configure("SDAM_root", "648597@au.dk")

In [None]:
# save both datasets, now including the "occups" and "occups_N" columns added above,
# as JSON files under SDAM_data/social_diversity/ using the configuration from the login cell
sddk.write_file("SDAM_data/social_diversity/EDH_occups.json", EDH, conf)
sddk.write_file("SDAM_data/social_diversity/EDCS_occups.json", EDCS, conf)