# Requirements

In [1]:
import os
import json
import pandas as pd
import numpy as np
import json
import logging ### to monitor the code
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
from urllib.request import urlopen 
import io
import getpass
import re
from collections import defaultdict
from itertools import islice # to iterate through dicts

import nltk
from nltk.collocations import *


### plotting
### to use latex (important for greek fonts)
#! apt-get install texlive-latex-recommended 
#! apt install texlive-latex-extra
#! apt install dvipng
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

# Preferred: install the `anda` package and use `from anda import gr`.
#from anda import gr
# Fallback (currently active): download gr.py from GitHub and execute it in this
# namespace, which defines the helpers used by the lemmatization cells below.
# SECURITY NOTE(review): exec() runs whatever the `master` branch serves at the
# moment of execution; consider pinning the URL to a commit hash or switching to
# the package import above.
script_url = "https://raw.githubusercontent.com/sdam-au/anda_py/master/anda/gr.py"
gr_response = requests.get(script_url, timeout=60)
# Fail loudly on 4xx/5xx instead of handing an HTTP error page to exec().
gr_response.raise_for_status()
exec(gr_response.content)

In [2]:
# Open a connection to the shared sciencedata.dk folder; the call prompts
# interactively for the username and password (see the recorded output below).
shared_folder_name = "SDAM_root"
shared_folder_owner = "648597@au.dk"
conf = sddk.configure(shared_folder_name, shared_folder_owner)

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [3]:
# Accessing the Google Sheet requires a Google Service Account key (a JSON file).
# Mine is stored in my personal space on sciencedata.dk, so it is fetched from there.

# (1) download the key file and parse its JSON content
#     (conf[0] is presumably the authenticated session from sddk.configure -- TODO confirm)
key_response = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json")
file_data = key_response.json()

# (2) turn the parsed content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)

# (3) restrict the credentials to the scopes actually needed
gsheet_scopes = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
scoped_credentials = credentials.with_scopes(gsheet_scopes)

# (4) authenticate the gspread client with the scoped credentials
gc = gspread.Client(auth=scoped_credentials)

# Handle to the AGT overview spreadsheet
AGT_overview = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/1iVta_FuEDgUM_Lf_yByrdbbXNoVH_dnVZs6QRyYv1NM/edit?usp=sharing"
)

# AGT_metadata = gc.open_by_url("https://docs.google.com/spreadsheets/d/1hEUnL3E07F-EnE3wYnk1V91aXfPDrcnhFHKjD-04CM0/edit?usp=sharing")

In [5]:
# Load the dated AGT dataset from sciencedata.dk as a pandas DataFrame
# ("df" selects the DataFrame return type).
agt_input_path = "SDAM_data/AGT/AGT_dated_20201027.json"
AGT = sddk.read_file(agt_input_path, "df", conf)

# Lemmatization

In [6]:
# Lemmatize the raw text of every work into one flat list of lemmata.
# Only tokens tagged "n", "a" or "v" are kept (presumably nouns, adjectives and
# verbs -- confirm against anda.gr); involve_unknown=True is passed through to
# lemmatize_string unchanged.
# Applied column-wise on AGT["string"] rather than row-wise on the whole frame:
# only that column is read, so building a Series per row is unnecessary overhead.
AGT["lemmata"] = AGT["string"].apply(
    lambda text: lemmatize_string(
        text,
        all_lemmata=False,
        filter_by_postag=["n", "a", "v"],
        involve_unknown=True,
    )
)

In [7]:
# Number of retained lemmata per work, computed directly on the "lemmata" column
# (no row-wise DataFrame.apply needed), followed by the corpus-wide total.
AGT["lemmata_wordcount"] = AGT["lemmata"].apply(len)
AGT["lemmata_wordcount"].sum()

13926141

In [8]:
%%time
# Sentence-level lemmatization of every work's raw text, using the same
# part-of-speech filter as the "lemmata" column.
# Applied column-wise on AGT["string"]; the row-wise DataFrame.apply only ever
# read that one column.
AGT["lemmatized_sentences"] = AGT["string"].apply(
    lambda text: get_lemmatized_sentences(
        text,
        all_lemmata=False,
        filter_by_postag=["n", "a", "v"],
        involve_unknown=True,
    )
)

CPU times: user 2min 30s, sys: 315 ms, total: 2min 30s
Wall time: 2min 31s


In [13]:
# Count the lemmatized sentences of each work.
AGT["n_sentences"] = AGT["lemmatized_sentences"].map(len)

In [12]:
# Inspect a sample of raw text,
# e.g. Paul's letter to the Galatians (first 1000 characters).
paul_rows = AGT["author_id"] == "tlg0031paul"
AGT.loc[paul_rows, "string"].iloc[3][:1000]

'\n\n                    ΠΑΥΛΟΣ ἀπόστολος, οὐκ ἀπʼ ἀνθρώπων οὐδὲ διʼ ἀνθρώπου\n\nἀλλὰ διὰ Ἰησοῦ Χριστοῦ καὶ θεοῦ πατρὸς τοῦ\n\nἐγείραντος αὐτὸν ἐκ νεκρῶν,\n                     \n                    καὶ οἱ σὺν ἐμοὶ πάντες\n\nἀδελφοί, ταῖς ἐκκλησίαις τῆς Γαλατίας·\n                     \n                    χάρις ὑμῖν καὶ\n\n\n\n εἰρήνη ἀπὸ θεοῦ πατρὸς ἡμῶν καὶ κυρίου Ἰησοῦ Χριστοῦ,\n                     \n                    τοῦ δόντος ἑαυτὸν ὑπὲρ τῶν ἁμαρτιῶν ἡμῶν ὅπως ἐξέληται\n\nἡμᾶς ἐκ τοῦ αἰῶνος τοῦ ἐνεστῶτος πονηροῦ κατὰ τὸ\n\nθέλημα τοῦ θεοῦ καὶ πατρὸς ἡμῶν,\n                     \n                    ᾧ ἡ δόξα εἰς τοὺς\n\nαἰῶνας τῶν αἰώνων· ἀμήν.\n                     \n\n                    Θαυμάζω ὅτι οὕτως ταχέως μετατίθεσθε ἀπὸ τοῦ καλέσαντος\n\nὑμᾶς ἐν χάριτι Χριστοῦ εἰς ἕτερον εὐαγγέλιον,\n                     \n                    ὃ\n\nοὐκ ἔστιν ἄλλο· εἰ μή τινές εἰσιν οἱ ταράσσοντες ὑμᾶς καὶ\n\nθέλοντες μεταστρέψαι τὸ εὐαγγέλιον τοῦ χριστοῦ.\n                     \n     

In [10]:
# ...and the first 100 lemmata of the same work
paul_lemmata = AGT.loc[AGT["author_id"] == "tlg0031paul", "lemmata"].iloc[3]
print(paul_lemmata[:100])

['παυλος', 'ἀπόστολος', 'ἀπʼ', 'ἄνθρωπος', 'διʼ', 'ἄνθρωπος', 'Ἰησοῦς', 'χριστός', 'θεός', 'πατήρ', 'νεκρός', 'πᾶς', 'ἀδελφός', 'ἐκκλησία', 'γαλατίας', 'χάρις', 'ὑμῖν', 'εἰρήνη', 'θεός', 'πατήρ', 'ἡμός', 'κύριος', 'Ἰησοῦς', 'χριστός', 'ἁμάρτημα', 'ἡμός', 'ἐξαιρέω', 'ἡμός', 'αἰών', 'πονηρός', 'θέλημα', 'θεός', 'πατήρ', 'ἡμός', 'δόξα', 'αἰών', 'αἰών', 'ἄμη', 'θαυμάζω', 'μετατίθημι', 'χάρις', 'χριστός', 'ἕτερος', 'εὐαγγέλιον', 'τίνω', 'εἶμι', 'μεταστρέφω', 'εὐαγγέλιον', 'χριστός', 'ἄγγελος', 'οὐρανός', 'εὐαγγελίζομαι', 'ὑμῖν', 'παρʼ', 'εὐηγγελισάμεθα', 'ὑμῖν', 'ἀνάθεμα', 'προερέω', 'λέγω', 'εἶμι', 'εὐαγγελίζομαι', 'παρʼ', 'παραλαμβάνω', 'ἀνάθεμα', 'ἄνθρωπος', 'πειθός', 'θεός', 'ζάω', 'ἄνθρωπος', 'ἀρέσκω', 'ἄνθρωπος', 'ἀρέσκω', 'χριστός', 'δοῦλος', 'γνωρίζω', 'ὑμῖν', 'ἀδελφός', 'εὐαγγέλιον', 'εὐαγγελίζομαι', 'ὑπʼ', 'ἄνθρωπος', 'ἄνθρωπος', 'παραλαμβάνω', 'διδάσκω', 'διʼ', 'ἀποκάλυψις', 'Ἰησοῦς', 'χριστός', 'ἀκούω', 'ἐμην', 'ἀναστροφή', 'ποτός', 'ἰουδαϊσμῷ', 'ὑπερβολην', 'διώκω', 'ἐκκλησία',

In [14]:
# Persist the enriched DataFrame back to sciencedata.dk.
# Per the recorded output, sddk asks for confirmation when the target file
# already exists, so this cell can block an unattended run.
agt_output_path = "SDAM_data/AGT/AGT_20201027.json"
sddk.write_file(agt_output_path, AGT, conf)

A file with the same name ("AGT_20201027.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/AGT/AGT_20201027.json"
