In [0]:
import os
import pandas as pd
import numpy as np
import nltk
import pickle
import json
import sys
import regex ### to use extended regex library: https://pypi.org/project/regex/
import csv
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import regex as re
import xml.etree.cElementTree as ET
from urllib.request import urlopen
import unicodedata
from unicodedata import normalize

In [0]:
### lemmatization
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')
from cltk.tag.pos import POSTag

### stop words:
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.stop.greek.stops import STOPS_LIST
# NFC-normalise the stop words so they compare equal to the NFC-normalised
# lemmata produced in cleaning_raw_text.
stopwords = [normalize("NFC", word) for word in STOPS_LIST]


### proper names
# Resolve the CLTK data directory from the current user's home directory rather
# than a path that exists on a single machine, and read the Greek text with an
# explicit encoding instead of the platform default.
proper_names_path = os.path.expanduser(
    '~/cltk_data/greek/lexicon/greek_proper_names_cltk/proper_names.txt')
with open(proper_names_path, encoding='utf-8') as f:
    # NOTE(review): unlike the stop words, these names are not NFC-normalised;
    # confirm that they match the normalised tokens they are compared against.
    proper_names = [line.rstrip('\n') for line in f]

### to replace by means of regex
# Ordered (pattern, replacement) pairs. cleaning_raw_text applies them one after
# another with re.sub, so every substitution sees the result of the previous ones.
to_replace =[(r'\u2003' , ''),  # U+2003 EM SPACE
             (r'\]' , ''),
             (r'\[' , ''),
             (r'\· ' , ' '), 
             (r'\․․' , ' '),
             (r'\.', ' '),
             (r'\․' , ' '),
             (r'\⋮' , ''),
             (r'—', ''),
             (r'-' , ''),
             (r"\d", ""),
             (r'ℎ', 'ἡ'),
             (r'  ', ' '),  # two spaces -> one; applied as a single pass
             (r'[a-zA-Z0-9_]', ''),  # strips Latin letters, ASCII digits and underscores
             (r'\n\n\n\n\n', ' '),
             # NOTE(review): digits were already removed by the earlier r"\d" entry,
             # so this last entry looks redundant.
             (r'\d' , '')
             ]
### to replace normally 
# Literal (non-regex) substitutions that cleaning_raw_text applies with
# str.replace after the regex pass: grave-accented vowels are mapped to their
# acute-accented counterparts and commas are removed.
to_replace_dict={
    "ἓ":"ἕ",
    "ὰ" : "ά",
    "ὸ" : "ό",
    "ὺ" : "ύ",
    "ὲ" : "έ",
    "ὶ" : "ί",
    "," : "",
    }

def cleaning_raw_text(cell):
    """Turn a raw inscription text into a list of filtered lemmata.

    The text is NFC-normalised, passed through the regex substitutions in
    ``to_replace`` and the literal substitutions in ``to_replace_dict``, split on
    whitespace and lemmatised token by token.  Lemmata of tokens listed in
    ``proper_names`` are capitalised, all others lower-cased.  Finally the
    lemmata are NFC-normalised again and those that are stop words or have two
    characters or fewer are dropped.

    Parameters
    ----------
    cell : str
        Raw text of one inscription.

    Returns
    -------
    list of str
        The remaining lemmata, in their original order.
    """
    text = normalize("NFC", cell)

    # Regex clean-up first (order matters), then the plain replacements.
    for pattern, replacement in to_replace:
        text = re.sub(pattern, replacement, text)
    for orig, result in to_replace_dict.items():
        text = text.replace(orig, result)

    lemmata = []
    for token in text.split():
        lemma = lemmatizer.lemmatize(token)[0]
        # Proper names keep an initial capital; everything else is lower-cased.
        lemma = lemma.capitalize() if token in proper_names else lemma.lower()
        lemmata.append(normalize("NFC", lemma))

    ### stop words and very short lemmata are filtered out
    return [lemma for lemma in lemmata
            if lemma not in stopwords and len(lemma) > 2]

# Load the translation dictionaries that were parsed and pickled earlier.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open('translator_dict_dodson.pickle', 'rb') as dodson_file:
    primary_dict = pickle.load(dodson_file)
with open('translator_dict_morpheus.pickle', 'rb') as morpheus_file:
    morpheus_dict = pickle.load(morpheus_file)

# Plain lists of the keys available in each dictionary.
dict1 = [key for key in primary_dict.keys()]
dict2 = [key for key in morpheus_dict.keys()]

def translation_only(term):
    """Look up the translation of ``term`` in the loaded dictionaries.

    The term is NFC-normalised and searched in ``primary_dict`` first, then in
    ``morpheus_dict``.

    Parameters
    ----------
    term : str
        The word to translate.

    Returns
    -------
    The stored translation, ``''`` when the term is in neither dictionary, or
    ``None`` when ``term`` is not a string and therefore cannot be normalised.
    """
    try:
        term = unicodedata.normalize("NFC", term)
    except TypeError:
        # Non-string input (e.g. None or a float NaN) has no translation.  Only
        # this expected failure is handled; other errors are no longer hidden.
        return None
    if term in primary_dict:
        return primary_dict.get(term)
    return morpheus_dict.get(term, '')

In [0]:
# Substrings that mark a dating as AD; a dating without any of them is read as BC.
_AD_MARKERS = ("AD", "Rom. Imp.", "Byzant", "Roman period", "Christian")

# (marker, first year, last year) - checked in order, the first marker found wins.
_AD_CENTURIES = (
    ("1st", 1, 100),
    ("2nd", 101, 200),
    ("3rd", 201, 300),
    ("4th", 301, 400),
)
_BC_CENTURIES = (
    (" V ", -500, -401),
    (" V/IV ", -500, -301),
    (" IV ", -400, -301),
    (" III ", -300, -201),
    (" II ", -200, -101),
    (" I ", -100, -1),
    ("5th", -500, -401),
    ("4th", -400, -301),
    ("3rd", -300, -201),
    ("2nd", -200, -101),
    ("1st", -100, -1),
)

# (marker, first year, last year) - a reign overrides any other dating.
_REIGNS = (
    ("reign of Hadrian", 117, 138),
    ("reign of Justinian", 527, 565),
    ("reign of Ant. Pius", 138, 161),
    # Ends in AD 14, the year the Tiberius entry below starts (was 19).
    ("reign of Augustus", -27, 14),
    ("reign of Tiberius", 14, 37),
)


def _midpoint(start, end):
    """Return the mean of two years, truncated towards zero."""
    return int(np.average([start, end]))


def _century_dating(datation, centuries, period):
    """Return a "cent" dating for the first century marker found, else None."""
    for marker, start, end in centuries:
        if marker in datation:
            return [datation, start, end, None, _midpoint(start, end), "cent", period]
    return None


def _numeric_dating(datation, period):
    """Build a dating from explicit year numbers; BC years are returned negative."""
    is_bc = period == "BC"
    sign = -1 if is_bc else 1

    # "ante <year>" is only recognised for BC datings.
    if is_bc:
        match = re.search(r'(ante\s)(\d+)', datation, flags=re.IGNORECASE)
        if match:
            return [datation, None, sign * int(match.group(2)), None, None, "ante", period]

    match = re.search(r'(after\s|post\s)(\d+)', datation, flags=re.IGNORECASE)
    if match:
        return [datation, sign * int(match.group(2)), None, None, None, "post", period]

    match = re.search(r'(\d+)-(\d+)', datation)
    if match:
        post = sign * int(match.group(1))
        ante = sign * int(match.group(2))
        return [datation, post, ante, None, _midpoint(post, ante), "range", period]

    match = re.search(r'\d+', datation)
    if match:
        exact = sign * int(match.group(0))
        return [datation, None, None, exact, exact, "exact", period]

    # No number at all: keep the historical labels for the two eras.
    return [datation, None, None, None, None, "string" if is_bc else "unprecise", period]


def date_extractor(datation):
    """Parse a free-text dating into a structured record.

    The text is first simplified: every "/<digits>" suffix (e.g. the "/09" in
    "410/09") and every "?" is removed.  The era is AD when one of the AD
    markers occurs in the text and BC otherwise.  A century marker takes
    precedence over explicit numbers, and a known reign overrides both.

    Parameters
    ----------
    datation : str
        Dating text as found in the inscription header.

    Returns
    -------
    list
        ``[text, post, ante, exact, avr, type, period]`` where ``text`` is the
        simplified dating, ``post``/``ante`` are the earliest/latest year,
        ``exact`` a single year, ``avr`` the truncated mean year, ``type`` one
        of "cent", "ante", "post", "range", "exact", "string", "unprecise" and
        ``period`` is "AD" or "BC".  Fields that do not apply are ``None``; BC
        years are negative.
    """
    ### simple cleaning
    datation = re.sub(r"/\d+", "", datation).replace("?", "")

    if any(marker in datation for marker in _AD_MARKERS):
        period, centuries = "AD", _AD_CENTURIES
    else:
        period, centuries = "BC", _BC_CENTURIES

    dating = _century_dating(datation, centuries, period)
    if dating is None:
        dating = _numeric_dating(datation, period)

    for name, start, end in _REIGNS:
        if name in datation:
            avr = _midpoint(start, end)
            dating = [datation, start, end, None, avr, "range", "BC" if avr < 0 else "AD"]
            break
    return dating

In [0]:
def data_extraction(file_name):
    """Parse one inscription file from ``../IG_data/`` into a flat record.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``../IG_data/`` (e.g. ``"PH1.xml"``).

    Returns
    -------
    list or None
        ``[id, cleaned_text, book, region_category, region_precisely,
        span_ti[0], span_ti[1], span_ti]`` followed by the seven fields returned
        by ``date_extractor``; ``id`` is ``file_name`` up to the first ".x".
        ``None`` is returned when the file is missing or cannot be parsed.
    """
    # Header fields starting with one of these are not the dating.
    not_date_indicators = ["stoich.", "[", "decree", "Rachi", "Leuko"]
    not_date_indicators_2 = ["Krol", "IG"]
    try:
        with open("../IG_data/" + file_name) as file:
            soup = BeautifulSoup(file, "html.parser")
        spans = [span.get_text() for span in soup.find_all("span", class_="name")]
        region_category = spans[1]
        region_precisely = spans[2]
        span_ti = soup.find("span", class_="ti").get_text()
        span_ti = span_ti.split(" — ")
        book = soup.find("a", class_="booklink").get_text()
        # The dating is normally the third header field ...
        datation = span_ti[2]
        for element in not_date_indicators:
            if datation.startswith(element):
                # ... unless that field is something else; then take the next one.
                datation = span_ti[3]
                break
        for element in not_date_indicators_2: # too far
            if datation.startswith(element):
                datation = span_ti[1]
                break
        raw_text = soup.find("table", class_="grk").get_text()
        cleaned_text = cleaning_raw_text(raw_text)
        dating = list(date_extractor(datation))
        data = [file_name.partition(".x")[0], cleaned_text, book, region_category, region_precisely, span_ti[0], span_ti[1], span_ti] + dating
        return data
    except Exception:
        # Best-effort scraping: a missing or malformed file yields no record.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit
        # working during long batch runs.
        return None

In [0]:
# Parse every candidate inscription file and append the records that could be
# extracted to data_test.csv.  The file is opened in append mode, so re-running
# this cell adds the same rows again.
# newline="" is what the csv module expects for files it writes to; the explicit
# UTF-8 encoding matches the default used by pd.read_csv.
with open("data_test.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for element in range(1, 100000):
        row = data_extraction("PH" + str(element) + ".xml")
        # data_extraction returns None for files it could not parse - skip those.
        if row is not None:
            writer.writerow(row)

In [0]:
# Load the extracted records, name the columns and keep the first 4213 rows.
# NOTE(review): read_csv treats the first line of the file as a header, while the
# visible writing cell emits no header row - confirm the first record is not lost.
inscriptions_df = pd.read_csv("data_test.csv")
inscriptions_df.columns = [
    "inscription", "cleaned_text", "book", "region_cat", "region",
    "reg_ab", "spec", "notes",
    "orig_date_text", "post", "ante", "exact", "avr", "type", "period",
]
inscriptions_df = inscriptions_df.head(4213)

In [0]:
# Values of the "cleaned_text" column as a plain Python list.
texts_only = inscriptions_df.loc[:, "cleaned_text"].tolist()

In [0]:
def word_counter(cleaned_text, word):
    """Count the items of ``cleaned_text`` that start with ``word``.

    Parameters
    ----------
    cleaned_text : iterable of str
        Terms to inspect.
    word : str
        Prefix to look for.

    Returns
    -------
    int
        Number of terms beginning with ``word``.
    """
    return sum(1 for term in cleaned_text if term.startswith(word))

In [0]:
# Count, for every inscription, the terms that start with `search_prefix`.
# The original cell was syntactically incomplete: unbalanced parenthesis, an
# undefined name `row`, a misspelled column and no prefix argument.
# TODO: set the prefix to count; the empty string matches every term.
search_prefix = ""
# NOTE(review): when the frame is read back from CSV, "cleaned_text" holds the
# text representation of a list, not a list - confirm it is parsed before counting.
inscriptions_df.apply(lambda row: word_counter(row["cleaned_text"], search_prefix), axis=1)

In [0]:
# Extract a sample of records: every 50th file from PH1.xml up to PH951.xml.
raw_IG_span = [
    data_extraction("PH" + str(element) + ".xml")
    for element in range(1, 1000, 50)
]

In [0]:
# NOTE(review): `raw_ig_df` is not assigned in any cell visible here, so this
# cell fails on a fresh kernel unless it is defined elsewhere - confirm where it
# comes from (presumably a frame built from raw_IG_span or read from a saved CSV).
raw_ig_df

Unnamed: 0,inscription,cleaned_text,book,region_cat,region,reg_ab,spec,notes,orig_date_text,post,ante,exact,avr,type,period
0,PH1,"[ἔδοχσεν, το͂ι, δέμοι, τ̣ός, σαλαμ̣ῖνι, κλερόχ...",IG I³,Attica (IG I-III),Attica,Att.,Ath.: Akr.,"[Att., Ath.: Akr., stoich. 35, c. 510-500 a., ...",c. 510-500 a.,-510.0,-500.0,,-505.0,range,BC
1,PH51,"[θεός, ἔδοχσεν, τε͂ι, βολε͂ι, το͂ι, δέμοι, πρυ...",IG I³,Attica (IG I-III),Attica,Att.,stoich. 62,"[Att., stoich. 62, c. 435 a.]",c. 435 a.,,,-435.0,-435.0,exact,BC
2,PH101,"[Φίλιππος, φιλέο, δειραδιότες, γραμματεύω, οἰν...",IG I³,Attica (IG I-III),Attica,Att.,stoich. 24/20/35,"[Att., stoich. 24/20/35, 410/09 a.]",410 a.,,,-410.0,-410.0,exact,BC
3,PH151,"[οκλε͂ς, μεσσένε, πρ̣]",IG I³,Attica (IG I-III),Attica,Att.,non-stoich.,"[Att., non-stoich., s. V a.]",s. V a.,-500.0,-401.0,,-450.0,cent,BC
4,PH201,"[στρατεγ, ἀναγραφσα, γραμματεύς, τε͂ς, βολε͂ς,...",IG I³,Attica (IG I-III),Attica,Att.,stoich. 39?,"[Att., stoich. 39?, 450-435]",450-435,-450.0,-435.0,,-442.0,range,BC
5,PH251,"[κοντα, δραχμή, ντι]",IG I³,Attica (IG I-III),Attica,Att.,stoich.,"[Att., stoich., 490-480 a.]",490-480 a.,-490.0,-480.0,,-485.0,range,BC
6,PH301,"[ἰονικός, 𐅃𐅂𐅂𐅂ιι, 𐅃𐅂𐅂𐅂ιι, αὐλιᾶται, 𐅂𐅂𐅂ιι, σάρ...",IG I³,Attica (IG I-III),Attica,Att.,stoich.,"[Att., stoich., 415/4 a.]",415 a.,,,-415.0,-415.0,exact,BC
7,PH351,"[ἡοι, τάμιας, το͂ν, ἡιερο͂ν, χρεμάτον, τε͂ς, ἀ...",IG I³,Attica (IG I-III),Attica,Att.,non-stoich.,"[Att., non-stoich., 408/7 a.]",408 a.,,,-408.0,-408.0,exact,BC
8,PH401,"[κύλιχς, χρυσε͂, σταθμός, κύλιχς, ἀργύρεος, στ...",IG I³,Attica (IG I-III),Attica,Att.,stoich.,"[Att., stoich., p. post 408/7 a.]",p. post 408 a.,-408.0,,,,post,BC
9,PH451,"[τε͂ς, δεύτερος, ἀρχε͂ς, ἡε͂ι, γραμματεύω, ἡαλ...",IG I³,Attica (IG I-III),Attica,Att.,stoich.,"[Att., stoich., 447/6-433/2 a.]",447-433 a.,-447.0,-433.0,,-440.0,range,BC


In [0]:
filename = 'dogs'
outfile = open(filename,'wb')