In [4]:
import os,glob
import pandas as pd
import numpy as np

In [5]:
# Raw lines of every source file, keyed by the fifth character from the end of
# its path (for a "<name>.txt" file that is the last character of <name>).
original_files = {}
# sorted() makes the iteration order deterministic, so if two paths ever share
# a key it is always the same file that ends up stored.
for path in sorted(glob.glob('data/raw/*txt')):
    # The context manager closes the handle; the explicit encoding keeps the
    # non-ASCII letters in the texts identical across platforms.
    with open(path, encoding="utf-8") as handle:
        original_files[path[-5]] = handle.read().splitlines()

In [6]:
# Build the parallel corpus: every usable "verse || Old English || Latin" line
# of the raw files becomes one DataFrame row.

# Old English entries shorter than this many characters are discarded.
# NOTE(review): the rationale for 17 is not recorded in the notebook - confirm.
MIN_ENGLISH_CHARS = 17

corpus_verse = []
corpus_book = []
corpus_english = []
corpus_latin = []

# sorted() fixes the file order, so the row labels of `df` are reproducible.
for path in sorted(glob.glob('data/raw/*txt')):
    # Fifth character from the end of the path, i.e. the last character of the
    # file stem for a "<name>.txt" file; used as the book identifier.
    book_id = path[-5]
    with open(path, encoding="utf-8") as handle:
        lines = handle.read().splitlines()
    for line in lines:
        fields = line.split("||")
        if len(fields) != 3:
            continue  # not a verse || english || latin record
        verse, english, latin = (field.strip() for field in fields)
        if not (verse and english and latin):
            continue  # at least one of the three fields is empty
        if len(english) < MIN_ENGLISH_CHARS:
            continue
        corpus_verse.append(verse)
        corpus_book.append(book_id)
        corpus_english.append(english)
        corpus_latin.append(latin)

df = pd.DataFrame({
    "verse": corpus_verse,
    "book": corpus_book,
    "english": corpus_english,
    "latin": corpus_latin,
})
df.shape

(26508, 4)

In [7]:
# Inferred rules for Old English:
#   - treat 'ð' as 'þ': .replace('ð', 'þ')
#   - upper-case letters make no difference, so the text can be lower-cased

# Inferred rules for Latin:
#   - Classical Latin did not have a lower-case/upper-case distinction,
#     so the text can be lower-cased

In [8]:
# Old English: ignore case and treat 'ð' as 'þ'.
# Lower-casing comes first so that an upper-case 'Ð' is normalised as well
# (it lower-cases to 'ð', which the replacement then catches).
df["english"] = df["english"].str.lower().str.replace("ð", "þ", regex=False)

# Latin: ignore case and expand the ligature / e-caudata spellings.
# These substitutions used to be chained onto the English column as
# Series.replace(...), which only matches whole cell values and therefore never
# changed anything. 'æ' is also an ordinary Old English letter (the CLTK stop
# list contains 'þæt', 'æfter', ...), so it must stay untouched there.
# NOTE(review): 'æ'/'ę'/'œ' are taken to be Latin spelling rules because the
# accompanying examples ("michi", "eternus", "laudacio") are Latin - confirm.
latin_normalised = df["latin"].str.lower()
for variant, expansion in (("æ", "ae"), ("ę", "ae"), ("œ", "oe")):
    latin_normalised = latin_normalised.str.replace(variant, expansion, regex=False)
df["latin"] = latin_normalised

In [9]:
import string

# Deletion table built once: maps every character of string.punctuation to None.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
    """Return `text` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``; they are
    deleted, not replaced by a space, and all other characters (including
    non-ASCII letters and symbols) are kept unchanged.

    Args:
        text: the string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    # One pass over the string instead of one str.replace() per punctuation mark.
    return text.translate(_PUNCTUATION_TABLE)

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
# Check punctuation!
# For both text columns: drop parentheses and the other punctuation marks,
# then squeeze every run of whitespace down to a single space.
for text_column in ("english", "latin"):
    without_punctuation = df[text_column].apply(remove_punctuations)
    df[text_column] = without_punctuation.replace(r'\s+', ' ', regex=True)

## Next step: look for stop words, a lemmatiser and a stemmer for Old English. Latin will need separate tooling.

In [11]:
from cltk.lemmatize import OldEnglishDictionaryLemmatizer
# Explore lemmas
# OldEnglishDictionaryLemmatizer()._load_forms_and_lemmas()

In [21]:
from cltk.lemmatize import OldEnglishLemmatizationProcess, LatinLemmatizationProcess
from cltk.lemmatize.lat import LatinBackoffLemmatizer
## Lemmatizing with pipeline
from cltk.core.data_types import Process, Pipeline
from cltk.tokenizers import MultilingualTokenizationProcess
from cltk.languages.utils import get_lang
from cltk.nlp import NLP
from cltk.stops.ang import STOPS as ANG_STOPS
from cltk.stops.lat import STOPS as LAT_STOPS

In [20]:
# Display the Old English stop-word list imported from cltk.stops.ang.
ANG_STOPS

['and',
 'on',
 'þonne',
 'wið',
 'to',
 'þæt',
 'þe',
 'ne',
 'ic',
 'me',
 'heo',
 'him',
 'he',
 'swa',
 'þis',
 'mid',
 'þu',
 'ofer',
 'his',
 'þriwa',
 'seo',
 'hit',
 'se',
 'þas',
 'cweð',
 'þæs',
 'in',
 'sy',
 'ða',
 'ðy',
 'ær',
 'ðonne',
 'næfre',
 'þone',
 'ge',
 'ðas',
 'þære',
 'þam',
 'is',
 'of',
 'gif',
 'þæm',
 'nu',
 'under',
 'wiþ',
 'geond',
 'æfter',
 'ðis',
 'do',
 'hwæt',
 'her',
 'þurh',
 'þus',
 'lytel',
 'æt',
 'ðin',
 'willian',
 'cume',
 'þeos',
 'þara',
 'are',
 'cuman',
 'com',
 'ænig',
 'þon',
 'for',
 'us',
 'ac',
 'bot',
 'ende',
 'wæs',
 'wǣre',
 'wes',
 'wǣron',
 'wǣren',
 'wesað',
 'ic',
 'wit',
 'wē',
 'mīn',
 'uncer',
 'ūser',
 'ūre',
 'mē',
 'unc',
 'ūs',
 'mec',
 'uncit',
 'ūsic',
 'ðū',
 'git',
 'gē',
 'ðīn',
 'incer',
 'ēower',
 'ēowre',
 'ðē',
 'inc',
 'ēow',
 'ðec',
 'incit',
 'ēowic',
 'hē',
 'hēo',
 'hīe',
 'hit',
 'hyt',
 'hī',
 'hȳ',
 'hire',
 'hira',
 'heora',
 'hiera',
 'heom',
 'hine',
 'nǣr',
 'nǣfre',
 'nǣ

In [14]:
# Old English: custom CLTK pipeline that tokenises and then lemmatises.
pipe_eng = Pipeline(
    description="A custom Old English pipeline",
    processes=[MultilingualTokenizationProcess, OldEnglishLemmatizationProcess],
    language=get_lang("ang"),
)

nlp_eng = NLP(language='ang', custom_pipeline=pipe_eng, suppress_banner=True)


# Latin: the backoff lemmatizer is used directly rather than through a pipeline.
latin_lem = LatinBackoffLemmatizer()

# #Latin pipeline not working. Lemmatized text remains the same
# NOTE(review): the commented-out pipeline below passes get_lang("ang") although
# it is meant for Latin ("lat") - confirm whether that is related to the problem.
# pipe_latin = Pipeline(description="A custom Old Latin pipeline", \
#     processes=[MultilingualTokenizationProcess, LatinLemmatizationProcess], \
#     language=get_lang("ang"))

# nlp_latin = NLP(language='lat', custom_pipeline=pipe_latin, suppress_banner=True)


##### Check for lemmatization examples of the rules given by Prof.

In [15]:
# Spelling rules given by the professor (the examples are Latin words):
#   "æ"     -> "ae"
#   "ę"     -> "ae"
#   "œ"     -> "oe"
#   "michi" -> "mihi"
# That still leaves important variation that you don't want to replace
# globally: "e" should sometimes be "ae" (e.g. "eternus" will only be
# recognized once it is "aeternus"), and "c" should sometimes be "t"
# (e.g. "laudacio").

In [18]:
# Spot-check the cleaned Old English text of one row (row label 149).
df.loc[149,"english"]

'forþon þe drihten on cnosle ł mægþe on rihtwisre is geþeaht þæs hæfenleasan ł wædlan ge forsawon ł ge gescendon forþan þe drihten hiht his is'

In [24]:
# Lemmatise both languages row by row and drop stop words.

# Sets give constant-time membership tests (the CLTK stop lists are plain lists).
# The English column has 'ð' rewritten to 'þ' before this point, so the
# 'ð'-spelled stop words ('ða', 'ðy', 'cweð', ...) could never match; their
# 'þ' spellings are added alongside the original forms.
ang_stops = set(ANG_STOPS) | {stop.replace('ð', 'þ') for stop in ANG_STOPS}
lat_stops = set(LAT_STOPS)

lemmatized_eng = []
lemmatized_lat = []
# Select the two text columns by name: unpacking whole itertuples() rows raises
# as soon as `df` has more than four columns, e.g. when this cell is re-run
# after the lemma columns have been added.
for english, latin in zip(df["english"], df["latin"]):
    # Old English: lemmata come from the CLTK pipeline.
    eng_lemmas = [lemma for lemma in nlp_eng(english).lemmata if lemma not in ang_stops]
    lemmatized_eng.append(" ".join(eng_lemmas))
    # Latin: the backoff lemmatizer returns (word, lemma) pairs for a token list.
    lat_lemmas = [lemma for _, lemma in latin_lem.lemmatize(latin.split()) if lemma not in lat_stops]
    lemmatized_lat.append(" ".join(lat_lemmas))

    


In [25]:
# Store the stop-word-free lemma strings next to the texts they were derived from.
lemma_columns = {"english_lemma": lemmatized_eng, "latin_lemma": lemmatized_lat}
for column_name, lemma_strings in lemma_columns.items():
    df[column_name] = lemma_strings

In [29]:
# Display the Latin stop-word list imported from cltk.stops.lat.
LAT_STOPS

['ab',
 'ac',
 'ad',
 'adhic',
 'aliqui',
 'aliquis',
 'an',
 'ante',
 'apud',
 'at',
 'atque',
 'aut',
 'autem',
 'cum',
 'cur',
 'de',
 'deinde',
 'dum',
 'ego',
 'enim',
 'ergo',
 'es',
 'est',
 'et',
 'etiam',
 'etsi',
 'ex',
 'fio',
 'haud',
 'hic',
 'iam',
 'idem',
 'igitur',
 'ille',
 'in',
 'infra',
 'inter',
 'interim',
 'ipse',
 'is',
 'ita',
 'magis',
 'modo',
 'mox',
 'nam',
 'ne',
 'nec',
 'necque',
 'neque',
 'nisi',
 'non',
 'nos',
 'o',
 'ob',
 'per',
 'possum',
 'post',
 'pro',
 'quae',
 'quam',
 'quare',
 'qui',
 'quia',
 'quicumque',
 'quidem',
 'quilibet',
 'quis',
 'quisnam',
 'quisquam',
 'quisque',
 'quisquis',
 'quo',
 'quoniam',
 'sed',
 'si',
 'sic',
 'sive',
 'sub',
 'sui',
 'sum',
 'super',
 'suus',
 'tam',
 'tamen',
 'trans',
 'tu',
 'tum',
 'ubi',
 'uel',
 'uero',
 'unus',
 'ut']

In [32]:
#Validating examples
# Show the first row whose Latin text contains the stop word "deinde", so the
# cleaned text can be compared with its lemmatised, stop-word-free version.
# NOTE(review): the filter compares the *English* columns although the Latin
# pair is printed - confirm whether latin != latin_lemma was intended.
has_deinde = df["latin"].str.contains(" deinde ", regex=False)
english_changed = df["english"] != df["english_lemma"]
matching_rows = df.index[has_deinde & english_changed]

if len(matching_rows) > 0:
    first_match = matching_rows[0]
    print(first_match)
    # The Latin line is printed once (it used to be printed twice in a row).
    print("Original:", df.loc[first_match, "latin"])
    print("Lemma:", df.loc[first_match, "latin_lemma"])

22920
in principio istius psalmi loquitur deus deinde numerosis diuisionibus diuiditur
Original: in principio istius psalmi loquitur deus deinde numerosis diuisionibus diuiditur
Lemma: principium iste psalmus loquor deus numerosus diuisio divido


In [None]:
# Write the DataFrame to an Excel workbook in the working directory;
# index=False leaves the row labels out of the sheet.
df.to_excel("preprocessed.xlsx",index=False)