# In [None]:
import json
import requests
import os
from pprint import pprint

import cltk
from cltk.tokenize.word import WordTokenizer
from cltk.stop.latin.stops import STOPS_LIST
from cltk.utils.file_operations import open_pickle
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.stem.lemma import LemmaReplacer

from collections import Counter

from nltk.tokenize.punkt import PunktLanguageVars

# Run this only once to download the CLTK corpus and model files used for lemmatizing.
# def setup_files():
    # from cltk.corpus.utils.importer import CorpusImporter

    # corpus_importer = CorpusImporter('latin')
    # corpus_importer.list_corpora
    # corpus_importer.import_corpus('latin_text_latin_library')
    # corpus_importer.import_corpus('latin_models_cltk')




# Word tokenizer configured for Latin; used to split each punctuation-stripped
# line of text into tokens.
word_tokenizer = WordTokenizer('latin')

# Optional alternative (disabled): the backoff lemmatizer, built from the
# pickled POS-lemmatized sentences expected under ~/cltk_data. Uncomment this
# section to construct `lemmatizer`; if the pickle is missing it falls back to
# an empty sentence list and prints a notice.
# rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
# path = os.path.expanduser(rel_path)
# file = 'latin_pos_lemmatized_sents.pickle'      
# latin_pos_lemmatized_sents_path = os.path.join(path, file)

# if os.path.isfile(latin_pos_lemmatized_sents_path):
#     latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
# else:
#     latin_pos_lemmatized_sents = []
#     print('The file %s is not available in cltk_data' % file)

# lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Lemmatizer actually in use: the default LemmaReplacer for Latin.
default_lemmatizer = LemmaReplacer('latin')

# Quick manual checks of the lemmatizer, if you want them:
# default_lemmatizer.lemmatize('amabo')
# default_lemmatizer.lemmatize('pedis')

# Punctuation is removed with str.translate: every character listed below is
# replaced by a single space (not deleted), so words separated only by
# punctuation do not get fused together.
# The backslash is spelled as an explicit "\\" escape; the previous "\]" was an
# invalid escape sequence, which newer Python versions warn about.
punctuation = "\"#$%&'()+,-/:;<=>@[\\]^_`{|}~.?!«»—"
translator = str.maketrans(dict.fromkeys(punctuation, " "))

def _iter_line_texts(data):
    """Yield the raw ``'text'`` value of every line in *data*, in order.

    Walks the nesting ``works -> books -> poems -> lines`` lazily so the
    caller can process all lines with a single flat loop.
    """
    for work in data['works']:
        for book in work['books']:
            for poem in book['poems']:
                for line in poem['lines']:
                    yield line['text']


def process_text(data):
    """Lemmatize every line in *data* and print a lemma frequency report.

    Args:
        data: Mapping read as
            ``data['works'][i]['books'][j]['poems'][k]['lines'][l]['text']``,
            where each ``'text'`` value is one line of text.

    Returns:
        None. All results are printed: each lemma as it is produced, a table
        of the most frequent lemmas (count, share of all lemmas, running
        share), and finally all lemmas joined by spaces.

    Raises:
        KeyError: If one of the expected keys is missing from *data*.

    Lines that fail during tokenizing/lemmatizing are skipped (best effort)
    and reported on stderr instead of being silently ignored.
    """
    import sys  # local import: only needed to report skipped lines

    print('starting process')
    lemma_list = []
    for raw_line in _iter_line_texts(data):
        try:
            text = raw_line.translate(translator)
            for token in word_tokenizer.tokenize(text):
                lemmas = default_lemmatizer.lemmatize(token)
                if not lemmas:
                    # The lemmatizer returned nothing for this token; skip
                    # just the token rather than losing the rest of the line.
                    continue
                print(lemmas[0])
                lemma_list.append(lemmas[0])
        except Exception as err:
            # Keep going on a bad line, but say so. Catching Exception (not a
            # bare except) lets Ctrl-C and SystemExit still stop the run.
            print('skipping line {!r}: {!r}'.format(raw_line, err),
                  file=sys.stderr)
    print('done lemmatizing')

    # Frequency table of the most common lemmas.
    top_n = 50
    top_lemmas = Counter(lemma_list).most_common(top_n)
    total = len(lemma_list)

    # The heading reports the number of rows actually shown.
    print('Top {} lemmas in Ars 1:\n'.format(len(top_lemmas)))
    print("{number:>5}  {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
        number="", lemma="lemma", count="COUNT",
        percent="Type-Tok %", running="RUNNING %"))

    running = 0
    for rank, (lemma, count) in enumerate(top_lemmas, start=1):
        running += count
        print("{number:>5}. {lemma:<12}{count:<12}{percent:<12}{running:<12}".format(
            number=rank,
            lemma=lemma,
            count=count,
            percent=str(round(count / total * 100, 2)) + "%",
            running=str(round(running / total * 100, 2)) + "%"))

    # Full lemmatized text, in case anyone needs it.
    lemma_text = ' '.join(lemma_list)
    print(lemma_text)


# Try this on the Ars: load the corpus JSON and run the lemma report on it.
# The encoding is pinned to UTF-8 so any non-ASCII characters in the text
# decode the same way regardless of the platform's default encoding.
with open('ars.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

process_text(data)
        

# Refs:
# https://github.com/diyclassics/ll-experiments/blob/master/Exploring%20Diction%20and%20Topics%20in%20Latin%20Love%20Elegy.ipynb
# https://github.com/kynan/nbstripout