In [None]:
import pandas as pd
from hazm import (
    Normalizer, word_tokenize, POSTagger,
    Chunker, tree2brackets, Lemmatizer,
    DependencyParser, Stemmer
)
import random
import time

In [None]:
# Load the verses; the CSV uses ';' as its field separator.
df = pd.read_csv('data/verses.csv', sep=';')
print(df.shape)
# Discard rows whose 'text' value is missing, then show the new shape.
df = df.dropna(subset=['text'])
print(df.shape)
df.head(10)

In [None]:
# Build the hazm tools used by the rest of the notebook.
# Tools that need no model file:
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()
# Tools that load a model from the local 'resources' directory:
tagger = POSTagger(model='resources/postagger.model')
chunker = Chunker(model='resources/chunker.model')
# The dependency parser is built on top of the tagger and lemmatizer created above.
parser = DependencyParser(tagger=tagger, lemmatizer=lemmatizer)

In [None]:
# Plain Python list holding every value of the 'text' column, in row order.
verses = df.loc[:, 'text'].tolist()

In [None]:
# Show each stage of the pipeline (normalize -> tokenize -> tag) for the first 10 verses.
for raw_verse in verses[:10]:
    cleaned = normalizer.normalize(raw_verse)
    words = word_tokenize(cleaned)
    tagged_words = tagger.tag(words)
    # Dependency parsing is currently disabled:
    # parsed = parser.parse(tagged_words)
    print("================================")
    stages = (
        ("original_verse: ", raw_verse),
        ("normalized_verse: ", cleaned),
        ("tokens: ", words),
        ("tagged: ", tagged_words),
    )
    for label, value in stages:
        print(label, value)
    # Disabled extra output:
    # print("parsed: ", parsed)
    # print(tree2brackets(chunker.parse(tagged_words)))

In [None]:
def get_verse_pattern(verse):
    """
    Reduce a verse to the sequence of tags assigned to its tokens.

    The verse is normalized with the module-level ``normalizer``, split with
    ``word_tokenize`` and tagged with the module-level ``tagger``; only the
    tag (second item) of every tagged token is kept.

    :param verse: verse text
    :return: list of tags, one per token, in token order
    """
    cleaned = normalizer.normalize(verse)
    tagged_words = tagger.tag(word_tokenize(cleaned))
    return [pair[1] for pair in tagged_words]

In [None]:
def get_categories(verses):
    """
    Group every token of the given verses under the tag it received.

    Each verse is normalized, tokenized and tagged; every token is then
    appended (duplicates included, in encounter order) to the list stored
    under its tag. A verse that raises during processing is reported on
    stdout and skipped, so one bad verse does not abort the whole run.

    :param verses: list of verses
    :return: dictionary mapping each tag to the list of tokens seen with it

        {
            'P': ['w1',]
        }

    """
    words_by_tag = {}
    for line in verses:
        try:
            cleaned = normalizer.normalize(line)
            tagged_words = tagger.tag(word_tokenize(cleaned))
            for pair in tagged_words:
                tag = pair[1]
                word = pair[0]
                # setdefault creates the bucket the first time a tag shows up
                words_by_tag.setdefault(tag, []).append(word)
        except Exception as err:
            print("verse: ", line)
            print("error: ", err)
    return words_by_tag

In [None]:
def get_orders(df):
    """
    Collect, poem by poem, the tag pattern of every verse.

    Rows are read in dataframe order and consecutive rows sharing the same
    ``poemId`` are treated as one poem. For each poem the result holds one
    entry: the list of verse patterns returned by ``get_verse_pattern`` for
    that poem's ``text`` values.

    Every poem is emitted, including the last one in the dataframe, and a
    ``poemId`` of 0 is handled like any other id.

    :param df: dataframe of verses with 'poemId' and 'text' columns
    :return: list with one entry per poem; each entry is the list of that
        poem's verse patterns, and each pattern is a list of tags

        [
            [['P', 'ADJ', 'N', 'P',],],
        ]

    """
    orders = []
    current_id = None
    # None means "no row seen yet"; a real id is never used as the sentinel,
    # so no poemId value (0 included) can be mistaken for the first iteration.
    current_order = None
    for this_poem_id, text in zip(df['poemId'], df['text']):
        if current_order is None or this_poem_id != current_id:
            # a new poem starts: store the finished one, if there is one
            if current_order is not None:
                orders.append(current_order)
            current_order = []
            current_id = this_poem_id
        current_order.append(get_verse_pattern(text))
    # flush the poem still being collected when the rows run out
    if current_order is not None:
        orders.append(current_order)
    return orders

In [None]:
# Build the tag -> tokens dictionary for the whole corpus and report how long it took.
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed
# by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
categories = get_categories(verses)
print("%s seconds took to get categories" % (time.perf_counter() - start_time))

In [None]:
# Build the per-poem verse patterns and report how long it took.
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed
# by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
orders = get_orders(df)
print("%s seconds took to get orders" % (time.perf_counter() - start_time))

In [None]:
def gen_poem(orders=None, categories=None):
    """
    Generate a poem by filling a randomly chosen poem structure with random words.

    One entry of ``orders`` is picked at random; it is a list of verse
    patterns, each pattern being a list of tags. Every tag is replaced by a
    word drawn at random from ``categories[tag]``. Words of a verse are
    joined with single spaces and verses are joined with newlines.

    When ``orders`` or ``categories`` is missing or empty there is nothing to
    draw from and an empty string is returned, so calling ``gen_poem()``
    without arguments keeps returning ''.

    :param orders: list of poem structures; each structure is a list of
        verse patterns and each pattern is a list of tags
    :param categories: dictionary mapping a tag to a non-empty list of words
    :return: poem text, or '' when no data was supplied
    :raises KeyError: if a pattern contains a tag that is not in ``categories``
    """
    if not orders or not categories:
        return ''
    order = random.choice(orders)
    verses = []
    for pattern in order:
        # one random word per tag, in pattern order
        words = [random.choice(categories[tag]) for tag in pattern]
        verses.append(' '.join(words))
    return '\n'.join(verses)

In [None]:
# Generate one poem and print the resulting text.
tmp = gen_poem()
print(tmp)