In [1]:
import pandas as pd
from hazm import (
    Normalizer,
    word_tokenize,
    POSTagger,
    Chunker,
    tree2brackets,
    Lemmatizer,
    DependencyParser,
    Stemmer,
)
import random
import time

In [2]:
# Load the verses table; the CSV uses ";" as its field separator
df = pd.read_csv("data/verses.csv", sep=";")
print(df.shape)
# Keep only the rows that actually have verse text
df = df.loc[df["text"].notna()]
print(df.shape)
df.head(10)

(1384003, 5)
(1381557, 5)


Unnamed: 0,id,poemId,order,position,text
0,1,2051,1,0,جز نقش تو در نظر نیامد ما را
1,2,2051,2,1,جز کوی تو رهگذر نیامد ما را
2,3,2051,3,0,خواب ارچه خوش آمد همه را در عهدت
3,4,2051,4,1,حقا که به چشم در نیامد ما را
4,5,2052,1,0,بر گیر شراب طرب‌انگیز و بیا
5,6,2052,2,1,پنهان ز رقیب سفله بستیز و بیا
6,7,2052,3,0,مشنو سخن خصم که بنشین و مرو
7,8,2052,4,1,بشنو ز من این نکته که برخیز و بیا
8,9,2053,1,0,گفتم که لبت، گفت لبم آب حیات
9,10,2053,2,1,گفتم دهنت، گفت زهی حب نبات


In [3]:
# Model-backed hazm tools, loaded from the resources/ directory
tagger = POSTagger(model="resources/postagger.model")
chunker = Chunker(model="resources/chunker.model")
# Model-free hazm helpers
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stemmer = Stemmer()
# The dependency parser is built from the tagger and lemmatizer created above
parser = DependencyParser(tagger=tagger, lemmatizer=lemmatizer)

In [4]:
# Plain Python list with the content of the "text" column, in row order
verses = df["text"].to_list()

In [5]:
# Preview every processing stage (normalize -> tokenize -> POS-tag) for the first 10 verses.
# Dependency parsing (parser.parse) and chunking (chunker.parse + tree2brackets)
# are not run in this preview.
for verse in verses[:10]:
    normalized_verse = normalizer.normalize(verse)
    tokens = word_tokenize(normalized_verse)
    tagged = tagger.tag(tokens)
    # a single print call; sep="\n" puts each stage on its own line
    print(
        "================================",
        f"original_verse:  {verse}",
        f"normalized_verse:  {normalized_verse}",
        f"tokens:  {tokens}",
        f"tagged:  {tagged}",
        sep="\n",
    )

original_verse:  جز نقش تو در نظر نیامد ما را
normalized_verse:  جز نقش تو در نظر نیامد ما را
tokens:  ['جز', 'نقش', 'تو', 'در', 'نظر', 'نیامد', 'ما', 'را']
tagged:  [('جز', 'P'), ('نقش', 'Ne'), ('تو', 'PRO'), ('در', 'P'), ('نظر', 'N'), ('نیامد', 'V'), ('ما', 'PRO'), ('را', 'POSTP')]
original_verse:  جز کوی تو رهگذر نیامد ما را
normalized_verse:  جز کوی تو رهگذر نیامد ما را
tokens:  ['جز', 'کوی', 'تو', 'رهگذر', 'نیامد', 'ما', 'را']
tagged:  [('جز', 'P'), ('کوی', 'Ne'), ('تو', 'PRO'), ('رهگذر', 'N'), ('نیامد', 'V'), ('ما', 'PRO'), ('را', 'POSTP')]
original_verse:  خواب ارچه خوش آمد همه را در عهدت
normalized_verse:  خواب ارچه خوش آمد همه را در عهدت
tokens:  ['خواب', 'ارچه', 'خوش', 'آمد', 'همه', 'را', 'در', 'عهدت']
tagged:  [('خواب', 'N'), ('ارچه', 'ADV'), ('خوش', 'AJ'), ('آمد', 'V'), ('همه', 'PRO'), ('را', 'POSTP'), ('در', 'P'), ('عهدت', 'N')]
original_verse:  حقا که به چشم در نیامد ما را
normalized_verse:  حقا که به چشم در نیامد ما را
tokens:  ['حقا', 'که', 'به', 'چشم', 'در', 'نیامد', '

In [6]:
def get_verse_pattern(verse):
    """
    Build the part-of-speech pattern of a single verse.

    The verse is normalized, tokenized and tagged with the notebook-level
    ``normalizer``, ``word_tokenize`` and ``tagger``.

    :param verse: verse text
    :return: list holding the tag of every token, in token order
    """
    tagged_tokens = tagger.tag(word_tokenize(normalizer.normalize(verse)))
    # every tagged item is indexed as (word, tag); only the tag is kept
    return [item[1] for item in tagged_tokens]

In [7]:
def get_categories(verses):
    """
    Group the tokens of all verses by their part-of-speech tag.

    Each verse is normalized, tokenized and tagged with the notebook-level
    ``normalizer``, ``word_tokenize`` and ``tagger``. Words are appended in
    the order they are met, so repeated words appear repeatedly.

    A verse that raises during processing is reported on stdout and
    skipped; words collected from it before the failure are kept.

    :param verses: list of verses
    :return: dictionary mapping each tag to the list of words carrying it

        {
            'P': ['w1',]
        }

    """
    words_by_tag = {}
    for verse in verses:
        try:
            tagged_tokens = tagger.tag(word_tokenize(normalizer.normalize(verse)))
            for item in tagged_tokens:
                tag = item[1]
                word = item[0]
                # create the bucket on first sight of a tag, then extend it
                words_by_tag.setdefault(tag, []).append(word)
        except Exception as e:
            print("verse: ", verse)
            print("error: ", e)
    return words_by_tag

In [8]:
def get_orders(df):
    """
    Collect, poem by poem, the part-of-speech pattern of every verse.

    Rows are read in frame order. A run of consecutive rows sharing the
    same ``poemId`` forms one poem; whenever the id changes a new poem
    starts. The pattern of each verse comes from ``get_verse_pattern``.

    The last poem is flushed after the loop, so it is included in the
    result, and no particular ``poemId`` value (such as 0) is treated as
    special.

    :param df: dataframe of verses with ``poemId`` and ``text`` columns
    :return: list of poems; each poem is a list of verse patterns and each
        verse pattern is a list of tags

        [
            [
                ['P', 'ADJ', 'N', 'P',],
            ],
        ]

    """
    orders = []
    order = []  # verse patterns of the poem currently being collected
    current_poem_id = None
    # iterate the two needed columns directly instead of building a Series per row
    for this_poem_id, text in zip(df["poemId"], df["text"]):
        # a non-empty `order` means at least one row was seen, so the
        # comparison below never involves the initial None placeholder
        if order and this_poem_id != current_poem_id:
            orders.append(order)
            order = []
        current_poem_id = this_poem_id
        order.append(get_verse_pattern(text))
    # flush the poem that was still being collected when the rows ran out
    if order:
        orders.append(order)
    return orders

In [9]:
# Measure the elapsed interval with perf_counter: unlike time.time() it is
# monotonic, so system clock adjustments cannot distort the result.
start_time = time.perf_counter()
categories = get_categories(verses)
print("%s seconds took to get categories" % (time.perf_counter() - start_time))

203.14691710472107 seconds took to get categories


In [10]:
# Measure the elapsed interval with perf_counter: unlike time.time() it is
# monotonic, so system clock adjustments cannot distort the result.
start_time = time.perf_counter()
orders = get_orders(df)
print("%s seconds took to get orders" % (time.perf_counter() - start_time))

295.80238103866577 seconds took to get orders


In [11]:
def gen_poem():
    """
    Generate a random poem.

    One poem structure is drawn at random from the notebook-level
    ``orders``; every tag of every verse pattern in it is then replaced by
    a random word of that tag taken from the notebook-level ``categories``.

    :return: poem text; each word is followed by a space and each verse by
        a newline
    """
    chosen_order = random.choice(orders)
    verse_lines = []
    for pattern in chosen_order:
        # words are drawn one by one, in tag order
        words = [random.choice(categories.get(tag)) for tag in pattern]
        verse_lines.append("".join(word + " " for word in words))
    return "".join(line + "\n" for line in verse_lines)

In [267]:
# Generate one random poem with gen_poem() and print it
tmp = gen_poem()
print(tmp)

ای زرورق سرشت که افق در تو 
مگر طوق که سگ بر تو 
شنید و کبروی واصل تا خود 
صد چنان ازل هستی کوی از من 

