# Preprocessing of the poetry

## Clean
+ Split using symbol ":", just the poetry itself
+ Sort the poetry according to its length
+ `Poetry` - cleaned data(5~79 words and no strange symbols)

In [138]:
poetry_file = "poems.txt"
poetrys = []
with open(poetry_file, "r", encoding="utf-8") as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ','')
            if set('_(（《[') & set(content):
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            content = 'B' + content + 'E'
            poetrys.append(content)
        except Exception as e:
            pass
        

print(poetrys[0])
print("lines:", len(poetrys))

B寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。E
lines: 34646


## Dict and List Structrue for Post-use

+ `words` - Sort word list in a descending way
+ `counter` - Dict {word:counts}
+ `word_int_map` - Dict {character:number}
+ Add `' '` into the `words`

In [153]:
from collections import Counter

all_words = [word for poetry in poetrys for word in poetry]
counter = Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x:-x[1])
words, _ = zip(*count_pairs)
words = words[:len(words)] + (' ',) 
word_int_map = dict(zip(words,range(len(words))))

## Create peotry vector
+ `poetry_vectors` - convert words into a vector, convert all poetrys into vectors

In [185]:
to_num = lambda word:word_int_map.get(word, len(words))
poetry_vectors = [list(map(to_num, poetry)) for poetry in poetrys]
print(poetry_vectors[0])

[2, 50, 179, 394, 1081, 597, 0, 13, 351, 148, 59, 79, 1, 155, 7, 457, 310, 166, 0, 161, 99, 164, 12, 493, 1, 202, 118, 60, 121, 153, 0, 206, 1578, 238, 76, 385, 1, 1200, 373, 155, 251, 29, 0, 780, 69, 1486, 510, 14, 1, 3]


## Create batches
+ `x_batches` - the batched input
+ `y_batches` - the batched output
+ Poetry_vectors within a batch have the same length 

In [191]:
np.full?

In [196]:
import numpy as np

batch_size = 64
n_chunk = len(poetry_vectors) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size
    batches = poetry_vectors[start_index:end_index]
    length = max(map(len, batches))
    xdata = np.full((batch_size, length), word_int_map[' '], np.int32)
    print(xdata.shape)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    ydata = np.copy(xdata)
    print()
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)  # (n_chunk, batch, length)
    y_batches.append(ydata)

(64, 74)
[[   2   50  179 ... 6109 6109 6109]
 [   2  161  343 ... 6109 6109 6109]
 [   2   10   89 ...  217    1    3]
 ...
 [   2  205  502 ... 6109 6109 6109]
 [   2 1031    7 ... 6109 6109 6109]
 [   2  420 1714 ... 6109 6109 6109]]
(64, 74)
[[   2   75   35 ... 6109 6109 6109]
 [   2 3846 3847 ... 6109 6109 6109]
 [   2  384 1895 ... 6109 6109 6109]
 ...
 [   2 1462   23 ...    5    1    3]
 [   2  327  515 ... 1825    1    3]
 [   2  272  134 ...  125    1    3]]
(64, 74)
[[   2   53  963 ...   46    1    3]
 [   2  563  773 ... 1748    1    3]
 [   2  585  356 ...  533    1    3]
 ...
 [   2   98  447 ... 6109 6109 6109]
 [   2 1856 1054 ... 6109 6109 6109]
 [   2 1192  240 ... 6109 6109 6109]]
(64, 74)
[[   2  121  328 ... 6109 6109 6109]
 [   2   25    7 ... 6109 6109 6109]
 [   2  553   14 ... 6109 6109 6109]
 ...
 [   2  301  233 ... 6109 6109 6109]
 [   2  195 1233 ... 6109 6109 6109]
 [   2  202   75 ... 6109 6109 6109]]
(64, 66)
[[   2  357  283 ...  112    1    3]
 [   2

(64, 75)
[[   2 1213  762 ... 6109 6109 6109]
 [   2   16   67 ... 6109 6109 6109]
 [   2 1488  460 ... 6109 6109 6109]
 ...
 [   2   29  235 ... 6109 6109 6109]
 [   2  121   19 ... 6109 6109 6109]
 [   2  291   56 ... 6109 6109 6109]]
(64, 74)
[[   2   76  297 ... 6109 6109 6109]
 [   2  302 1508 ...  464    1    3]
 [   2   48  333 ... 6109 6109 6109]
 ...
 [   2  400 1031 ... 6109 6109 6109]
 [   2  983 1036 ... 6109 6109 6109]
 [   2 1766   73 ... 6109 6109 6109]]
(64, 74)
[[   2  157 1213 ... 6109 6109 6109]
 [   2  524  856 ... 6109 6109 6109]
 [   2 2184  528 ... 6109 6109 6109]
 ...
 [   2  338   65 ...   81    1    3]
 [   2 1113 1113 ...   42    1    3]
 [   2 1959 1959 ...  859    1    3]]
(64, 74)
[[   2   19   96 ...  811    1    3]
 [   2   65    8 ...   40    1    3]
 [   2   10  466 ...  271    1    3]
 ...
 [   2  391  108 ... 6109 6109 6109]
 [   2  149  125 ... 6109 6109 6109]
 [   2  214   18 ... 6109 6109 6109]]
(64, 74)
[[   2  442  583 ... 6109 6109 6109]
 [   2

(64, 67)
[[   2  660   89 ...    1    3 6109]
 [   2   81  233 ...    1    3 6109]
 [   2  346 1451 ...    1    3 6109]
 ...
 [   2   83  502 ... 6109 6109 6109]
 [   2  179  160 ... 6109 6109 6109]
 [   2   26 1190 ... 6109 6109 6109]]
(64, 66)
[[   2   17 5130 ... 6109 6109 6109]
 [   2   22  182 ... 6109 6109 6109]
 [   2  383 2005 ... 6109 6109 6109]
 ...
 [   2  889  731 ... 6109 6109 6109]
 [   2  148   29 ... 6109 6109 6109]
 [   2 1349  988 ... 6109 6109 6109]]
(64, 66)
[[   2  509   87 ... 6109 6109 6109]
 [   2  313  178 ... 6109 6109 6109]
 [   2  173   44 ... 6109 6109 6109]
 ...
 [   2  668    5 ... 6109 6109 6109]
 [   2  883  953 ... 6109 6109 6109]
 [   2  929   69 ... 6109 6109 6109]]
(64, 66)
[[   2  179    7 ... 6109 6109 6109]
 [   2 2400   65 ... 6109 6109 6109]
 [   2  330  322 ... 6109 6109 6109]
 ...
 [   2   67  133 ... 6109 6109 6109]
 [   2  676  320 ... 6109 6109 6109]
 [   2  259   25 ... 6109 6109 6109]]
(64, 74)
[[   2  171  484 ... 6109 6109 6109]
 [   2