In [None]:
# pandas 1.3.3
# numpy 1.21.4
# tensorflow 2.6.0
# keras 2.6.0 으로 설치 후 에러 없어짐.

In [1]:
sentence = '나는 밥을 먹었다.'

source_sentence='<start>'+sentence
target_sentence=sentence+'<end>'

print("Source 문장 : ",source_sentence)
print("Target 문장 : ",target_sentence)

Source 문장 :  <start>나는 밥을 먹었다.
Target 문장 :  나는 밥을 먹었다.<end>


In [2]:
import os,re
import numpy as np
import pandas as pd
import tensorflow as tf

file_path ="./data/shakespeare.txt"
with open(file_path,'r') as f:
    raw_corpus = f.read().splitlines()
    
print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [3]:
for idx,sentence in enumerate(raw_corpus):
    if len(sentence)==0: continue
    if sentence[-1]==':': continue
    if idx>9: break
    print(sentence)

Before we proceed any further, hear me speak.
Speak, speak.
You are all resolved rather to die than to famish?


#  텐서플로우 전처리 과정

## 1. 정규표현식을 이용한 corpus 생성
## 2. 코퍼스를 텐서로 변환
## 3. corpus 텐서를 tf.data.Dataset객체로 변환

# 1. 정규표현식을 이용한 corpus 생성
#### 불필요한 공백, 특수문자를 제거하는 과정

In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip() # 1
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence) # 2
    sentence = re.sub(r'[" "]+', " ", sentence) # 3
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) # 4
    sentence = sentence.strip() # 5
    sentence = '<start> ' + sentence + ' <end>' # 6
    return sentence

print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


#### 공백이나 특수문자를 제거한 코퍼스를 10개만 확인해보자.

In [5]:
corpus = []

for sentence in raw_corpus:
    if len(sentence)==0: continue
    if sentence[-1] ==":" : continue
    
    preprocessd_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessd_sentence)
    
corpus[:10]

['<start> before we proceed any further , hear me speak . <end>',
 '<start> speak , speak . <end>',
 '<start> you are all resolved rather to die than to famish ? <end>',
 '<start> resolved . resolved . <end>',
 '<start> first , you know caius marcius is chief enemy to the people . <end>',
 '<start> we know t , we know t . <end>',
 '<start> let us kill him , and we ll have corn at our own price . <end>',
 '<start> is t a verdict ? <end>',
 '<start> no more talking on t let it be done away , away ! <end>',
 '<start> one word , good citizens . <end>']

# 2. 코퍼스를 텐서로 변환
##### 벡터화 == 토큰화 ? 1. 문장을 토큰화 2. 토큰들을 단어사전으로 만든다. 3. 데이터를 숫자로 만들어준다. (텐서로 만들어준다.)

In [6]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words = 7000,
        filters = ' ',
        oov_token = "<unk>"
    )
    tokenizer.fit_on_texts(corpus) #문자 데이터를 입력받아 리스트 형태로 리턴 
    tensor = tokenizer.texts_to_sequences(corpus) #텍스트 안의 단어들을 숫자의 시퀀스로 변환
    
    #입력데이터의 시퀀스 길이를 일정하게 맞춰준다.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
    print(tensor,tokenizer)
    return tensor,tokenizer

In [7]:
tensor,tokenizer = tokenize(corpus)

[[   2  143   40 ...    0    0    0]
 [   2  110    4 ...    0    0    0]
 [   2   11   50 ...    0    0    0]
 ...
 [   2  149 4553 ...    0    0    0]
 [   2   34   71 ...    0    0    0]
 [   2  945   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fe5d2b50990>


In [8]:
print(tensor[:1,:10])

[[  2 143  40 933 140 591   4 124  24 110]]


In [9]:
for idx in tokenizer.index_word:
    print(idx, ":",tokenizer.index_word[idx])

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : .
6 : the
7 : and
8 : i
9 : to
10 : of
11 : you
12 : my
13 : a
14 : that
15 : ?
16 : in
17 : !
18 : is
19 : not
20 : for
21 : s
22 : with
23 : it
24 : me
25 : your
26 : be
27 : he
28 : his
29 : but
30 : this
31 : have
32 : as
33 : d
34 : thou
35 : what
36 : him
37 : so
38 : will
39 : thy
40 : we
41 : by
42 : no
43 : all
44 : shall
45 : if
46 : her
47 : do
48 : our
49 : o
50 : are
51 : thee
52 : on
53 : now
54 : good
55 : lord
56 : come
57 : from
58 : which
59 : or
60 : sir
61 : more
62 : then
63 : ll
64 : at
65 : they
66 : here
67 : would
68 : was
69 : how
70 : she
71 : let
72 : well
73 : king
74 : than
75 : them
76 : say
77 : their
78 : am
79 : hath
80 : when
81 : there
82 : love
83 : one
84 : were
85 : go
86 : may
87 : upon
88 : us
89 : make
90 : like
91 : man
92 : know
93 : yet
94 : an
95 : should
96 : must
97 : why
98 : where
99 : had
100 : did
101 : father
102 : death
103 : tis
104 : see
105 : who
106 : some
107 : too
108 : give
109 : thes

982 : proclaim
983 : earl
984 : bride
985 : slander
986 : eat
987 : smile
988 : stone
989 : walls
990 : valour
991 : grown
992 : sort
993 : carry
994 : doom
995 : malice
996 : deadly
997 : precious
998 : adieu
999 : six
1000 : humble
1001 : judgment
1002 : kept
1003 : officer
1004 : despair
1005 : fond
1006 : calls
1007 : kindness
1008 : herself
1009 : winter
1010 : grey
1011 : kindred
1012 : abroad
1013 : secret
1014 : creature
1015 : usurp
1016 : bitter
1017 : aumerle
1018 : lands
1019 : mercutio
1020 : lewis
1021 : caius
1022 : account
1023 : idle
1024 : rivers
1025 : vile
1026 : making
1027 : volsces
1028 : cominius
1029 : cruel
1030 : swords
1031 : excuse
1032 : door
1033 : quick
1034 : flower
1035 : silence
1036 : tent
1037 : cried
1038 : gives
1039 : health
1040 : tongues
1041 : manner
1042 : execution
1043 : trouble
1044 : drawn
1045 : repent
1046 : foes
1047 : awake
1048 : divine
1049 : treason
1050 : beggar
1051 : thinks
1052 : bids
1053 : stars
1054 : dance
1055 : charity
10

1660 : heed
1661 : pieces
1662 : treacherous
1663 : heirs
1664 : imprisonment
1665 : perforce
1666 : grievous
1667 : fairer
1668 : wreck
1669 : revenged
1670 : afford
1671 : sickness
1672 : contented
1673 : grandam
1674 : innocent
1675 : lovely
1676 : sky
1677 : meant
1678 : offended
1679 : sends
1680 : bethink
1681 : forthwith
1682 : wound
1683 : slow
1684 : dagger
1685 : gift
1686 : boar
1687 : contract
1688 : manners
1689 : reverence
1690 : pure
1691 : wed
1692 : dew
1693 : advice
1694 : shape
1695 : deal
1696 : lark
1697 : slept
1698 : arrived
1699 : special
1700 : dish
1701 : trade
1702 : weather
1703 : raise
1704 : parliament
1705 : happily
1706 : rogue
1707 : vault
1708 : tarry
1709 : island
1710 : shrew
1711 : patricians
1712 : wholesome
1713 : store
1714 : repeal
1715 : complain
1716 : affections
1717 : increase
1718 : feed
1719 : slaves
1720 : dogs
1721 : mouths
1722 : hunt
1723 : constant
1724 : titus
1725 : lean
1726 : equal
1727 : brook
1728 : aim
1729 : remove
1730 : assi

2308 : paid
2309 : fardel
2310 : mystery
2311 : habit
2312 : mariana
2313 : cambio
2314 : chief
2315 : object
2316 : bread
2317 : piercing
2318 : members
2319 : discontented
2320 : mutinous
2321 : petty
2322 : helps
2323 : bestow
2324 : touching
2325 : lions
2326 : seeking
2327 : presume
2328 : parties
2329 : liking
2330 : shoes
2331 : cowardly
2332 : troop
2333 : caps
2334 : towns
2335 : entreaties
2336 : proved
2337 : looked
2338 : disease
2339 : infect
2340 : senseless
2341 : grim
2342 : wore
2343 : goddess
2344 : charges
2345 : couldst
2346 : nuptial
2347 : warriors
2348 : bands
2349 : dearer
2350 : inclined
2351 : condemned
2352 : fully
2353 : grieves
2354 : profane
2355 : safely
2356 : repose
2357 : gifts
2358 : pace
2359 : estimation
2360 : earthly
2361 : cap
2362 : wanting
2363 : smother
2364 : shown
2365 : commit
2366 : kisses
2367 : wills
2368 : destruction
2369 : affect
2370 : courteous
2371 : scratch
2372 : ought
2373 : also
2374 : twould
2375 : tricks
2376 : behavior
2377 

3257 : milk
3258 : hurried
3259 : plants
3260 : stooping
3261 : pronounced
3262 : yielding
3263 : lads
3264 : killed
3265 : herald
3266 : impatience
3267 : lour
3268 : victorious
3269 : mounting
3270 : shaped
3271 : halt
3272 : g
3273 : plucks
3274 : chamberlain
3275 : brakenbury
3276 : path
3277 : weighty
3278 : marrying
3279 : key
3280 : anne
3281 : direful
3282 : spiders
3283 : ugly
3284 : higher
3285 : dwells
3286 : gape
3287 : blessings
3288 : perfection
3289 : fouler
3290 : slanderous
3291 : unfit
3292 : moan
3293 : boon
3294 : tewksbury
3295 : bought
3296 : recover
3297 : notwithstanding
3298 : cheerfully
3299 : deceive
3300 : gather
3301 : spilt
3302 : altogether
3303 : quake
3304 : unlook
3305 : sugar
3306 : befal
3307 : conclusion
3308 : allies
3309 : villany
3310 : hardy
3311 : skulls
3312 : hideous
3313 : glories
3314 : sleeps
3315 : resolute
3316 : fills
3317 : chop
3318 : unlawful
3319 : redemption
3320 : christ
3321 : vassal
3322 : publicly
3323 : relent
3324 : interchan

4140 : mid
4141 : staring
4142 : rites
4143 : helmets
4144 : bind
4145 : thinkest
4146 : drowsy
4147 : smiled
4148 : abate
4149 : argument
4150 : appellant
4151 : knighthood
4152 : fearless
4153 : neglected
4154 : ambush
4155 : exactly
4156 : degenerate
4157 : parle
4158 : chivalry
4159 : vial
4160 : venge
4161 : wrongfully
4162 : champion
4163 : goest
4164 : foaming
4165 : plashy
4166 : orderly
4167 : violate
4168 : regreet
4169 : vigour
4170 : disloyal
4171 : decree
4172 : summers
4173 : lamp
4174 : devouring
4175 : grass
4176 : coffers
4177 : farm
4178 : coats
4179 : listen
4180 : teeming
4181 : stubborn
4182 : deposing
4183 : precedent
4184 : impute
4185 : ensue
4186 : courses
4187 : governor
4188 : richly
4189 : wasted
4190 : cobham
4191 : francis
4192 : drooping
4193 : hides
4194 : cheerful
4195 : persuades
4196 : howe
4197 : ross
4198 : willoughby
4199 : whereupon
4200 : prodigy
4201 : posts
4202 : purses
4203 : proclaimed
4204 : remembering
4205 : message
4206 : pricks
4207 : t

5122 : mutual
5123 : seemeth
5124 : compact
5125 : breach
5126 : censures
5127 : index
5128 : protect
5129 : ensuing
5130 : justices
5131 : northampton
5132 : leisurely
5133 : parlous
5134 : massacre
5135 : unquiet
5136 : toss
5137 : conquerors
5138 : crosses
5139 : wearisome
5140 : slug
5141 : oratory
5142 : dealings
5143 : sojourn
5144 : recreation
5145 : succeeding
5146 : successively
5147 : retail
5148 : ending
5149 : characters
5150 : formal
5151 : moralize
5152 : mocks
5153 : ingenious
5154 : complots
5155 : willingness
5156 : separated
5157 : tragedy
5158 : packing
5159 : unprepared
5160 : beheaded
5161 : hack
5162 : strawberries
5163 : prolong
5164 : conspire
5165 : bewitch
5166 : raze
5167 : pursuivant
5168 : lighted
5169 : builds
5170 : sailor
5171 : distraught
5172 : pry
5173 : start
5174 : unsuspected
5175 : recorded
5176 : attainder
5177 : preservation
5178 : turks
5179 : infidels
5180 : rashly
5181 : proceeded
5182 : signified
5183 : moreover
5184 : baynard
5185 : indictm

5806 : escaped
5807 : westmoreland
5808 : beck
5809 : worthless
5810 : dauphin
5811 : attentive
5812 : unpeople
5813 : suffolk
5814 : spark
5815 : abandon
5816 : disinherit
5817 : entail
5818 : birthright
5819 : creep
5820 : quietly
5821 : frivolous
5822 : privily
5823 : resteth
5824 : protectors
5825 : besiege
5826 : saves
5827 : hilt
5828 : matching
5829 : payment
5830 : woodcock
5831 : mountains
5832 : mess
5833 : grumbling
5834 : fret
5835 : impudent
5836 : assay
5837 : sicils
5838 : wrapt
5839 : cannibals
5840 : resembles
5841 : sever
5842 : breeder
5843 : ireful
5844 : vanquish
5845 : recount
5846 : gasp
5847 : scouts
5848 : robb
5849 : heated
5850 : famed
5851 : enrolled
5852 : dreaming
5853 : scapes
5854 : trodden
5855 : protection
5856 : unsheathe
5857 : tongued
5858 : cured
5859 : stings
5860 : shower
5861 : hewn
5862 : clouded
5863 : losses
5864 : setter
5865 : blowing
5866 : sways
5867 : finish
5868 : ewes
5869 : curds
5870 : bottle
5871 : nobody
5872 : begetting
5873 : sig

6622 : kick
6623 : muck
6624 : covets
6625 : erleap
6626 : suffrage
6627 : brag
6628 : unaching
6629 : recommend
6630 : contemn
6631 : requested
6632 : marketplace
6633 : auburn
6634 : diversely
6635 : skull
6636 : wedged
6637 : fog
6638 : requests
6639 : particulars
6640 : consulship
6641 : begged
6642 : enigma
6643 : counterfeitly
6644 : bewitchment
6645 : desirers
6646 : woolvish
6647 : toge
6648 : hob
6649 : antique
6650 : unswept
6651 : mountainous
6652 : heapt
6653 : limitation
6654 : endue
6655 : official
6656 : invested
6657 : methink
6658 : flouted
6659 : scornfully
6660 : friendliness
6661 : arriving
6662 : malignantly
6663 : plebeii
6664 : translate
6665 : endures
6666 : unelected
6667 : rectorship
6668 : asker
6669 : barking
6670 : portance
6671 : gibingly
6672 : ungravely
6673 : guided
6674 : preoccupied
6675 : lectures
6676 : youngly
6677 : marcians
6678 : ancus
6679 : numa
6680 : hostilius
6681 : publius
6682 : quintus
6683 : conduits
6684 : ancestor
6685 : descended
668

7472 : handiwork
7473 : carnal
7474 : pew
7475 : adulterate
7476 : dusky
7477 : intelligencer
7478 : cancel
7479 : presentation
7480 : thronging
7481 : wails
7482 : client
7483 : intestate
7484 : impart
7485 : copious
7486 : intercepts
7487 : intercepted
7488 : strangling
7489 : hidest
7490 : graven
7491 : enointed
7492 : exclamations
7493 : frightful
7494 : venturous
7495 : humphrey
7496 : breakfast
7497 : praying
7498 : unscarr
7499 : safest
7500 : lanced
7501 : whetted
7502 : tackling
7503 : reft
7504 : scaffold
7505 : demise
7506 : lethe
7507 : supposest
7508 : engrave
7509 : therewith
7510 : inducement
7511 : unadvisedly
7512 : familiarly
7513 : ruins
7514 : distressful
7515 : orient
7516 : advantaging
7517 : loan
7518 : chastised
7519 : victress
7520 : loathes
7521 : eloquent
7522 : style
7523 : infants
7524 : misusest
7525 : circling
7526 : playfellows
7527 : planets
7528 : consists
7529 : spicery
7530 : recomforture
7531 : relenting
7532 : rideth
7533 : navy
7534 : hull
7535 : 

8235 : blades
8236 : healths
8237 : plats
8238 : manes
8239 : bakes
8240 : elflocks
8241 : sluttish
8242 : learns
8243 : fantasy
8244 : wooes
8245 : puffs
8246 : misgives
8247 : revels
8248 : expire
8249 : steerage
8250 : scrape
8251 : unwashed
8252 : stools
8253 : cupboard
8254 : marchpane
8255 : grindstone
8256 : nell
8257 : antony
8258 : brisk
8259 : unplagued
8260 : mistresses
8261 : nuptials
8262 : pentecost
8263 : ethiope
8264 : snowy
8265 : trooping
8266 : fleer
8267 : portly
8268 : brags
8269 : intrusion
8270 : convert
8271 : pilgrims
8272 : chinks
8273 : begone
8274 : trifling
8275 : tiberio
8276 : petrucio
8277 : betwitched
8278 : tempering
8279 : nick
8280 : cophetua
8281 : heareth
8282 : stirreth
8283 : moveth
8284 : quivering
8285 : adjacent
8286 : circle
8287 : letting
8288 : invocation
8289 : medlars
8290 : et
8291 : caetera
8292 : poperin
8293 : pear
8294 : truckle
8295 : jests
8296 : twinkle
8297 : spheres
8298 : brightness
8299 : leans
8300 : bestrides
8301 : pacing
8

9000 : impale
9001 : guerdon
9002 : renounce
9003 : grudges
9004 : replant
9005 : becomest
9006 : joins
9007 : irrevocable
9008 : bourbon
9009 : mislike
9010 : dishonoured
9011 : appeased
9012 : hungerford
9013 : bonville
9014 : broker
9015 : amazon
9016 : marries
9017 : withstand
9018 : hitherto
9019 : mistrustful
9020 : diomede
9021 : sleight
9022 : stole
9023 : rhesus
9024 : thracian
9025 : unawares
9026 : applaud
9027 : suppress
9028 : halberds
9029 : embassade
9030 : degraded
9031 : ambassadors
9032 : brotherly
9033 : welfare
9034 : fraud
9035 : thicket
9036 : disport
9037 : captivity
9038 : lynn
9039 : flanders
9040 : huntsman
9041 : enlargement
9042 : sovereigns
9043 : thwarting
9044 : spying
9045 : avoiding
9046 : olive
9047 : laurel
9048 : creator
9049 : confiscate
9050 : eclipsed
9051 : presaging
9052 : misgive
9053 : conflicts
9054 : maketh
9055 : waned
9056 : repass
9057 : foretold
9058 : lurks
9059 : forewarned
9060 : drummer
9061 : debate
9062 : debating
9063 : pretend
90

9877 : winners
9878 : exultation
9879 : bough
9880 : mate
9881 : directing
9882 : dissever
9883 : properties
9884 : science
9885 : institutions
9886 : enriched
9887 : deputation
9888 : organs
9889 : observer
9890 : belongings
9891 : thrifty
9892 : determines
9893 : creditor
9894 : advertise
9895 : secondary
9896 : test
9897 : leaven
9898 : prefers
9899 : unquestion
9900 : concernings
9901 : commissions
9902 : aves
9903 : concludest
9904 : sanctimonious
9905 : commandments
9906 : scraped
9907 : thanksgiving
9908 : metre
9909 : shears
9910 : feelingly
9911 : mitigation
9912 : dolours
9913 : figuring
9914 : healthy
9915 : hips
9916 : sciatica
9917 : chopped
9918 : julietta
9919 : groping
9920 : trouts
9921 : peculiar
9922 : river
9923 : burgher
9924 : pulled
9925 : counsellors
9926 : clients
9927 : demigod
9928 : immoderate
9929 : ravin
9930 : bane
9931 : creditors
9932 : foppery
9933 : morality
9934 : denunciation
9935 : propagation
9936 : coffer
9937 : unhappily
9938 : glimpse
9939 : ne

10665 : affable
10666 : limp
10667 : twig
10668 : hue
10669 : nill
10670 : conformable
10671 : fatherly
10672 : ruffian
10673 : temperate
10674 : grissel
10675 : incredible
10676 : protesting
10677 : twink
10678 : novices
10679 : meacock
10680 : curstest
10681 : mart
10682 : youngling
10683 : graybeard
10684 : fry
10685 : skipper
10686 : nourisheth
10687 : flourisheth
10688 : basins
10689 : ewers
10690 : lave
10691 : hangings
10692 : tyrian
10693 : tapestry
10694 : ivory
10695 : arras
10696 : counterpoints
10697 : canopies
10698 : turkey
10699 : boss
10700 : valance
10701 : needlework
10702 : pewter
10703 : housekeeping
10704 : milch
10705 : kine
10706 : pail
10707 : sixscore
10708 : oxen
10709 : answerable
10710 : marseilles
10711 : argosies
10712 : galliases
10713 : tight
10714 : galleys
10715 : gamester
10716 : crafty
10717 : card
10718 : braves
10719 : breeching
10720 : schools
10721 : pantaloon
10722 : pedascule
10723 : aeacides
10724 : ajax
10725 : unkindly
10726 : groweth
10727 

In [10]:
src_input = tensor[:,:-1]
tgt_input = tensor[:,1:]

print(src_input[0])
print(tgt_input[0])

[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0]
[143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0   0
   0   0]


# 3.corpus 텐서를 tf.data.Dataset객체로 변환

In [11]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words +1

dataset = tf.data.Dataset.from_tensor_slices((src_input,tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE,drop_remainder=True)
dataset

2022-12-30 14:28:27.591928: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<BatchDataset shapes: ((256, 20), (256, 20)), types: (tf.int32, tf.int32)>

# 6-5. 실습 (2) 인공지능 학습시키기

In [12]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        # Embedding 레이어, 2개의 LSTM 레이어, 1개의 Dense 레이어로 구성되어 있다.
        # Embedding 레이어는 단어 사전의 인덱스 값을 해당 인덱스 번째의 워드 벡터로 바꿔준다.
        # 이 워드 벡터는 의미 벡터 공간에서 단어의 추상적 표현으로 사용된다. 
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size) 
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)  
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
# embedding size 값이 커질수록 단어의 추상적인 특징들을 더 잡아낼 수 있지만
# 그만큼 충분한 데이터가 없으면 안좋은 결과 값을 가져옵니다!   
embedding_size = 256 # 워드 벡터의 차원수를 말하며 단어가 추상적으로 표현되는 크기입니다.
hidden_size = 1024 # 모델에 얼마나 많은 일꾼을 둘 것인가? 정도로 이해하면 좋다.
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size) # tokenizer.num_words에 +1인 이유는 문장에 없는 pad가 사용되었기 때문이다.

In [13]:
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words+1, embedding_size,hidden_size)

In [14]:
for src_sample, tgt_sample in dataset.take(1): break
    
model(src_sample)

<tf.Tensor: shape=(256, 20, 7001), dtype=float32, numpy=
array([[[-2.02200506e-04,  7.06054270e-05, -1.86313409e-04, ...,
          5.01989387e-04,  2.44223775e-05, -4.86011326e-04],
        [-2.83828296e-04, -9.31309842e-05, -4.01828700e-04, ...,
          1.04830298e-03,  7.78802714e-05, -8.87074566e-04],
        [-3.99782322e-04, -3.40607658e-05, -2.34434672e-04, ...,
          1.23562384e-03, -9.83536593e-05, -1.25827920e-03],
        ...,
        [-1.11453666e-03, -4.07708460e-04, -2.08654930e-03, ...,
         -1.69334223e-03, -1.28406857e-03, -2.07696576e-03],
        [-1.15215522e-03, -3.97050346e-04, -2.19418434e-03, ...,
         -1.95000065e-03, -1.23840093e-03, -2.73245410e-03],
        [-1.18474453e-03, -4.04955936e-04, -2.25602044e-03, ...,
         -2.17633392e-03, -1.20432116e-03, -3.36169126e-03]],

       [[-2.02200506e-04,  7.06054270e-05, -1.86313409e-04, ...,
          5.01989387e-04,  2.44223775e-05, -4.86011326e-04],
        [-2.05405973e-04,  2.71574154e-05, -3.

In [15]:
import tensorflow as tf
print(tf.__version__)

2.6.0


In [16]:
model.summary()

Model: "text_generator_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  1792256   
_________________________________________________________________
lstm_2 (LSTM)                multiple                  5246976   
_________________________________________________________________
lstm_3 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense_1 (Dense)              multiple                  7176025   
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


In [17]:
optimizer = tf.keras.optimizers.Adam() # Adam은 현재 가장 많이 사용하는 옵티마이저이다. 자세한 내용은 차차 배운다.
loss = tf.keras.losses.SparseCategoricalCrossentropy( # 훈련 데이터의 라벨이 정수의 형태로 제공될 때 사용하는 손실함수이다.
    from_logits=True, # 기본값은 False이다. 모델에 의해 생성된 출력 값이 정규화되지 않았음을 손실 함수에 알려준다. 즉 softmax함수가 적용되지 않았다는걸 의미한다. 
    reduction='none'  # 기본값은 SUM이다. 각자 나오는 값의 반환 원할 때 None을 사용한다.
)
# 모델을 학습시키키 위한 학습과정을 설정하는 단계이다.
model.compile(optimizer=optimizer, loss=loss) # 손실함수와 훈련과정을 설정했다.
model.fit(dataset, epochs=30) # 만들어둔 데이터셋으로 모델을 학습한다. 30번 학습을 반복하겠다는 의미다.

Epoch 1/30


2022-12-30 14:28:36.003464: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe5d45c2f10>

# 6-6. 실습 (3) 잘 만들어졌는지 평가하기

In [18]:
#문장생성 함수 정의
#모델에게 시작 문장을 전달하면 모델이 시작 문장을 바탕으로 작문을 진행
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20): #시작 문자열을 init_sentence 로 받으며 디폴트값은 <start> 를 받는다
    # 테스트를 위해서 입력받은 init_sentence도 텐서로 변환합니다
    test_input = tokenizer.texts_to_sequences([init_sentence]) #텍스트 안의 단어들을 숫자의 시퀀스의 형태로 변환
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # 단어 하나씩 예측해 문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다 (도달 하지 못하였으면 while 루프를 돌면서 다음 단어를 예측)
    while True: #루프를 돌면서 init_sentence에 단어를 하나씩 생성성
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4 
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated #최종적으로 모델이 생성한 문장을 반환

In [None]:
generate_text(model, tokenizer, init_sentence="<start> mother") # 시작문장으로 he를 넣어 문장생성 함수 실행