In [1]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Flatten, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#import spacy, and french model
import spacy
nlp = spacy.load('fr')

In [3]:
data_dir = 'data/News'# data directory containing input.txt
save_dir = 'save' # directory to store models
seq_length = 30 # sequence length
sequences_step = 1 #step to create sequences

In [4]:
file_list = ["101","102","103","104","105","106","107","108","109","110","111","112","113"]

vocab_file = os.path.join(save_dir, "words_vocab.pkl")

In [5]:
def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

In [6]:
wordlist = []
for file_name in file_list:
    input_file = os.path.join(data_dir, file_name + ".txt")
    #read data
    with codecs.open(input_file, "r") as f:
        data = f.read()
    #create sentences
    doc = nlp(data)
    wl = create_wordlist(doc)
    wordlist = wordlist + wl

In [7]:
# count the number of words
word_counts = collections.Counter(wordlist)

# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

#save the words and vocabulary
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

vocab size:  1842


In [8]:
#create sequences
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 7858


In [9]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

In [10]:
def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

In [11]:
rnn_size = 256 # size of RNN
batch_size = 32 # minibatch size
seq_length = 30 # sequence length
num_epochs = 50 # number of epochs
learning_rate = 0.001 #learning rate
sequences_step = 1 #step to create sequences

In [12]:
md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 512)               4298752   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1842)              944946    
_________________________________________________________________
activation_1 (Activation)    (None, 1842)              0         
Total params: 5,243,698
Trainable params: 5,243,698
Non-trainable params: 0
_________________________________________________________________


In [13]:
#fit the model
callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences_lstm.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.01)

Train on 7779 samples, validate on 79 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [14]:
#save the model
md.save(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')

In [15]:
#load vocabulary
print("loading vocabulary...")
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

with open(os.path.join(save_dir, 'words_vocab.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

loading vocabulary...


In [16]:
from keras.models import load_model
# load the model
print("loading model...")
model = load_model(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')

loading model...


In [17]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [18]:
#initiate sentences
seed_sentences = "le vainqueur de 12 tournois majeurs ne veut pas imposer trop de pression ."
generated = ''
sentence = []
for i in range (seq_length):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)
print('Generating text with the following seed: "' + ' '.join(sentence) + '"')

print ()

Generating text with the following seed: "a a a a a a a a a a a a a a a a le vainqueur de 12 tournois majeurs ne veut pas imposer trop de pression ."



In [19]:
words_number = 100
#generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.
    #print(x.shape)

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

print(generated)


a a a a a a a a a a a a a a a a le vainqueur de 12 tournois majeurs ne veut pas imposer trop de pression . de et . à , . de et de la , , de , . , et , - nous la joueurs . il , a des sera et l' carrière le , plus en se un autant , . . . » « , qui de dans , a un ( un de de de ai de présenté de les en celle la il . de ce était , de , en la 4 , , , de , à six « « , le chances , , , la de la l’ 4 . de le de grand


In [1]:
#import gensim library
import gensim
from gensim.models.doc2vec import LabeledSentence

import numpy as np
import os
import time
import codecs

#parameters
data_dir = 'data/News'# data directory containing input.txt
save_dir = 'save' # directory to store models
file_list = ["101","102","103","104","105","106","107","108","109","110","111","112","113"]


In [2]:

#import spacy, and french model
import spacy
nlp = spacy.load('fr')

#initiate sentences and labels lists
sentences = []
sentences_label = []

#create sentences function:
def create_sentences(doc):
    ponctuation = [".","?","!",":","…"]
    sentences = []
    sent = []
    for word in doc:
        if word.text not in ponctuation:
            if word.text not in ("\n","\n\n",'\u2009','\xa0'):
                sent.append(word.text.lower())
        else:
            sent.append(word.text.lower())
            if len(sent) > 1:
                sentences.append(sent)
            sent=[]
    return sentences

#create sentences from files
for file_name in file_list:
    input_file = os.path.join(data_dir, file_name + ".txt")
    #read data
    with codecs.open(input_file, "r") as f:
        data = f.read()
    #create sentences
    doc = nlp(data)
    sents = create_sentences(doc)
    sentences = sentences + sents
    
#create labels
for i in range(np.array(sentences).shape[0]):
    sentences_label.append("ID" + str(i))

In [3]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.LabeledSentence(doc,[self.labels_list[idx]])

In [4]:
def train_doc2vec_model(data, docLabels, size=300, sample=0.000001, dm=0, hs=1, window=10, min_count=0, workers=8,alpha=0.024, min_alpha=0.024, epoch=15, save_file='./data/doc2vec.w2v') :
    startime = time.time()
    
    print("{0} articles loaded for model".format(len(data)))

    it = LabeledLineSentence(data, docLabels)

    model = gensim.models.Doc2Vec(size=size, sample=sample, dm=dm, window=window, min_count=min_count, workers=workers,alpha=alpha, min_alpha=min_alpha, hs=hs) # use fixed learning rate
    model.build_vocab(it)
    for epoch in range(epoch):
        print("Training epoch {}".format(epoch + 1))
        model.train(it,total_examples=model.corpus_count,epochs=model.iter)
        # model.alpha -= 0.002 # decrease the learning rate
        # model.min_alpha = model.alpha # fix the learning rate, no decay
        
    #saving the created model
    model.save(os.path.join(save_file))
    print('model saved')

In [5]:
train_doc2vec_model(sentences, sentences_label, size=500,sample=0.0,alpha=0.025, min_alpha=0.001, min_count=0, window=10, epoch=20, dm=0, hs=1, save_file='./data/doc2vec.w2v')

314 articles loaded for model
Training epoch 1


  import sys
  if sys.path[0] == '':


Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10
Training epoch 11
Training epoch 12
Training epoch 13
Training epoch 14
Training epoch 15
Training epoch 16
Training epoch 17
Training epoch 18
Training epoch 19
Training epoch 20
model saved


In [6]:
#import library
from six.moves import cPickle

#load the model
d2v_model = gensim.models.doc2vec.Doc2Vec.load('./data/doc2vec.w2v')

sentences_vector=[]

t = 500

for i in range(len(sentences)):
    if i % t == 0:
        print("sentence", i, ":", sentences[i])
        print("***")
    sent = sentences[i]
    sentences_vector.append(d2v_model.infer_vector(sent, alpha=0.001, min_alpha=0.001, steps=10000))
    
#save the sentences_vector
sentences_vector_file = os.path.join(save_dir, "sentences_vector_500_a001_ma001_s10000.pkl")
with open(os.path.join(sentences_vector_file), 'wb') as f:
    cPickle.dump((sentences_vector), f)

sentence 0 : ['un', 'mois', 'après', 'la', 'mort', 'de', 'leur', 'amie', 'd’', 'enfance', ',', 'quatre', 'jeunes', 'femmes', 'à', 'l’', 'aube', 'de', 'la', 'trentaine', 'se', 'réunissent', 'dans', 'une', 'maison', 'de', 'campagne', '.']
***


In [7]:
nb_sequenced_sentences = 15
vector_dim = 500

X_train = np.zeros((len(sentences), nb_sequenced_sentences, vector_dim), dtype=np.float)
y_train = np.zeros((len(sentences), vector_dim), dtype=np.float)

t = 1000
for i in range(len(sentences_label)-nb_sequenced_sentences-1):
    if i % t == 0: print("new sequence: ", i)
    
    for k in range(nb_sequenced_sentences):
        sent = sentences_label[i+k]
        vect = sentences_vector[i+k]
        
        if i % t == 0:
            print("  ", k + 1 ,"th vector for this sequence. Sentence ", sent, "(vector dim = ", len(vect), ")")
            
        for j in range(len(vect)):
            X_train[i, k, j] = vect[j]
    
    senty = sentences_label[i+nb_sequenced_sentences]
    vecty = sentences_vector[i+nb_sequenced_sentences]
    if i % t == 0: print("  y vector for this sequence ", senty, ": (vector dim = ", len(vecty), ")")
    for j in range(len(vecty)):
        y_train[i, j] = vecty[j]

print(X_train.shape, y_train.shape)

new sequence:  0
   1 th vector for this sequence. Sentence  ID0 (vector dim =  500 )
   2 th vector for this sequence. Sentence  ID1 (vector dim =  500 )
   3 th vector for this sequence. Sentence  ID2 (vector dim =  500 )
   4 th vector for this sequence. Sentence  ID3 (vector dim =  500 )
   5 th vector for this sequence. Sentence  ID4 (vector dim =  500 )
   6 th vector for this sequence. Sentence  ID5 (vector dim =  500 )
   7 th vector for this sequence. Sentence  ID6 (vector dim =  500 )
   8 th vector for this sequence. Sentence  ID7 (vector dim =  500 )
   9 th vector for this sequence. Sentence  ID8 (vector dim =  500 )
   10 th vector for this sequence. Sentence  ID9 (vector dim =  500 )
   11 th vector for this sequence. Sentence  ID10 (vector dim =  500 )
   12 th vector for this sequence. Sentence  ID11 (vector dim =  500 )
   13 th vector for this sequence. Sentence  ID12 (vector dim =  500 )
   14 th vector for this sequence. Sentence  ID13 (vector dim =  500 )
   15 th

In [8]:
from __future__ import print_function
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, Flatten, Bidirectional, Input, LSTM
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy, mean_squared_error, mean_absolute_error, logcosh
from keras.layers.normalization import BatchNormalization

def bidirectional_lstm_model(seq_length, vector_dim):
    print('Building LSTM model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vector_dim)))
    model.add(Dropout(0.5))
    model.add(Dense(vector_dim))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='logcosh', optimizer=optimizer, metrics=['acc'])
    print('LSTM model built.')
    return model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
rnn_size = 512 # size of RNN
vector_dim = 500
learning_rate = 0.0001 #learning rate

model_sequence = bidirectional_lstm_model(nb_sequenced_sentences, vector_dim)

Building LSTM model...
LSTM model built.


In [10]:
batch_size = 30 # minibatch size

callbacks=[EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_sequence_lstm.{epoch:02d}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', period=5)]

history = model_sequence.fit(X_train, y_train,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=40,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
model_sequence.save(save_dir + "/" + 'my_model_sequence_lstm.final.hdf5')

Train on 282 samples, validate on 32 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

Epoch 00005: saving model to save/my_model_sequence_lstm.05.hdf5
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40

Epoch 00010: saving model to save/my_model_sequence_lstm.10.hdf5
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40

Epoch 00015: saving model to save/my_model_sequence_lstm.15.hdf5
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40

Epoch 00020: saving model to save/my_model_sequence_lstm.20.hdf5
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40

Epoch 00025: saving model to save/my_model_sequence_lstm.25.hdf5


In [1]:
from __future__ import print_function
import numpy as np
import os
import scipy
from six.moves import cPickle

In [2]:
save_dir = 'save' # directory to store models

In [3]:
#import spacy, and french model
import spacy
nlp = spacy.load('fr')

In [4]:
#import gensim library
import gensim
from gensim.models.doc2vec import LabeledSentence

#load the doc2vec model
print("loading doc2Vec model...")
d2v_model = gensim.models.doc2vec.Doc2Vec.load('./data/doc2vec.w2v')

print("model loaded!")

loading doc2Vec model...
model loaded!


In [5]:
#load vocabulary
print("loading vocabulary...")
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

with open(os.path.join(save_dir, 'words_vocab.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)
print("vocabulary loaded !")

loading vocabulary...
vocabulary loaded !


In [6]:
from keras.models import load_model
# load the keras models
print("loading word prediction model...")
model = load_model(save_dir + "/" + 'my_model_gen_sentences_lstm.final.hdf5')
print("model loaded!")
print("loading sentence selection model...")
model_sequence = load_model(save_dir + "/" + 'my_model_sequence_lstm.final.hdf5')
print("model loaded!")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


loading word prediction model...
model loaded!
loading sentence selection model...
model loaded!


In [7]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [8]:
def create_seed(seed_sentences,nb_words_in_seq=20, verbose=False):
    #initiate sentences
    generated = ''
    sentence = []
    
    #fill the sentence with a default word
    for i in range (nb_words_in_seq):
        sentence.append("le")

    seed = seed_sentences.split()
    
    if verbose == True : print("seed: ",seed)

    for i in range(len(sentence)):
        sentence[nb_words_in_seq-i-1]=seed[len(seed)-i-1]
        #print(i, sentence)

    generated += ' '.join(sentence)
    
    if verbose == True : print('Generating text with the following seed: "' + ' '.join(sentence) + '"')

    return [generated, sentence]

In [9]:
def generate_phrase(sentence, max_words = 50, nb_words_in_seq=20, temperature=1, verbose = False):
    generated = ""
    words_number = max_words - 1
    ponctuation = [".","?","!",":","…"]
    seq_length = nb_words_in_seq
    #sentence = []
    is_punct = False
    
    #generate the text
    for i in range(words_number):
        #create the vector
        x = np.zeros((1, seq_length, vocab_size))
        for t, word in enumerate(sentence):
            #print(t, word, vocab[word])
            x[0, nb_words_in_seq-len(sentence)+t, vocab[word]] = 1.
        #print(x.shape)

        #calculate next word
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = vocabulary_inv[next_index]
        
        if verbose == True:
            predv = np.array(preds)
            #arr = np.array([1, 3, 2, 4, 5])
            wi = predv.argsort()[-3:][::-1]
            print("potential next words: ", vocabulary_inv[wi[0]], vocabulary_inv[wi[1]], vocabulary_inv[wi[2]])

        #add the next word to the text
        if is_punct == False:
            if next_word in ponctuation:
                is_punct = True
            generated += " " + next_word
            # shift the sentence by one, and and the next word at its end
            sentence = sentence[1:] + [next_word]

    return(generated, sentence)

In [10]:
def define_phrases_candidates(sentence, max_words = 50,\
                              nb_words_in_seq=20, \
                              temperature=1, \
                              nb_candidates_sents=10, \
                              verbose = False):
    phrase_candidate = []
    generated_sentence = ""
    for i in range(nb_candidates_sents):
        generated_sentence, new_sentence = generate_phrase(sentence, \
                                                           max_words = max_words, \
                                                           nb_words_in_seq = nb_words_in_seq, \
                                                           temperature=temperature, \
                                                           verbose = False)
        phrase_candidate.append([generated_sentence, new_sentence])
    
    if verbose == True :
        for phrase in phrase_candidate:
            print("   " , phrase[0])
    return phrase_candidate

In [11]:
def create_sentences(doc):
    ponctuation = [".","?","!",":","…"]
    sentences = []
    sent = []
    for word in doc:
        if word.text not in ponctuation:
            if word.text not in ("\n","\n\n",'\u2009','\xa0'):
                sent.append(word.text.lower())
        else:
            sent.append(word.text.lower())
            if len(sent) > 1:
                sentences.append(sent)
            sent=[]
    return sentences

In [12]:
def generate_training_vector(sentences_list, verbose = False):
    if verbose == True : print("generate vectors for each sentence...")
    seq = []
    V = []

    for s in sentences_list:
        #infer the vector of the sentence, from the doc2vec model
        v = d2v_model.infer_vector(create_sentences(nlp(s))[0], alpha=0.001, min_alpha=0.001, steps=10000)
    #create the vector array for the model
        V.append(v)
    V_val=np.array(V)
    #expand dimension to fit the entry of the model : that's the training vector
    V_val = np.expand_dims(V_val, axis=0)
    if verbose == True : print("Vectors generated!")
    return V_val

In [13]:
def select_next_phrase(model, V_val, candidate_list, verbose=False):
    sims_list = []
    
    #calculate prediction
    preds = model.predict(V_val, verbose=0)[0]
    
    #calculate vector for each candidate
    for candidate in candidate_list:
        #calculate vector
        #print("calculate vector for : ", candidate[1])
        V = np.array(d2v_model.infer_vector(candidate[1]))
        #calculate csonie similarity
        sim = scipy.spatial.distance.cosine(V,preds)
        #populate list of similarities
        sims_list.append(sim)
    
    #select index of the biggest similarity
    m = max(sims_list)
    index_max = sims_list.index(m)
    
    if verbose == True :
        print("selected phrase :")
        print("     ", candidate_list[index_max][0])
    return candidate_list[index_max]

In [14]:
def generate_paragraphe(phrase_seed, sentences_seed, \
                        max_words = 50, \
                        nb_words_in_seq=20, \
                        temperature=1, \
                        nb_phrases=30, \
                        nb_candidates_sents=10, \
                        verbose=True):
    
    sentences_list = sentences_seed
    sentence = phrase_seed   
    text = []
    
    for p in range(nb_phrases):
        if verbose == True : print("")
        if verbose == True : print("#############")
        print("phrase ",p+1, "/", nb_phrases)
        if verbose == True : print("#############")       
        if verbose == True:
            print('Sentence to generate phrase : ')
            print("     ", sentence)
            print("")
            print('List of sentences to constrain next phrase : ')
            print("     ", sentences_list)
            print("")
    
        #generate seed training vector
        V_val = generate_training_vector(sentences_list, verbose = verbose)

        #generate phrase candidate
        if verbose == True : print("generate phrases candidates...")
        phrases_candidates = define_phrases_candidates(sentence, \
                                                       max_words = max_words, \
                                                       nb_words_in_seq = nb_words_in_seq, \
                                                       temperature=temperature, \
                                                       nb_candidates_sents=nb_candidates_sents, \
                                                       verbose = verbose)
        
        if verbose == True : print("select next phrase...")
        next_phrase = select_next_phrase(model_sequence, \
                                         V_val,
                                         phrases_candidates, \
                                         verbose=verbose)
        
        print("Next phrase: ",next_phrase[0])
        if verbose == True :
            print("")
            print("Shift phrases in sentences list...")
        for i in range(len(sentences_list)-1):
            sentences_list[i]=sentences_list[i+1]

        sentences_list[len(sentences_list)-1] = next_phrase[0]
        
        if verbose == True:
            print("done.")
            print("new list of sentences :")
            print("     ", sentences_list)     
        sentence = next_phrase[1]
        
        text.append(next_phrase[0])
    
    return text

In [15]:
s1 = "Un mois après la mort de leur amie d’enfance, "
s2 = "quatre jeunes femmes à l’aube de la trentaine se réunissent dans une maison de campagne."
s3 = "Elles posent sur la table le journal intime de la défunte ."
s4 = "À travers les mots de Coco, ces filles aussi franches que différentes plongent dans leurs souvenirs : "
s5 = "de la naissance de leur amitié à leur découverte de l’amour, de la sexualité et de la vie ."
s6 = "Coco, c’est une rencontre avec une gang de filles ."
s7 = "Une comédie dramatique qui se penche ouvertement sur nos différents rapports à l’amour : "
s8 = "de la naïveté à la sexualité précoce, de l’abstinence aux aventures débridées, du romantisme à l’infidélité, des désirs aux désillusions en passant par l’homosexualité, la solitude, l’image corporelle et, surtout, le rêve de la maternité ."
s9 = "Au fil des ans, sans trop s’en apercevoir, ces amies ont tissé entre elles la plus solide des relations amoureuses, "
s10 = "celle qui survit au-delà de la mort ."
s11 = "s' écrie le jeune homme ."
s12 = "Premier texte de Nathalie Doummar, "
s13 = "Coco faisait salle comble lors de sa création sur la scène de La Petite Licorne à l’hiver 2016 ."
s14 = "Mathieu Quesnel, qui a précédemment signé la mise en scène de L’amour est un dumpling présenté dans le cadre des 5 à 7 de La Licorne, "
s15 = "dirige ce quintette d’actrices ."


In [16]:
sentences_list = [s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15]
print(sentences_list)

['Un mois après la mort de leur amie d’enfance, ', 'quatre jeunes femmes à l’aube de la trentaine se réunissent dans une maison de campagne.', 'Elles posent sur la table le journal intime de la défunte .', 'À travers les mots de Coco, ces filles aussi franches que différentes plongent dans leurs souvenirs\xa0: ', 'de la naissance de leur amitié à leur découverte de l’amour, de la sexualité et de la vie .', 'Coco, c’est une rencontre avec une gang de filles .', 'Une comédie dramatique qui se penche ouvertement sur nos différents rapports à l’amour\xa0: ', 'de la naïveté à la sexualité précoce, de l’abstinence aux aventures débridées, du romantisme à l’infidélité, des désirs aux désillusions en passant par l’homosexualité, la solitude, l’image corporelle et, surtout, le rêve de la maternité .', 'Au fil des ans, sans trop s’en apercevoir, ces amies ont tissé entre elles la plus solide des relations amoureuses, ', 'celle qui survit au-delà de la mort .', "s' écrie le jeune homme .", 'Premi

In [17]:
phrase_seed, sentences_seed = create_seed(s1 + " " + s2 + " " +\
                                          s3 + " " + s4+ " " + s5 + " " +\
                                          s6 + " " + s7 + " " + s8 + " " +\
                                          s9+ " " + s10 + " " + s11 + " " +\
                                          s12 + " " + s13 + " " + s14+ " " + s15,20)
print(phrase_seed)
print(sentences_seed)

L’amour est un dumpling présenté dans le cadre des 5 à 7 de La Licorne, dirige ce quintette d’actrices .
['L’amour', 'est', 'un', 'dumpling', 'présenté', 'dans', 'le', 'cadre', 'des', '5', 'à', '7', 'de', 'La', 'Licorne,', 'dirige', 'ce', 'quintette', 'd’actrices', '.']


In [18]:
text = generate_paragraphe(sentences_seed, sentences_list, \
                           max_words = 80, \
                           nb_words_in_seq = 30,\
                           temperature=0.201, \
                           nb_phrases=5, \
                           nb_candidates_sents=7, \
                           verbose=False)

phrase  1 / 5


IndexError: list index out of range

In [None]:
print("generated text: ")
for t in text:
    print(t)