### Using a neural network to generate French commune names

Adapted from the Keras LSTM text-generation example script:
https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

Data source:
https://www.insee.fr/fr/information/2114819

In [24]:
import pandas as pd
import numpy as np
from IPython.display import HTML

from __future__ import print_function
import keras
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, RNN, SimpleRNNCell, SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import random
import sys
import io

In [26]:
# Load the commune names: no header row, file decoded as Latin-1.
communes = pd.read_csv('communes.txt', header=None, encoding='latin')

# Column 0 holds the names. Joining them with newlines turns the end of a
# name into an ordinary character of the corpus.
names = communes[0].values
text = '\n'.join(names)

# Character vocabulary, sorted so that the char <-> index mappings are
# reproducible from one run to the next.
chars = sorted(set(text))
print('total chars: {}'.format(len(chars)))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

total chars: 76


In [27]:
# Quick sanity check of the corpus before building training sequences.
n_corpus_chars = len(text)
n_names = len(names)
print('Corpus length:', n_corpus_chars, 'lines:', n_names)
print('First 10 lines:', names[:10])
print('Number of unique chars:', len(chars))

Corpus length: 457275 lines: 35885
First 10 lines: ["L'Abergement-Clémenciat" "L'Abergement-de-Varey" 'Ambérieu-en-Bugey'
 'Ambérieux-en-Dombes' 'Ambléon' 'Ambronay' 'Ambutrix' 'Andert-et-Condon'
 'Anglefort' 'Apremont']
Number of unique chars: 76


In [28]:
# Cut the text into semi-redundant sequences of maxlen characters.
# Each training example is a window of `maxlen` characters; its target is the
# single character that immediately follows the window. A new window starts
# every `step` characters, so consecutive windows overlap.
maxlen = 10
step = 3

window_starts = range(0, len(text) - maxlen, step)
sentences = [text[i: i + maxlen] for i in window_starts]
next_chars = [text[i + maxlen] for i in window_starts]
print('Number of sequences:', len(sentences))
print('First 10 sequences and next chars:')
# Slicing shows at most 10 pairs and does not fail when the corpus yields
# fewer than 10 sequences.
for sentence, next_char in zip(sentences[:10], next_chars[:10]):
    print('[{}]:[{}]'.format(sentence, next_char))

Number of sequences: 152422
First 10 sequences and next chars:
[L'Abergeme]:[n]
[bergement-]:[C]
[gement-Clé]:[m]
[ent-Clémen]:[c]
[-Clémencia]:[t]
[émenciat
L]:[']
[nciat
L'Ab]:[e]
[at
L'Aberg]:[e]
[L'Abergeme]:[n]
[bergement-]:[d]


In [29]:
print('Vectorization...')
# One-hot encode the data:
#   X[i, t, k] is True when character t of sequence i has vocabulary index k
#   y[i, k]    is True when the character following sequence i has index k
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

BYTES_PER_MB = 1024.0 * 1024.0
print('Size of X: {:.2f} MB'.format(X.nbytes / BYTES_PER_MB))
print('Size of y: {:.2f} MB'.format(y.nbytes / BYTES_PER_MB))

Vectorization...
Size of X: 110.47 MB
Size of y: 11.05 MB


In [30]:
# ### Initialization
#
# Now we are ready to create a recurrent model.  Keras contains three types of recurrent layers:
#
#  * `SimpleRNN`, a fully-connected RNN where the output is fed back to input.
#  * `LSTM`, the Long-Short Term Memory unit layer.
#  * `GRU`, the Gated Recurrent Unit layer.
#
# See https://keras.io/layers/recurrent/ for more information.

# Number of hidden units to use:
nb_units = 64

model = Sequential()

# Recurrent layers supported: SimpleRNN, LSTM, GRU.
# (GRU is not among the names imported in the import cell; import it from
# keras.layers before swapping it in.)
# The input is one window of `maxlen` one-hot encoded characters.
model.add(LSTM(nb_units, input_shape=(maxlen, len(chars))))

# To stack multiple RNN layers, all RNN layers except the last one need
# to have "return_sequences=True".  An example of using two RNN layers:
#model.add(SimpleRNN(16,
#                    input_shape=(maxlen, len(chars)),
#                    return_sequences=True))
#model.add(SimpleRNN(32))

# One output unit per vocabulary character; softmax turns the outputs into a
# probability distribution over the next character.
model.add(Dense(units=len(chars)))
model.add(Activation('softmax'))

# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` is kept to match the Keras version this notebook was run with --
# confirm before upgrading.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                36096     
_________________________________________________________________
dense_2 (Dense)              (None, 76)                4940      
_________________________________________________________________
activation_1 (Activation)    (None, 76)                0         
Total params: 41,036
Trainable params: 41,036
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result. Temperatures
    below 1 sharpen the distribution (more conservative samples), temperatures
    above 1 flatten it (more diverse samples).

    Args:
        preds: 1-D array-like of probabilities. Zero entries are allowed and
            are never selected.
        temperature: strictly positive scaling factor.

    Returns:
        The sampled index (NumPy integer).

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(
            'temperature must be > 0, got {}'.format(temperature))

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is the correct logit for an impossible outcome, so only
    # the divide-by-zero RuntimeWarning is silenced, not the value.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the max before exponentiating: the normalised result is the
    # same, but very low temperatures no longer underflow every entry to 0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [31]:
class SampleResult(keras.callbacks.Callback):
    """Keras callback that prints generated text at the end of every epoch.

    After each epoch a random window of ``maxlen`` characters is taken from
    ``text`` as the seed. For every diversity (sampling temperature) the model
    is then asked, one character at a time, to continue that seed, and the
    result is streamed to stdout.

    Relies on the notebook-level names ``text``, ``maxlen``, ``chars``,
    ``char_indices``, ``indices_char`` and ``sample``.

    Args:
        diversities: temperatures passed to ``sample``; one generation run is
            made per value.
        n_chars: number of characters generated after the seed in each run.
    """

    def __init__(self, diversities=(0.2, 0.5, 1.0, 1.2), n_chars=100):
        super(SampleResult, self).__init__()
        self.diversities = diversities
        self.n_chars = n_chars

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print sample text; ``epoch`` and ``logs`` are unused."""
        # `logs` defaults to None rather than `{}` so that no mutable default
        # object is shared between calls.
        start_index = random.randint(0, len(text) - maxlen - 1)
        seed = text[start_index: start_index + maxlen]

        for diversity in self.diversities:
            # Every diversity restarts from the same seed so runs are comparable.
            sentence = seed
            print()
            print('----- Generating with diversity',
                  diversity, 'seed: "' + sentence + '"')
            sys.stdout.write(sentence)

            for _ in range(self.n_chars):
                # One-hot encode the current window as a batch of size 1.
                x = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = self.model.predict(x, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                # Slide the window: drop the oldest char, append the new one.
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
        print('\n\n')


sample_callback = SampleResult()

In [34]:
# Train the model; `sample_callback` prints generated text after every epoch.
history = model.fit(
    X, y,
    batch_size=512,
    epochs=50,
    callbacks=[sample_callback],
    verbose=2,
)

Epoch 1/50
 - 7s - loss: 1.5728

----- Generating with diversity 0.2 seed: "snes-les-A"
snes-les-Amens
Le Chapelle
Champagne
Chanans
Charon
Charennes
Charney-le-Petit
Saint-Priein-de-Charenne
Saint-
----- Generating with diversity 0.5 seed: "snes-les-A"
snes-les-Ameraulie
Aubembre
Aulling
Auguin
Auberten
Anneville-en-Val
Le Chapelle
Chamanting
Valennes
Fresnes-F
----- Generating with diversity 1.0 seed: "snes-les-A"
snes-les-Ampunet
Buutoude
Bunischedeu
Boulières
Bouieu
Boulosseron
Le Poulic
Pouronne
Guranc
Buzey
Bureau
Char
----- Generating with diversity 1.2 seed: "snes-les-A"
snes-les-Andel
Noèrele-la-Grain
Noouzaret
Norval
Ousysheieu
Evelou
Theffas
Trécaul
Toncy-en
Almane
Aubasclay
M


Epoch 2/50
 - 7s - loss: 1.5591

----- Generating with diversity 0.2 seed: "Farébersvi"
Farébersviller
Le Pentilles
Poussay-le-Bois
Saint-Martin-de-la-Pois
Saint-Martin-de-Lande
Saint-Christol
Saint
----- Generating with diversity 0.5 seed: "Farébersvi"
Farébersviller
Villers-les-Bérier
Le Marie-de-L

  after removing the cwd from sys.path.


a T-Arnignay-lèz-Saint-Fréifrencourt
Lemilly
Les Enzucychiincol
Éb


Epoch 3/50
 - 7s - loss: 1.5463

----- Generating with diversity 0.2 seed: "en
Samadet"
en
Samadet
Sarnes
Saint-Martin-de-Rivière
Saint-Germain-en-Brie
Bourg-sur-Mer
Saint-Martin-de-Charney
Saint-Je
----- Generating with diversity 0.5 seed: "en
Samadet"
en
Samadet
Sarnay-le-Grance
La Velle-Cardaine
Marcy-sur-Arbe
Charbion
Chizac
Charmont-les-Bois
Saint-Martin-de
----- Generating with diversity 1.0 seed: "en
Samadet"
en
Samadet
Serlevier
Saint-Crémont-sur-Cher
Vermichier
Siennes
Siches
Soyoyer
Rermonqueville
Saint-Aubin-le-Ch
----- Generating with diversity 1.2 seed: "en
Samadet"
en
Samadet
Sacaluyel
Villels
Vizevat-Téréond
Lenjouzot
Jubrac
Jampagnec
Stégouze
Vientiouss
Brécofes
Bray-en-


Epoch 4/50
 - 7s - loss: 1.5358

----- Generating with diversity 0.2 seed: "ourt
Macog"
ourt
Macoge
Marey
Marsac
Marches
Marchare
Marches
Marches
Marches-de-Lande
Saint-Martin-de-Carbert
Saint-Marti
----- Generating with diversity 0

Saint-André
Saint-Aubin-du-Sech
Saint-Martin-du-Bois
Saint-André-de-Bourg
Guingeat
Le Cournes
Courn
----- Generating with diversity 1.0 seed: "n-Yvelines"
n-Yvelines
Praisencelle
Aintiers
Anten
Locaride-Biré
La Grandbeuille
Andourt
Auvré
Auganville
Habey-et-Torquan
----- Generating with diversity 1.2 seed: "n-Yvelines"
n-Yvelines
Pons
Holergues-Eruan-sur-Martse
Aunerges
Autun
Aviaqueff
Vénoule
Sevelemmes
Saint-Sébace
Megnia-Ciz


Epoch 27/50
 - 8s - loss: 1.4285

----- Generating with diversity 0.2 seed: "e
La Goula"
e
La Goulain-sur-Seille
Armeuil-sur-Mer
Saint-Georges-de-la-Forters
Saint-Martin-de-Beaumain
Saint-Martin-de-C
----- Generating with diversity 0.5 seed: "e
La Goula"
e
La Goulaimbon
Warmont
Warssan
Wacquers
Wary-la-Chapelle
Perrigneux
Le Pin
Pray-Chalaine
Le Thieu
Les Losses

----- Generating with diversity 1.0 seed: "e
La Goula"
e
La Goulaye
Foussé
Gomivéles-lès-Belle
Villeflafotte
Le Fores
Fouheillem
Norrefont
Montép
Montlestatt
Rois-le
----- Generating with diversity 1.

Villeneuve-le-Château
Beaucourt
Beaumont-Fontain-Courbonne
Courville
Cournay
Courcelles
Courcelles
Cussy
----- Generating with diversity 0.5 seed: "queux
Vill"
queux
Villes
Verneux
Verdut
Vesnes
Vernel-sur-Couand
Saint-Marcel-de-Marthe
Saint-André
Saint-Martin-d'Arche
S
----- Generating with diversity 1.0 seed: "queux
Vill"
queux
Villevier
Vieillod
Vivigny-de-Méarne
Saint-Nicolas
L'Angle-Laussonnac
Pourvières-la-Valvage
Champpey-Sal
----- Generating with diversity 1.2 seed: "queux
Vill"
queux
Villeur
Vrouilly
Crény
Craharcon
Clasceaumont-Harinard
Paisnes
La Beaunier
Beaupongy
Chapeyrouse
Ferragn


Epoch 39/50
 - 7s - loss: 1.4081

----- Generating with diversity 0.2 seed: "quoy
Buire"
quoy
Buire
Bures
Buresse
Buret
Bure
Buret
Buret
Bures
Bures
Buresse
Buresse
Burières
Bussy-sur-Meuze
Saint-Jea
----- Generating with diversity 0.5 seed: "quoy
Buire"
quoy
Buiremont
Brissy-de-Carbel
Saint-Jean-de-Vernes
Saint-André-en-Brierne
Montagne
Montfort-Pont-sur-Eure
Sa
----- Generating with diversi

Saint-Maurice-le-Pentaix
tesen-la-Palle
Vaci-Lampont
Villonge-Morand
Saint-Chésal
Labestiaud
Réchelin
Ridon



Epoch 50/50
 - 8s - loss: 1.3956

----- Generating with diversity 0.2 seed: "y
Bassenev"
y
Bassenevaude
La Chapelle-Saint-Amand-le-Grande
Montmaroux
Montreuil-le-Château
Saint-Martin-de-Bourg-le-Peti
----- Generating with diversity 0.5 seed: "y
Bassenev"
y
Basseneville
Reins
Risois-les-Porbaux
Benay
Beauval-Saint-Germain-de-Prétiez
Le Mardier-Sainte-Marie-d'Aurba
----- Generating with diversity 1.0 seed: "y
Bassenev"
y
Basseneville
Taillie
Trabron
Eisan
Eshernes
Estéguen
Essaisné
Fléonay-sur-Arbret
Béchant
Bellefondaise
Ville
----- Generating with diversity 1.2 seed: "y
Bassenev"
y
Basseneve
Warquier-le-Prigu
Centescac
Lelouss
Rasolsas-Eaux-Beaumot
Château-d'Auché
Vecombierguet
Moissand
M


