## Dinosaur name generation character by character using First order Hidden Markov model:

 In this notebook, I attempted to generate dinosaur name one character at a time using Hidden Markov Model. The dataset and some lines of codes were obtained from Prof Andrew Ng's DL course on Coursera. I'm using a HMM package called hmmlearn. It requires Python 2. The details of this package can be obtained here :
https://github.com/hmmlearn/hmmlearn

First of all, lets import all the packages. 

In [1]:
# import packages
import numpy as np
from hmmlearn import hmm

Then load the dataset:

In [2]:
# Load dataset
with open('dinos.txt','r') as f:
    data = f.read()
    
# Convert all characters to lower case
data = data.lower()

# Get all unique characters in the data, and saved them into a list
chars = list(set(data))

print('Vocab size :',len(chars))
print('Data size :',len(data))

('Vocab size :', 27)
('Data size :', 19909)


Since the HMM package can only crunch number. Two dictionaries were created.

char_to_ixchar_to : converts characters to numbers for HMM to train

ix_to_char : converts samples (in numbers) back to characters

In [3]:
char_to_ixchar_to  = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)
print(char_to_ixchar_to)

{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
{'\n': 0, 'a': 1, 'c': 3, 'b': 2, 'e': 5, 'd': 4, 'g': 7, 'f': 6, 'i': 9, 'h': 8, 'k': 11, 'j': 10, 'm': 13, 'l': 12, 'o': 15, 'n': 14, 'q': 17, 'p': 16, 's': 19, 'r': 18, 'u': 21, 't': 20, 'w': 23, 'v': 22, 'y': 25, 'x': 24, 'z': 26}


In [4]:
data = data.split('\n')
data

['aachenosaurus',
 'aardonyx',
 'abdallahsaurus',
 'abelisaurus',
 'abrictosaurus',
 'abrosaurus',
 'abydosaurus',
 'acanthopholis',
 'achelousaurus',
 'acheroraptor',
 'achillesaurus',
 'achillobator',
 'acristavus',
 'acrocanthosaurus',
 'acrotholus',
 'actiosaurus',
 'adamantisaurus',
 'adasaurus',
 'adelolophus',
 'adeopapposaurus',
 'aegyptosaurus',
 'aeolosaurus',
 'aepisaurus',
 'aepyornithomimus',
 'aerosteon',
 'aetonyxafromimus',
 'afrovenator',
 'agathaumas',
 'aggiosaurus',
 'agilisaurus',
 'agnosphitys',
 'agrosaurus',
 'agujaceratops',
 'agustinia',
 'ahshislepelta',
 'airakoraptor',
 'ajancingenia',
 'ajkaceratops',
 'alamosaurus',
 'alaskacephale',
 'albalophosaurus',
 'albertaceratops',
 'albertadromeus',
 'albertavenator',
 'albertonykus',
 'albertosaurus',
 'albinykus',
 'albisaurus',
 'alcovasaurus',
 'alectrosaurus',
 'aletopelta',
 'algoasaurus',
 'alioramus',
 'aliwalia',
 'allosaurus',
 'almas',
 'alnashetri',
 'alocodon',
 'altirhinus',
 'altispinax',
 'alvarez

In [5]:
sequences = []
lengths = []
for i in range(len(data)):
    temp = []
    for character in data[i]:
        temp.append(character)
    temp.append('\n')
    sequences.append(temp)
    lengths.append(len(temp))
sequences = np.concatenate(sequences)
print(len(lengths))
print(sequences.shape)
print(lengths)
print(sequences)

1536
(19910L,)
[14, 9, 15, 12, 14, 11, 12, 14, 14, 13, 14, 13, 11, 17, 11, 12, 15, 10, 12, 16, 14, 12, 11, 17, 10, 17, 12, 11, 12, 12, 12, 11, 14, 10, 14, 13, 13, 13, 12, 14, 16, 16, 15, 15, 13, 14, 10, 11, 13, 14, 11, 12, 10, 9, 11, 6, 11, 9, 11, 11, 14, 11, 11, 13, 13, 14, 13, 11, 13, 13, 17, 12, 12, 11, 12, 12, 11, 14, 12, 11, 14, 11, 12, 11, 13, 11, 14, 12, 17, 13, 11, 12, 13, 15, 13, 12, 14, 15, 13, 9, 11, 5, 11, 6, 9, 12, 12, 17, 9, 12, 12, 15, 16, 19, 14, 14, 12, 18, 19, 12, 12, 11, 12, 16, 13, 13, 13, 14, 12, 11, 16, 14, 13, 13, 13, 9, 12, 15, 15, 12, 13, 14, 11, 17, 13, 16, 11, 11, 9, 16, 15, 9, 14, 16, 14, 15, 13, 13, 12, 9, 12, 13, 9, 10, 7, 14, 13, 13, 11, 14, 14, 11, 7, 14, 12, 6, 17, 13, 9, 11, 14, 11, 9, 14, 10, 13, 13, 11, 11, 13, 11, 11, 15, 14, 12, 12, 8, 14, 11, 13, 12, 14, 15, 14, 15, 17, 12, 14, 7, 15, 16, 10, 12, 13, 14, 11, 10, 17, 14, 15, 18, 17, 13, 14, 18, 11, 15, 13, 14, 14, 12, 12, 13, 13, 18, 12, 13, 14, 12, 12, 13, 15, 13, 13, 16, 16, 14, 13, 16, 10, 12, 1

In [6]:
sequences_cvt = [char_to_ixchar_to[sequences[i]] for i in range(len(sequences))]
sequences_cvt

[1,
 1,
 3,
 8,
 5,
 14,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 1,
 18,
 4,
 15,
 14,
 25,
 24,
 0,
 1,
 2,
 4,
 1,
 12,
 12,
 1,
 8,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 2,
 5,
 12,
 9,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 2,
 18,
 9,
 3,
 20,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 2,
 18,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 2,
 25,
 4,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 3,
 1,
 14,
 20,
 8,
 15,
 16,
 8,
 15,
 12,
 9,
 19,
 0,
 1,
 3,
 8,
 5,
 12,
 15,
 21,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 3,
 8,
 5,
 18,
 15,
 18,
 1,
 16,
 20,
 15,
 18,
 0,
 1,
 3,
 8,
 9,
 12,
 12,
 5,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 3,
 8,
 9,
 12,
 12,
 15,
 2,
 1,
 20,
 15,
 18,
 0,
 1,
 3,
 18,
 9,
 19,
 20,
 1,
 22,
 21,
 19,
 0,
 1,
 3,
 18,
 15,
 3,
 1,
 14,
 20,
 8,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 3,
 18,
 15,
 20,
 8,
 15,
 12,
 21,
 19,
 0,
 1,
 3,
 20,
 9,
 15,
 19,
 1,
 21,
 18,
 21,
 19,
 0,
 1,
 4,
 1,
 13,
 1,
 14,
 20,
 9,
 19,
 1,
 21,
 18,
 21,
 19,


In [7]:
final_seqs = []
for stuff in sequences_cvt:
    final_seqs.append([stuff])
final_seqs = np.array(final_seqs)
print(final_seqs.shape)
print(final_seqs)

(19910L, 1L)
[[ 1]
 [ 1]
 [ 3]
 ...
 [21]
 [12]
 [ 0]]


In [8]:
model = hmm.MultinomialHMM(n_components = 20, n_iter=150)
model.fit(final_seqs,lengths)

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.transmat_),
  np.log(self.transmat_),
  np.log(self.transmat_),
  np.log(self.startprob_),
  np.log(self.startprob_),


MultinomialHMM(algorithm='viterbi', init_params='ste', n_components=20,
        n_iter=150, params='ste',
        random_state=<mtrand.RandomState object at 0x00000000069BA870>,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=False)

In [9]:
samples = model.sample(n_samples = 441)

In [10]:
samples = np.concatenate(samples[0])
#print(samples)
samples_cvt = [ix_to_char[samples[i]] for i in range(len(samples))]
samples_cvt = ''.join(samples_cvt)
samples_cvt = samples_cvt.split('\n')
print(samples_cvt)

['qortemlocoonronaps', 'chasaurus', 'tus', 'postosaurus', 'tonacus', 'cesaurus', 'fonghensa', 'cymaaasaurus', 'tosciliusaurus', 'coptargichosaurus', 'pogus', 'thus', 'tocs', 'toconatolokuenbanonathus', 'teriera', 'cs', 'thus', 'toractosaurus', 'ton', 'saurus', 'tasaurus', 'kann', 'pkon', '', 'con', 'connimosaurus', 'tus', 'cosaurus', 'tonghotaresaurus', 'topalson', 'tosaurus', 'talitophchopteanosaurus', 'tisaurus', 'chiosaurus', 'tosaurus', 'cenans', 'tosaurabnasaurus', 'songites', 'tosaurus', 'tostops', 'coplocheps', 'cosaurus', 'trocesaurus', 'thosaurus', 'rochiratyt']


Some of them really sounds like dinosaur name which ends with -saurus. For example, cesaurus, tosciliusaurus, cosaurus etc