Reference Links: 
* Character and Word Language Models
    * https://stackabuse.com/python-for-nlp-developing-an-automatic-text-filler-using-n-grams/

**Note**: <br> Neural language models definitely perform best for this kind of task. <br> Here, however, we look at how to build decent character-level language models using only tri-grams and bi-grams.

In [73]:
import numpy as np
import pandas as pd
import random
from collections import Counter

**Objective**: *To randomly generate new dinosaur names*

###### Understanding the data

The data, a list of dinosaur names, is taken from Week 1 of the Deeplearning.ai Sequence Models course on Coursera.

In [74]:
# Read the raw dinosaur-name file; it has no header row, so pandas must not
# treat the first line as column labels.
names = pd.read_csv(
    '../Data/dinos.txt',
    header=None,
    index_col=False,
)

In [75]:
# Label the single column, then lower-case every name so that the character
# statistics built later are case-insensitive.
names.columns = ['dino_full_name']
names = names.assign(
    dino_full_name=lambda frame: frame['dino_full_name'].str.lower()
)

In [76]:
# Split every name into a stem (everything before the last six characters)
# and a six-character suffix.
# The vectorised `.str` slice replaces the per-row lambda and propagates
# missing values as NaN instead of raising a TypeError.
# Names shorter than six characters get an empty stem and keep the whole
# name as their suffix (ordinary slice semantics).
SUFFIX_LENGTH = 6
names['dino_full_name_first'] = names['dino_full_name'].str[:-SUFFIX_LENGTH]
names['dino_full_name_last'] = names['dino_full_name'].str[-SUFFIX_LENGTH:]

In [77]:
# Peek at the first five rows to sanity-check the stem / suffix split.
names.head(5)

Unnamed: 0,dino_full_name,dino_full_name_first,dino_full_name_last
0,aachenosaurus,aacheno,saurus
1,aardonyx,aa,rdonyx
2,abdallahsaurus,abdallah,saurus
3,abelisaurus,abeli,saurus
4,abrictosaurus,abricto,saurus


In [78]:
# Frequency of each six-character ending, 20 most common shown.
# Most dinosaur names end in one of the ten most frequent suffixes.
names['dino_full_name_last'].value_counts().head(20)

saurus    713
ratops     67
raptor     50
enator     29
suchus     22
ndylus     14
pteryx     14
atitan     12
ephale     11
nathus     11
opelta     10
omimus      9
otitan      9
rannus      8
lophus      7
nglong      7
anlong      7
lestes      6
hoides      6
asaura      6
Name: dino_full_name_last, dtype: int64

In [79]:
# Spot-check a single suffix: how many names end in "lestes"?
names['dino_full_name_last'].value_counts().loc['lestes']

6

In [80]:
# Keep one entry per dinosaur whose suffix is among the ten most common ones.
# Duplicates are retained on purpose: sampling uniformly from this list later
# reproduces the empirical suffix frequencies.
# The frequency table is computed once and turned into a set; the original
# recomputed `value_counts()` for every single row of the frame.
TOP_SUFFIX_COUNT = 10
top_suffixes = set(names['dino_full_name_last'].value_counts().index[:TOP_SUFFIX_COUNT])
dino_last_names = [
    last_name
    for last_name in names['dino_full_name_last']
    if last_name in top_suffixes
]

In [81]:
# Confirm that only the retained suffixes are left, with their frequencies.
pd.Series(dino_last_names).value_counts(ascending=False)

saurus    713
ratops     67
raptor     50
enator     29
suchus     22
ndylus     14
pteryx     14
atitan     12
nathus     11
ephale     11
dtype: int64

In [82]:
# Character bigrams of every name stem: one inner list per name, made of
# (char, following_char) tuples.
char_bigrams_list = [
    [(stem[pos], stem[pos + 1]) for pos in range(len(stem) - 1)]
    for stem in names['dino_full_name_first']
]
# Flatten the per-name lists into a single list of bigrams.
char_bigrams_list_unfurl = [
    bigram
    for bigrams_of_one_name in char_bigrams_list
    for bigram in bigrams_of_one_name
]

In [83]:
# First few bigrams, as a quick sanity check.
char_bigrams_list_unfurl[:5]

[('a', 'a'), ('a', 'c'), ('c', 'h'), ('h', 'e'), ('e', 'n')]

In [84]:
# Total number of character bigrams collected across all name stems.
len(char_bigrams_list_unfurl)

7668

In [85]:
# Character trigrams of every name stem: one inner list per name, made of
# (char, next_char, next_next_char) tuples.
char_trigrams_list = [
    [(stem[pos], stem[pos + 1], stem[pos + 2]) for pos in range(len(stem) - 2)]
    for stem in names['dino_full_name_first']
]
# Flatten the per-name lists into a single list of trigrams.
char_trigrams_list_unfurl = [
    trigram
    for trigrams_of_one_name in char_trigrams_list
    for trigram in trigrams_of_one_name
]

In [86]:
# Show the first few trigrams followed by the total trigram count.
print(char_trigrams_list_unfurl[:5], len(char_trigrams_list_unfurl), sep="\n")

[('a', 'a', 'c'), ('a', 'c', 'h'), ('c', 'h', 'e'), ('h', 'e', 'n'), ('e', 'n', 'o')]
6180


In [87]:
# Opening two characters of every stem that has at least two characters;
# these are the seeds from which new names are started.
list_first_2_chars = [
    stem[:2]
    for stem in names['dino_full_name_first']
    if len(stem) >= 2
]

In [90]:
# Generate ten new dinosaur names.
#
# Each name is built as: a two-character seed copied from a real name, a run
# of characters sampled from the character n-gram statistics, and finally a
# suffix drawn from `dino_last_names`.
# For every new character the model backs off in three steps:
#   1. trigram: characters seen after the last two characters,
#   2. bigram:  characters seen after the last character,
#   3. any character that occurs in the name stems.

# Index the n-grams once so each lookup is a dictionary access instead of a
# scan over the whole n-gram list. Followers keep their duplicates and their
# original order, so uniform sampling still follows the observed frequencies.
trigram_followers = {}
for first_char, second_char, third_char in char_trigrams_list_unfurl:
    trigram_followers.setdefault((first_char, second_char), []).append(third_char)

bigram_followers = {}
for first_char, second_char in char_bigrams_list_unfurl:
    bigram_followers.setdefault(first_char, []).append(second_char)

# Last-resort pool. The original cell used `char_split` here, which no
# visible cell defines (NameError on a fresh kernel); every character of the
# name stems is used instead.
fallback_chars = [char for stem in names['dino_full_name_first'] for char in stem]

new_dino_name_length = np.random.randint(low=7, high=15, size=10)
for length in new_dino_name_length:
    # Store the seed as two separate characters. Keeping it as one
    # two-character string made `created_dino_name[-2:]` return
    # ['ab', 'c'] after the first append, so the trigram lookup could never
    # match on that step.
    created_dino_name = list(random.choice(list_first_2_chars))
    # `length - 6` presumably reserves room for the six-character suffix.
    # NOTE: the two seed characters are not subtracted, so the finished name
    # is `length + 2` characters long (behaviour kept as in the original).
    for step in range(length - 6):
        current_seq = created_dino_name[-2:]
        next_char_possibilities = trigram_followers.get(tuple(current_seq))
        if not next_char_possibilities:
            next_char_possibilities = bigram_followers.get(current_seq[1])
        if next_char_possibilities:
            next_char = random.choice(next_char_possibilities)
        else:
            print("took from overall list")
            next_char = random.choice(fallback_chars)
        created_dino_name.append(next_char)
    created_dino_name_str = "".join(created_dino_name) + random.choice(dino_last_names)
    print(created_dino_name_str)

broplisaurus
prorkhorpsaurus
jintagasaurus
carsaurus
gapophongasaurus
teglangsaurus
borudomachsaurus
zheisaurus
horachillisaurus
aveiancsaurus
