In [1]:
from IPython.core.display import display, HTML

In [2]:
# Inject a CSS rule so the page's .container element spans the full width.
display(HTML("<style>.container {width:100% !important;}</style>"))

In [3]:
import pandas as pd
import re
import random

In [4]:
def fetch_url(url, fname):
    """Download the resource at ``url`` and write its raw bytes to ``fname``.

    Parameters
    ----------
    url : str
        Any URL understood by ``urllib.request.urlopen``.
    fname : str or path-like
        Destination file; it is created or overwritten in binary mode.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be opened.
    OSError
        If the destination file cannot be written.
    """
    # Imported locally: no cell in the notebook binds the ``req`` alias the
    # previous body relied on, so every call failed with NameError.
    from urllib.request import urlopen

    # Both resources are context-managed so the response and the file are
    # closed even when reading or writing fails part-way.
    with urlopen(url) as fin:
        data = fin.read()
    with open(fname, mode='wb') as fout:
        fout.write(data)

In [5]:
def from_file(fname, size=1, encoding='utf8'):
    """Read the text file ``fname`` and return a character-level ``Markov`` model.

    ``size`` is forwarded to ``Markov`` unchanged; ``encoding`` is the codec
    used to decode the file.
    """
    with open(fname, encoding=encoding) as source:
        text = source.read()
    return Markov(text, size=size)

In [6]:
def from_file_words(fname, size=1, encoding='utf8'):
    """Read the text file ``fname`` and return a word-level ``MarkovWords`` model.

    ``size`` is forwarded to ``MarkovWords`` unchanged; ``encoding`` is the
    codec used to decode the file.
    """
    with open(fname, encoding=encoding) as source:
        text = source.read()
    return MarkovWords(text, size=size)

In [7]:
class Markov:
    """Character-level Markov model.

    ``tables[k]`` is the transition table for contexts of ``k + 1``
    characters, as produced by ``get_table``.
    """

    def __init__(self, data, size=1):
        """Build one transition table per context length from 1 to ``size``."""
        self.tables = [get_table(data, size=order) for order in range(1, size + 1)]

    def predict(self, txt):
        """Return a random follower of ``txt``, weighted by observed counts.

        The table is chosen by ``len(txt)``, so a context longer than the
        number of tables raises ``IndexError``. A context with no recorded
        followers raises ``KeyError``.
        """
        followers = self.tables[len(txt) - 1].get(txt, {})
        if not followers:
            raise KeyError('{} not found'.format(txt))
        # Repeat each follower once per observation so that a uniform choice
        # over the list is a count-weighted choice over the followers.
        weighted = [char for char, seen in followers.items() for _ in range(seen)]
        return random.choice(weighted)

In [8]:
def get_table(txt, size=1):
    """
    Build a character-level transition table for txt.

    Each key is a run of ``size`` consecutive characters; its value maps the
    character that immediately followed that run to the number of times it did.

    >>> get_table('ab')
    {'a': {'b': 1}}
    """
    table = {}
    for start in range(len(txt)):
        stop = start + size
        try:
            follower = txt[stop]
        except IndexError:
            # No character left after this window, so scanning is finished.
            break
        counts = table.setdefault(txt[start:stop], {})
        counts[follower] = counts.get(follower, 0) + 1
    return table

In [9]:
class MarkovWords:
    """Word-level Markov model.

    ``tables[k]`` is the transition table for contexts of ``k + 1`` words, as
    produced by ``get_table_words``. Keys are words joined by single spaces.
    """

    def __init__(self, data, size=1):
        """Build one transition table per context length from 1 to ``size``."""
        self.tables = []
        for i in range(size):
            self.tables.append(get_table_words(data, size=i+1))

    def combinations(self):
        """Return the list of transition tables, ordered by context length."""
        return self.tables

    def predict(self, txt):
        """Return a random follower of the context ``txt``, weighted by counts.

        The table is selected by the number of whitespace-separated words in
        ``txt``. An exact key match is preferred; only when there is none does
        the lookup fall back to the first key that contains the query as a
        case-insensitive substring.

        Raises
        ------
        IndexError
            If ``txt`` has more words than there are tables.
        KeyError
            If no key matches, or the match has no followers.
        """
        txt = txt.strip()
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        ws = re.split(r'\s+', txt)
        table = self.tables[len(ws)-1]  # select table by number of words in txt
        # Table keys are single-space joined, so normalise the query the same
        # way; otherwise 'the  one' could never match 'the one'.
        query = ' '.join(ws)
        if query in table:
            # Exact hit wins: a substring scan alone could pick an earlier,
            # longer key (asking for 'one' and getting 'someone').
            search_txt = query
        else:
            needle = query.lower()
            search_txt = next((key for key in table if needle in key.lower()), query)
        options = table.get(search_txt, {})
        if not options:
            raise KeyError('{} not found'.format(txt))
        # Repeat each follower once per observation so a uniform choice over
        # the list is a count-weighted choice over the followers.
        possibles = [word for word, count in options.items() for _ in range(count)]
        return random.choice(possibles)

In [10]:
def get_table_words(txt, size=1):
    """
    Returns a word-level transition table for txt.

    Each key is ``size`` consecutive words joined by single spaces; its value
    maps the word that immediately followed them to the number of times it did.
    Words are separated by any run of whitespace, and leading or trailing
    whitespace is ignored.

    >>> get_table_words('ab cd')
    {'ab': {'cd': 1}}
    """
    # str.split() with no argument splits on whitespace runs and never yields
    # empty strings. Splitting with re on '\s+' produced '' tokens for leading
    # or trailing whitespace, which then appeared as bogus contexts/followers.
    words = txt.split()
    results = {}
    for idx in range(len(words)):
        try:
            out = words[idx + size]
        except IndexError:
            # No word left after this window, so scanning is finished.
            break
        chars = ' '.join(words[idx:idx + size])
        followers = results.setdefault(chars, {})
        followers[out] = followers.get(out, 0) + 1
    return results

In [11]:
# Word-level model with tables for context lengths 1 through 4, built from a
# sentence in which 'the one' occurs twice with different followers.
w = MarkovWords('the one quick brown fox jumps over the one lazy dog', size=4)

In [12]:
# A two-word query selects the two-word table. 'the one' was followed by both
# 'quick' and 'lazy', so the value returned here can differ between runs.
w.predict('the one')

'lazy'

In [13]:
# Show every transition table held by the model, ordered by context length.
w.combinations()

[{'the': {'one': 2},
  'one': {'quick': 1, 'lazy': 1},
  'quick': {'brown': 1},
  'brown': {'fox': 1},
  'fox': {'jumps': 1},
  'jumps': {'over': 1},
  'over': {'the': 1},
  'lazy': {'dog': 1}},
 {'the one': {'quick': 1, 'lazy': 1},
  'one quick': {'brown': 1},
  'quick brown': {'fox': 1},
  'brown fox': {'jumps': 1},
  'fox jumps': {'over': 1},
  'jumps over': {'the': 1},
  'over the': {'one': 1},
  'one lazy': {'dog': 1}},
 {'the one quick': {'brown': 1},
  'one quick brown': {'fox': 1},
  'quick brown fox': {'jumps': 1},
  'brown fox jumps': {'over': 1},
  'fox jumps over': {'the': 1},
  'jumps over the': {'one': 1},
  'over the one': {'lazy': 1},
  'the one lazy': {'dog': 1}},
 {'the one quick brown': {'fox': 1},
  'one quick brown fox': {'jumps': 1},
  'quick brown fox jumps': {'over': 1},
  'brown fox jumps over': {'the': 1},
  'fox jumps over the': {'one': 1},
  'jumps over the one': {'lazy': 1},
  'over the one lazy': {'dog': 1}}]

In [14]:
# Sanity check: splitting on whitespace runs yields one item per word.
len(re.split('\s+', "ab cd"))

2

In [15]:
# Smallest case: a single context word and its single follower.
get_table_words("ab cd", size=1)

{'ab': {'cd': 1}}

In [16]:
# Two-word-context table for a sentence where 'the' precedes both 'one' and 'on2'.
words = get_table_words("the one quick brown fox jumps over the on2 lazy dog", size=2)

In [17]:
# Assemble by hand the per-size list of tables that MarkovWords.__init__
# builds in a loop, here for context sizes 1 and 2.
tables = []
words1 = get_table_words("the one quick brown fox jumps over the on2 lazy dog", size=1)
words2 = get_table_words("the one quick brown fox jumps over the on2 lazy dog", size=2)
tables.append(words1)
tables.append(words2)
tables

[{'the': {'one': 1, 'on2': 1},
  'one': {'quick': 1},
  'quick': {'brown': 1},
  'brown': {'fox': 1},
  'fox': {'jumps': 1},
  'jumps': {'over': 1},
  'over': {'the': 1},
  'on2': {'lazy': 1},
  'lazy': {'dog': 1}},
 {'the one': {'quick': 1},
  'one quick': {'brown': 1},
  'quick brown': {'fox': 1},
  'brown fox': {'jumps': 1},
  'fox jumps': {'over': 1},
  'jumps over': {'the': 1},
  'over the': {'on2': 1},
  'the on2': {'lazy': 1},
  'on2 lazy': {'dog': 1}}]

In [18]:
# Display the two-word-context table bound to `words`.
words

{'the one': {'quick': 1},
 'one quick': {'brown': 1},
 'quick brown': {'fox': 1},
 'brown fox': {'jumps': 1},
 'fox jumps': {'over': 1},
 'jumps over': {'the': 1},
 'over the': {'on2': 1},
 'the on2': {'lazy': 1},
 'on2 lazy': {'dog': 1}}

In [19]:
# Substring probe: is there a key containing 'over t'? any() is truthy here
# because the list holds the matching (non-empty) key strings.
any([x for x in words if 'over t' in x.lower()])

True

In [20]:
# First key containing the fragment 'the o' -- the same substring lookup that
# MarkovWords.predict performs.
[x for x in words if 'the o' in x.lower()][0]

'the one'

In [21]:
# Same substring match, but collecting the follower counts instead of the keys.
[value for key, value in words.items() if 'over t' in key.lower()]

[{'on2': 1}]

In [22]:
def repl(m):
    """Run an interactive prompt that prints ``m.predict(...)`` for each line.

    Parameters
    ----------
    m : object
        Any model exposing ``predict(txt)``. A ``KeyError`` from it is
        reported as an unknown context and an ``IndexError`` as a request to
        retry; other exceptions propagate.

    The loop ends on Ctrl-C or on end-of-input (Ctrl-D, exhausted stdin).
    """
    print("Welcome to the Markov REPL. (Hit Ctl-C to exit)")
    while True:
        try:
            txt = input('>')
        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError when the stream is closed; treat it like
            # Ctrl-C instead of letting the traceback end the session.
            print("Goodbye")
            break
        try:
            res = m.predict(txt)
        except KeyError:
            print("Word not found")
        except IndexError:
            # Raised when the query is longer than the model's largest context.
            print('Try again')
        else:
            print(res)

In [23]:
# Character-level model with tables for context lengths 1 through 3.
m = Markov('the quick brown fox jumps over the lazy dog', size=3)

In [24]:
# Single-character query, so the one-character table is used; the result is a
# random draw among the characters that followed 'o'.
m.predict('o')

'x'

In [25]:
# Word-level model (context lengths 1-4) read from pp.txt, resolved relative
# to the working directory.
# NOTE(review): nothing visible in this notebook creates pp.txt -- confirm
# where the file comes from.
pp = from_file_words('pp.txt', size=4)

In [26]:
# One-word query against the tables built from pp.txt.
pp.predict('What')

'can'

In [27]:
# Character-level table using two-character contexts.
get_table("the quick brown fox jumps over the lazy dog", size=2)

{'th': {'e': 2},
 'he': {' ': 2},
 'e ': {'q': 1, 'l': 1},
 ' q': {'u': 1},
 'qu': {'i': 1},
 'ui': {'c': 1},
 'ic': {'k': 1},
 'ck': {' ': 1},
 'k ': {'b': 1},
 ' b': {'r': 1},
 'br': {'o': 1},
 'ro': {'w': 1},
 'ow': {'n': 1},
 'wn': {' ': 1},
 'n ': {'f': 1},
 ' f': {'o': 1},
 'fo': {'x': 1},
 'ox': {' ': 1},
 'x ': {'j': 1},
 ' j': {'u': 1},
 'ju': {'m': 1},
 'um': {'p': 1},
 'mp': {'s': 1},
 'ps': {' ': 1},
 's ': {'o': 1},
 ' o': {'v': 1},
 'ov': {'e': 1},
 've': {'r': 1},
 'er': {' ': 1},
 'r ': {'t': 1},
 ' t': {'h': 1},
 ' l': {'a': 1},
 'la': {'z': 1},
 'az': {'y': 1},
 'zy': {' ': 1},
 'y ': {'d': 1},
 ' d': {'o': 1},
 'do': {'g': 1}}

In [28]:
# Word-level table with single-word contexts; 'one' has two distinct followers.
get_table_words("the one quick brown fox jumps over the one lazy dog", size=1)

{'the': {'one': 2},
 'one': {'quick': 1, 'lazy': 1},
 'quick': {'brown': 1},
 'brown': {'fox': 1},
 'fox': {'jumps': 1},
 'jumps': {'over': 1},
 'over': {'the': 1},
 'lazy': {'dog': 1}}

In [29]:
# Sample sentence used by the string-splitting experiments in the next cells.
a = "the one quick brown fox jumps over the one lazy dog"

In [30]:
# Tokenise on whitespace runs. This rebinds `words`, which an earlier cell
# bound to a transition table (a dict); from here on it is a list of words.
words = re.split('\s+', a)
words

['the',
 'one',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'the',
 'one',
 'lazy',
 'dog']

In [31]:
# Three-word window starting at index 0, written in the same idx:idx + size
# form that get_table_words uses.
chars = words[0:0 + 3]
chars

['the', 'one', 'quick']

In [32]:
# Join the window into the space-separated string used as a table key.
' '.join(chars)

'the one quick'

In [33]:
# maxsplit=2: the first two words are split off and the rest stays as one item.
l = a.split(" ", 2)

In [34]:
# Keep only the first two words of the limited split.
a.split(" ", 2)[:2]

['the', 'one']

In [35]:
# Re-join the first two words into a single context string.
" ".join(a.split(" ", 2)[:2])

'the one'

In [36]:
# Split on single spaces, giving one list item per word of `a`.
ll = a.split(" ")

In [37]:
# Print each word on its own line.
for i in ll:
    print(i)

the
one
quick
brown
fox
jumps
over
the
one
lazy
dog


In [38]:
# Print every adjacent (word, next word) pair, i.e. the transitions that a
# single-word-context table counts.
for i in range(len(ll)-1):
    print(ll[i], ll[i+1])

the one
one quick
quick brown
brown fox
fox jumps
jumps over
over the
the one
one lazy
lazy dog


In [39]:
class MarkovNumbers:
    """Transition-count tables over whitespace-separated numeric tokens.

    ``tables[k]`` is the table for contexts of ``k + 1`` tokens, as produced
    by ``get_table_numbers``. Unlike the text models, ``predict`` returns the
    raw follower counts instead of sampling, and further sequences can be
    merged in with ``add``.
    """

    def __init__(self, data, size=1, skipLast=0):
        """Build one table per context length from 1 to ``size``.

        ``skipLast`` is forwarded to ``get_table_numbers`` unchanged.
        """
        text = data.strip()  # strip() already removes trailing newlines
        self.tables = [
            get_table_numbers(text, size=order, skipLast=skipLast)
            for order in range(1, size + 1)
        ]

    def add(self, data, size=1, skipLast=0):
        """Merge the transitions found in ``data`` into the existing tables.

        Counts for contexts that already exist are summed, and that context's
        followers are re-ordered by descending count, then ascending numeric
        value. New contexts are stored as produced. Re-ordering converts
        follower tokens with ``int``, so non-numeric tokens raise
        ``ValueError`` when they are merged into an existing context.
        """
        text = data.strip()
        # Grow the table list on demand; indexing a missing table used to
        # raise IndexError whenever size exceeded the constructor's size.
        while len(self.tables) < size:
            self.tables.append({})
        for i in range(size):
            table = self.tables[i]
            incoming = get_table_numbers(text, size=i + 1, skipLast=skipLast)
            for context, followers in incoming.items():
                if context not in table:
                    table[context] = followers
                    continue
                merged = dict(table[context])
                for token, count in followers.items():
                    merged[token] = int(merged.get(token, 0)) + int(count)
                table[context] = dict(
                    sorted(merged.items(), key=lambda kv: (-int(kv[1]), int(kv[0])))
                )

    def combinations(self):
        """Return the list of transition tables, ordered by context length."""
        return self.tables

    def predict(self, txt):
        """Return the ``{follower: count}`` dict recorded for context ``txt``.

        The table is selected by the number of whitespace-separated tokens in
        ``txt``. An unknown context yields an empty dict. A context with more
        tokens than there are tables raises ``IndexError``.
        """
        txt = txt.strip()
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        ws = re.split(r'\s+', txt)
        table = self.tables[len(ws) - 1]
        return table.get(txt, {})

    def t_matrix(self, level=0):
        """Return the transition counts of ``tables[level]`` as a DataFrame.

        Rows (index named ``'id'``) are contexts and columns are follower
        tokens, both sorted as strings. Cells hold the count as a float, or
        ``''`` where the transition was never observed.
        """
        table = dict(sorted(self.tables[level].items(), key=lambda item: str(item[0])))
        columns = sorted({token for followers in table.values() for token in followers})
        # Build every row first and construct the frame once. DataFrame.append
        # was removed in pandas 2.0, and appending row by row was quadratic.
        rows = [[followers.get(col, 0) for col in columns] for followers in table.values()]
        matrix = pd.DataFrame(
            rows,
            index=pd.Index(list(table), name='id'),
            columns=columns,
            dtype=float,  # matches the float counts shown in the recorded output
        )
        return matrix.replace(0.0, '')

    def f_matrix(self):
        """Return an empty DataFrame with the same columns as ``t_matrix()``.

        NOTE(review): this looks like a placeholder. The removed commented-out
        code suggested dividing each row by its sum; confirm the intended
        result before relying on it.
        """
        return pd.DataFrame(columns=self.t_matrix().columns)

In [40]:
class MarkovNumbersNewLine:
    """Transition-count tables built with ``get_table_numbers_new_line``.

    ``tables[k]`` is the table for contexts of ``k + 1`` tokens.
    NOTE(review): ``get_table_numbers_new_line`` is not defined in the visible
    part of the notebook; confirm it exists before instantiating this class.
    """

    def __init__(self, data, size=1, skipLast=0):
        """Build one table per context length from 1 to ``size``.

        ``skipLast`` is forwarded to ``get_table_numbers_new_line`` unchanged.
        """
        # No ``self.combinations`` attribute is set here: the former
        # ``self.combinations = []`` shadowed the method of the same name, so
        # ``obj.combinations()`` raised "TypeError: 'list' object is not callable".
        text = data.strip()  # strip() already removes trailing newlines
        self.tables = []
        for i in range(size):
            self.tables.append(get_table_numbers_new_line(text, size=i+1, skipLast=skipLast))

    def combinations(self):
        """Return the list of transition tables, ordered by context length."""
        return self.tables

    def predict(self, txt):
        """Return the ``{follower: count}`` dict recorded for context ``txt``.

        The table is selected by the number of whitespace-separated tokens in
        ``txt``. An unknown context yields an empty dict. A context with more
        tokens than there are tables raises ``IndexError``.
        """
        txt = txt.strip()
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        ws = re.split(r'\s+', txt)
        table = self.tables[len(ws)-1]  # select table by number of tokens in txt
        return table.get(txt, {})

In [41]:
def get_table_numbers(txt, size=1, skipLast=0):
    """
    Returns a transition table for the whitespace-separated tokens of txt.

    Each key is ``size`` consecutive tokens joined by single spaces; its value
    maps the token that immediately followed them to the number of times it
    did. When ``skipLast`` is positive, that many trailing tokens are dropped
    before counting; zero or negative values drop nothing.

    >>> get_table_numbers('1 2 1 2')
    {'1': {'2': 2}, '2': {'1': 1}}
    """
    # str.split() with no argument splits on whitespace runs and never yields
    # empty strings, so stray leading or trailing whitespace can neither
    # create an empty context nor count towards skipLast.
    words = txt.split()

    if skipLast > 0:
        words = words[:-skipLast]

    results = {}
    for idx in range(len(words)):
        try:
            out = words[idx + size]
        except IndexError:
            # No token left after this window, so scanning is finished.
            break
        chars = ' '.join(words[idx:idx + size])
        followers = results.setdefault(chars, {})
        followers[out] = followers.get(out, 0) + 1
    return results

In [42]:
def from_file_numbers(fname, size=1, skipLast=0, encoding='utf8'):
    """Read the text file ``fname`` and return a ``MarkovNumbers`` model.

    ``size`` and ``skipLast`` are forwarded to ``MarkovNumbers`` unchanged;
    ``encoding`` is the codec used to decode the file.
    """
    with open(fname, encoding=encoding) as source:
        text = source.read()
    return MarkovNumbers(text, size=size, skipLast=skipLast)

In [43]:
def from_file_numbers_new_line(fname, size=1, skipLast=0, encoding='utf8'):
    """Read the text file ``fname`` and return a ``MarkovNumbersNewLine`` model.

    ``size`` and ``skipLast`` are forwarded to ``MarkovNumbersNewLine``;
    ``encoding`` is the codec used to decode the file.
    """
    with open(fname, encoding=encoding) as fin:
        data = fin.read()
    # skipLast used to be accepted but silently dropped, so the model was
    # always built with the default of 0.
    return MarkovNumbersNewLine(data, size=size, skipLast=skipLast)

In [50]:
# Start from empty tables for context sizes 1 and 2, then merge in two
# sequences. This rebinds `m`, which an earlier cell bound to a Markov instance.
m = MarkovNumbers('', size=2, skipLast=0)
m.add('1 23 3 1 2 1 2', size=2, skipLast=0)
m.add('1 1 2 3 5', size=2, skipLast=0)
# This value is discarded: it is neither assigned nor the cell's last expression.
m.predict('1')
#print(m.t_matrix(0))
# Transition-count matrix for the first table (single-token contexts).
mn = m.t_matrix(0)
mn

Unnamed: 0_level_0,1,2,23,3,5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,3.0,1.0,,
2,1.0,,,1.0,
23,,,,1.0,
3,1.0,,,,1.0


In [48]:
# Follower counts recorded for the single-token context '23'.
m.predict('23')

{'3': 1}