In [19]:
import random
import numpy as np

In [73]:
# 1-st order chain
def get_trans_matrix_1(seq):
    n = 10
    trans_matrix = [[0] * n for _ in range(n)]

    for i in range(0, len(seq) - 1):
        trans_matrix[int(seq[i])][int(seq[i+1])] += 1

    for row in trans_matrix:
        occurrences_sum = sum(row)
        if occurrences_sum > 0:
            row[:] = [el / occurrences_sum for el in row]

    return trans_matrix


# 2-nd order chain
def get_trans_matrix_2(seq):
    n = 10
    trans_matrix = [[0] * n for _ in range(n * n)]

    for i in range(0, len(seq) - 2):
        prev_state = int(seq[i]) * 10 + int(seq[i + 1])
        curr_state = int(seq[i + 2])
        trans_matrix[prev_state][curr_state] += 1

    for row in trans_matrix:
        occurrences_sum = sum(row)
        if occurrences_sum > 0:
            row[:] = [el / occurrences_sum for el in row]
            if round(sum(row)) > 1:
                print("error")

    return trans_matrix


# 3-rd order chain
def get_trans_matrix_3(seq):
    n = 10
    trans_matrix = [[0] * n for _ in range(n * n * n)]

    for i in range(0, len(seq) - 3):
        prev_state = int(seq[i]) * 100 + int(seq[i + 1]) * 10 + int(seq[i + 2])
        curr_state = int(seq[i + 3])
        trans_matrix[prev_state][curr_state] += 1

    for row in trans_matrix:
        occurrences_sum = sum(row)
        if occurrences_sum > 0:
            row[:] = [el / occurrences_sum for el in row]
            if round(sum(row)) > 1:
                print("error")

    return trans_matrix


# 4-th order chain
def get_trans_matrix_4(seq):
    n = 10
    trans_matrix = [[0] * n for _ in range(n * n * n * n)]
    for i in range(0, len(seq) - 4):
        prev_state = int(seq[i]) * 1000 + int(seq[i + 1]) * 100
        prev_state += int(seq[i + 2]) * 10 + int(seq[i + 3])
        curr_state = int(seq[i + 4])
        trans_matrix[prev_state][curr_state] += 1

    for row in trans_matrix:
        occurrences_sum = sum(row)
        if occurrences_sum > 0:
            row[:] = [el / occurrences_sum for el in row]
            if round(sum(row)) > 1:
                print("error")

    return trans_matrix

In [74]:
def transition_matrix(seq, ord):
    """
    Generate transition matrix of given order
    """
    n = 10
    trans_matrix = [[0] * n for _ in range(n ** ord)]
    for i in range(len(seq) - ord):
        prev_state = sum([int(seq[i + _]) * 10 ** (ord - (_ + 1)) for _ in range(ord)])
        cur_state = int(seq[i + ord])
        trans_matrix[prev_state][cur_state] += 1
    
    for row in trans_matrix:
        oc_sum = sum(row)
        if oc_sum > 0:
            row[:] = [el / oc_sum for el in row]
            if (round(sum(row))) > 1:
                print("error")
    return trans_matrix

In [85]:
def weighted_choice(weights):
    totals = []
    running_total = 0

    for w in weights:
        running_total += w
        totals.append(running_total)
    rnd = random.random() * running_total
    for i, total in enumerate(totals):
        if rnd < total:
            return i
    return -1

In [86]:
def markov_chain_1_predict(seq):
    size = len(seq)
    trans_matrix = transition_matrix(seq, 1)
    state = int(seq[-2])
    return weighted_choice(trans_matrix[state])


def markov_chain_2_predict(seq):
    size = len(seq)
    if size <= 2:
        return markov_chain_1_predict(seq)
    trans_matrix = transition_matrix(seq, 2)
    state = int(seq[-3]) * 10 + int(seq[-2])
    return weighted_choice(trans_matrix[state])


def markov_chain_3_predict(seq):
    size = len(seq)
    if size <= 2:
        return markov_chain_1_predict(seq)
    elif size <= 3:
        return markov_chain_2_predict(seq)
    trans_matrix = transition_matrix(seq, 3)
    state = int(seq[-4]) * 100 + int(seq[-3]) * 10 + int(seq[-2])
    return weighted_choice(trans_matrix[state])


def markov_chain_4_predict(seq):
    size = len(seq)
    if size <= 2:
        return markov_chain_1_predict(seq)
    elif size <= 3:
        return markov_chain_2_predict(seq)
    elif size <= 4:
        return markov_chain_3_predict(seq)
    trans_matrix = transition_matrix(seq, 4)
    state = int(seq[-5]) * 1000 + int(seq[-4]) * 100 + int(seq[-3]) * 10 + int(seq[-2])
    return weighted_choice(trans_matrix[state])

In [120]:
def mchain_pred(seq, ord):
    size = len(seq)
    my_ord = 4
    if size <= 2:
        my_ord = 1
    elif size <= 3:
        my_ord = 2
    elif size <= 4:
        my_ord = 3
    ord = min(ord, my_ord)
    trans_matrix = transition_matrix(seq, ord)
    print("ord:", ord, len(trans_matrix))
    print(trans_matrix)
    state = sum([int(seq[-(ord - (_))]) * 10 ** (ord - (_ + 1)) for _ in range(ord)])
    print("state:", state)
    return weighted_choice(trans_matrix[state])

In [128]:
mchain_pred([1, 2, 1, 2, 3, 2, 5], 1)

ord: 1 10
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.3333333333333333, 0.0, 0.3333333333333333, 0.0, 0.3333333333333333, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
state: 5


-1

In [None]:
import csv

def get_one_digit_sequences(path):
    one_digit_sequences = []
    with open(path, mode='r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            is_one_digit = True
            tmp_seq = row[1].split(',')
            for el in tmp_seq:
                if len(el) > 1:
                    is_one_digit = False
                    break
            if is_one_digit:
                one_digit_sequences.append(''.join(row[1].split(',')))
    return one_digit_sequences

path = "../data/train.csv"
one_digit_sequences = get_one_digit_sequences(path)
print(len(one_digit_sequences))
    # print(one_digit_sequences[2])
    # print("Predicted digit: ", markov_chain_predict(one_digit_sequences[2]))

    # markov_chain_1_predict(one_digit_sequences[0])

num_of_predicted = 0
for seq in one_digit_sequences:
    if mchain_pred(seq, 1) == int(seq[len(seq) - 1]):
        num_of_predicted += 1

print("Number of correctly predicted digits (1-st order): ", num_of_predicted)
print(f"Algo 1-st order accuracy = {num_of_predicted / len(one_digit_sequences) * 100:.3f} %")

counter = 0
num_of_predicted = 0
for seq in one_digit_sequences:
    if mchain_pred(seq, 2) == int(seq[len(seq) - 1]):
        num_of_predicted += 1
    elif markov_chain_2_predict(seq) == -1:
        print(counter)
    counter += 1

print("Number of correctly predicted digits (2-nd order): ", num_of_predicted)
print(f"Algo 2-nd order accuracy = {num_of_predicted / len(one_digit_sequences) * 100:.3f} %")

num_of_predicted = 0
for seq in one_digit_sequences:
    if mchain_pred(seq, 3) == int(seq[len(seq) - 1]):
        num_of_predicted += 1

print("Number of correctly predicted digits (3-rd order): ", num_of_predicted)
print(f"Algo 3-rd order accuracy = {num_of_predicted / len(one_digit_sequences) * 100:.3f} %")

    
num_of_predicted = 0
for seq in one_digit_sequences:
    if mchain_pred(seq, 4) == int(seq[len(seq) - 1]):
        num_of_predicted += 1

print("Number of correctly predicted digits (4-th order): ", num_of_predicted)
print(f"Algo 4-rd order accuracy = {num_of_predicted / len(one_digit_sequences) * 100:.3f} %")
    

5973
Number of correctly predicted digits (1-st order):  2339
Algo 1-st order accuracy = 39.160 %
Number of correctly predicted digits (2-nd order):  4143
Algo 2-nd order accuracy = 69.362 %
Number of correctly predicted digits (3-rd order):  5128
Algo 3-rd order accuracy = 85.853 %


In [153]:
import pandas as pd
import sys
sys.path.append('..')
from preproc.filters import single_digit, non_empty
from models.mchain import MarkovChain
from tools import seq_to_num, acc_score

In [154]:
df_train = pd.read_csv("../data/train.csv", index_col=0)

In [155]:
X = seq_to_num(df_train.Sequence, target_split=False, pad=False)

In [156]:
Xsingle = X[X.map(single_digit) & X.map(lambda seq: len(seq) > 1)]

In [157]:
Xs, y = Xsingle.map(lambda seq: seq[:-1]), Xsingle.map(lambda seq: seq[-1])

In [165]:
mc = MarkovChain(n_prev=4, verbose=False)

In [169]:
_, ind, pred = mc.predict(Xs[:2000])

In [170]:
acc_score(y[ind], pred)

0.7428571428571429

In [37]:
acc_score(Xs.map(lambda seq: seq[-1]), y_pred)

0.8687426753725096

In [45]:
[(-_, 10 ** (_ - 1)) for _ in range(4, 0, -1)]

[(-4, 1000), (-3, 100), (-2, 10), (-1, 1)]