## Wzorce 2d

Biblioteki:

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
from time import perf_counter
from random import randint

1. Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego



In [2]:
def get_diff_columns(pattern):
    columns = []
    indexes = []
    alphabet = set()
    for i in range(len(pattern[0])):
        col = []
        for j in range(len(pattern)):
            col.append(pattern[j][i])
            alphabet.add(pattern[j][i])

        if col in columns:
            idx = columns.index(col)
            indexes.append(idx)
        else:
            columns.append(col)
            indexes.append(len(columns) - 1)
    return columns, indexes, alphabet

In [3]:
def vertical_automaton(columns, letters):
    tt = [{}]
    words = [[]]
    states = [0] * len(columns)

    for i in range(len(columns[0])):
        for j in range(len(columns)):
            if columns[j][i] in tt[states[j]]:
                states[j] = tt[states[j]][columns[j][i]]
            else:
                tt[states[j]][columns[j][i]] = len(tt)
                words.append(words[states[j]] + [columns[j][i]])
                states[j] = len(tt)
                tt.append({})

    for i in range(len(tt)):
        for l in letters:
            if l not in tt[i]:
                suffix = (words[i] + [l])[1:]
                state = 0
                for s in suffix:
                    if s in tt[state]:
                        state = tt[state][s]
                    else:
                        state = 0

                tt[i][l] = state
    return tt, states

In [4]:
def horizontal_automaton(pattern, letters):
    result = []
    for state in range(len(pattern) + 1):
        result.append({})
        for l in letters:
            next_state = min(len(pattern), state + 1)
            while True:
                if pattern[:next_state] == (pattern[:state] + [l])[state - next_state + 1:state + 1]:
                    break
                next_state -= 1
            result[state][l] = next_state
    return result

In [5]:
def main_automaton(pattern):
    columns, indexes, letters = get_diff_columns(pattern)
    vertical_tt, vertical_states = vertical_automaton(columns, letters)

    new_pattern = [vertical_states[indexes[i]] for i in range(len(indexes))]
    horizontal_tt = horizontal_automaton(new_pattern, vertical_states)
    horizontal_state = len(horizontal_tt) - 1
    return vertical_tt, horizontal_tt, horizontal_state

In [6]:
def pattern_matching_2d(text, pattern, automaton=None):
    if automaton is None:
        vertical_tt, horizontal_tt, horizontal_state = main_automaton(pattern)
    else:
        vertical_tt, horizontal_tt, horizontal_state = automaton

    result = []
    vertical_states = []
    for i in range(len(text)):
        if len(text[i]) < len(vertical_states):
            vertical_states = vertical_states[:len(text[i])]
        elif len(vertical_states) < len(text[i]):
            vertical_states = vertical_states + [0] * (len(text[i]) - len(vertical_states))

        new_horizontal_state = 0
        for j in range(len(text[i])):
            if text[i][j] in vertical_tt[vertical_states[j]]:
                vertical_states[j] = vertical_tt[vertical_states[j]][text[i][j]]
            else:
                vertical_states[j] = 0
            if vertical_states[j] in horizontal_tt[new_horizontal_state]:
                new_horizontal_state = horizontal_tt[new_horizontal_state][vertical_states[j]]
                if new_horizontal_state == horizontal_state:
                    result.append((i - len(pattern) + 1, j - len(pattern[0]) + 1))
            else:
                new_horizontal_state = 0
    return result

2. Znajdź w załączonym pliku "haystack.txt" wszyskie sytuacje, gdy taka sama litera występuje na tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę, na nierówną długość linii w pliku.


In [7]:
with open("haystack.txt") as f:
    text = f.readlines()

In [8]:
for i in range(ord("a"), ord("z") + 1):
    pattern = [chr(i), chr(i)]
    result = pattern_matching_2d(text, pattern)
    print(f"Given pattern: {pattern}")
    print(f"All indexes:\n{result}")
    print("----------------------------")

Given pattern: ['a', 'a']
All indexes:
[(0, 82), (3, 30), (5, 60), (6, 63), (20, 6), (28, 69), (31, 50), (31, 73), (33, 66), (37, 4), (52, 12), (53, 12), (53, 48), (56, 11), (57, 36), (58, 36), (59, 24), (64, 2), (64, 14), (64, 22), (65, 35), (69, 35), (76, 21), (76, 74), (77, 42), (77, 61), (78, 59), (79, 37)]
----------------------------
Given pattern: ['b', 'b']
All indexes:
[]
----------------------------
Given pattern: ['c', 'c']
All indexes:
[(3, 54), (10, 45), (13, 10), (41, 0), (68, 0), (82, 41)]
----------------------------
Given pattern: ['d', 'd']
All indexes:
[(37, 19)]
----------------------------
Given pattern: ['e', 'e']
All indexes:
[(0, 63), (1, 8), (4, 77), (7, 65), (10, 1), (10, 64), (14, 2), (15, 43), (17, 6), (18, 27), (20, 10), (21, 61), (22, 53), (24, 3), (24, 65), (28, 67), (28, 73), (29, 38), (29, 43), (37, 48), (40, 11), (40, 26), (41, 57), (42, 36), (42, 48), (46, 52), (47, 50), (51, 31), (57, 54), (58, 50), (58, 54), (59, 73), (63, 66), (65, 69), (66, 72), (

3. Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji.



In [9]:
pattern_matching_2d(text, ["th", "th"])

[]

In [10]:
pattern_matching_2d(text, ["t h", "t h"])

[(37, 0)]

4. Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku "haystack.png"


In [11]:
def convert_image(file_name):
    image = Image.open(file_name)
    pixels = list(image.getdata())
    width, height = image.size
    text = []
    i = width
    for pixel in pixels:
        if i == width:
            i = 0
            text.append([])
        text[-1].append(pixel[0])
        i += 1
    return text

In [12]:
text = convert_image("haystack.png")

Litera: c


In [13]:
c = convert_image("patterns/c.png")
c

[[255, 248, 135, 45, 8, 24, 86, 214],
 [249, 61, 0, 0, 0, 0, 0, 55],
 [136, 0, 40, 180, 236, 233, 171, 98],
 [46, 8, 222, 255, 255, 255, 255, 255],
 [9, 54, 255, 255, 255, 255, 255, 255],
 [8, 54, 255, 255, 255, 255, 255, 255],
 [46, 8, 223, 255, 255, 255, 255, 255],
 [136, 0, 42, 182, 237, 233, 171, 98],
 [248, 58, 0, 0, 0, 0, 0, 55],
 [255, 247, 130, 41, 7, 26, 87, 214]]

In [14]:
c_matched = pattern_matching_2d(text, c)
print(f"Liczba wystąpień c to: {len(c_matched)}.")

Liczba wystąpień c to: 213.


In [15]:
f = convert_image("patterns/f.png")
f


[[255, 255, 235, 92, 24, 0, 79],
 [255, 255, 92, 0, 0, 0, 79],
 [255, 255, 17, 49, 233, 255, 255],
 [255, 247, 0, 102, 255, 255, 255],
 [103, 0, 0, 0, 0, 0, 191],
 [103, 0, 0, 0, 0, 0, 191],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255],
 [255, 243, 0, 103, 255, 255, 255]]

In [16]:
f_matched = pattern_matching_2d(text, f)
print(f"Liczba wystąpień f to: {len(f_matched)}.")

Liczba wystąpień f to: 88.


In [17]:
n = convert_image("patterns/n.png")
n


[[163, 0, 187, 162, 40, 6, 39, 166, 255],
 [163, 0, 92, 0, 0, 0, 0, 1, 192],
 [163, 0, 5, 145, 232, 239, 132, 0, 89],
 [163, 0, 118, 255, 255, 255, 255, 34, 44],
 [163, 0, 179, 255, 255, 255, 255, 66, 32],
 [163, 0, 187, 255, 255, 255, 255, 67, 31],
 [163, 0, 187, 255, 255, 255, 255, 67, 31],
 [163, 0, 187, 255, 255, 255, 255, 67, 31],
 [163, 0, 187, 255, 255, 255, 255, 67, 31],
 [163, 0, 187, 255, 255, 255, 255, 67, 31]]

In [18]:
n_matched = pattern_matching_2d(text, n)
print(f"Liczba wystąpień n to: {len(n_matched)}.")

Liczba wystąpień n to: 526.


In [19]:
t = convert_image("patterns/t.png")
t

[[255, 171, 0, 179, 255, 255, 255],
 [255, 171, 0, 179, 255, 255, 255],
 [255, 171, 0, 179, 255, 255, 255],
 [123, 0, 0, 0, 0, 0, 95],
 [123, 0, 0, 0, 0, 0, 95],
 [255, 171, 0, 179, 255, 255, 255],
 [255, 171, 0, 179, 255, 255, 255],
 [255, 171, 0, 179, 255, 255, 255],
 [255, 171, 0, 179, 255, 255, 255],
 [255, 173, 0, 176, 255, 255, 255],
 [255, 190, 0, 108, 245, 255, 255],
 [255, 241, 9, 0, 0, 0, 95],
 [255, 255, 176, 46, 6, 0, 95]]

In [20]:
t_matched = pattern_matching_2d(text, t)
print(f"Liczba wystąpień t to: {len(t_matched)}.")

Liczba wystąpień t to: 450.


5. Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png.

In [21]:
pattern = convert_image("patterns/pattern.png")
pattern_matched = pattern_matching_2d(text, pattern)
print(f"Liczba wystąpień pattern to: {len(pattern_matched)}.")

Liczba wystąpień pattern to: 5.


6. Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca

In [22]:
def building_times(text_size):
    building_times = []
    for i in text_size:
        pattern = [[chr(randint(ord('a'), ord('z'))) for _ in range(i)] for _ in range(i)]
        start = perf_counter()
        main_automaton(pattern)
        end = perf_counter()
        building_times += [i, end - start]
    df = pd.DataFrame(data={"size": building_times[::2],
                            "time [s]": building_times[1::2]})
    return df

In [23]:
text_size = [i for i in range(10, 180, 10)]
df_1 = building_times(text_size)
df_1

Unnamed: 0,size,time [s]
0,10,0.001552
1,20,0.00942
2,30,0.028553
3,40,0.064087
4,50,0.122806
5,60,0.208673
6,70,0.370228
7,80,0.52679
8,90,0.784194
9,100,1.093919


In [24]:
def searching_times(text, text_size, path_size):
    pattern = [line[:path_size] for line in text[:path_size]]
    automaton = main_automaton(pattern)
    searching_times = []

    for i in text_size:
        text = [line[:i] for line in text[:i]]
        start = perf_counter()
        pattern_matching_2d(text, pattern, automaton)
        end = perf_counter()
        searching_times += [i, end - start]
    df = pd.DataFrame(data={"size": searching_times[::2],
                            "time [s]": searching_times[1::2]})
    return df

In [25]:
text = convert_image("haystack.png")
text_size = [i for i in range(500, 10001, 500)]
path_size = 25
df_2 = searching_times(text, text_size, path_size)
df_2



Unnamed: 0,size,time [s]
0,500,0.041125
1,1000,0.033349
2,1500,0.031807
3,2000,0.03187
4,2500,0.03279
5,3000,0.03292
6,3500,0.033341
7,4000,0.03303
8,4500,0.032938
9,5000,0.03381


7. Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania


In [26]:
def divide_and_measure(text, path_size):
    pattern = [line[:path_size] for line in text[:path_size]]
    result = []

    for div in [2, 4, 8]:
        length = len(text) // div
        intervals = [text[i * length:(i + 1) * length] for i in range(div)]
        start = perf_counter()
        for i in intervals:
            pattern_matching_2d(i, pattern)
        end = perf_counter()
        result += [div, end - start]
    df = pd.DataFrame(data={"fragments": result[::2],
                            "time [s]": result[1::2]})
    return df

In [27]:
path_size = 25
df_3 = divide_and_measure(text, path_size)
df_3


Unnamed: 0,fragments,time [s]
0,2,0.260176
1,4,0.235473
2,8,0.229677
