In [1]:
import numpy as np
import pandas as pd

# Path of the ASCII-art text file opened by read_ascii_art().
FILENAME = "./ascii_art.txt"

# NOTE(review): presumably the number of glyphs expected in FILENAME
# (one per letter A-Z) — confirm.
NUM = 26

# Number of rows in each glyph matrix.
HEIGHT = 7

# Number of columns in each glyph matrix.
WIDTH = 10

In [2]:
def read_ascii_art(filename=None, height=None, width=None) -> np.ndarray:
    """Parse a file of fixed-size ASCII-art glyphs into binary matrices.

    The file is read line by line. Each glyph occupies ``height`` consecutive
    lines; the single line that follows a completed glyph is treated as a
    separator and skipped regardless of its content. Within a glyph line,
    every ``"#"`` becomes 1 and every other character stays 0.

    Args:
        filename: Path of the file to read. Defaults to the module-level
            ``FILENAME``.
        height: Number of lines per glyph. Defaults to the module-level
            ``HEIGHT``.
        width: Number of columns per glyph. Defaults to the module-level
            ``WIDTH``.

    Returns:
        Integer array of shape ``(n_glyphs, height, width)``. An input with no
        complete glyph yields shape ``(0, height, width)``.

    Raises:
        IndexError: If a ``"#"`` appears at a column index >= ``width``.
    """
    # Resolve defaults at call time so the notebook constants stay the single
    # source of truth when no explicit arguments are given.
    filename = FILENAME if filename is None else filename
    height = HEIGHT if height is None else height
    width = WIDTH if width is None else width

    matrices = []
    matrix = np.zeros((height, width), dtype=int)
    h_count = 0
    with open(filename, "r") as f:
        for line in f:
            if h_count == height:
                # The current line is the separator after a finished glyph:
                # store the glyph and start a fresh one.
                matrices.append(matrix)
                matrix = np.zeros((height, width), dtype=int)
                h_count = 0
                continue
            for i, char in enumerate(line):
                if char == "#":
                    matrix[h_count][i] = 1
            h_count += 1

    # A file that ends right after the last glyph row (no trailing separator
    # line) would otherwise lose its final glyph.
    if h_count == height:
        matrices.append(matrix)

    if not matrices:
        # Keep a 3-D result so downstream axis-based reductions still work.
        return np.zeros((0, height, width), dtype=int)
    return np.array(matrices)

In [3]:
def feature_extract(matrices: np.ndarray) -> pd.DataFrame:
    """Derive three count features from a stack of binary glyph matrices.

    Args:
        matrices: Array indexed as ``(glyph, row, column)``.

    Returns:
        DataFrame with one row per glyph and columns:
        ``f1`` - sum of the first row plus sum of the last row,
        ``f2`` - sum of row 3 plus sum of row 4,
        ``f3`` - sum of column 3.
    """
    # Collapsing the column axis gives one total per row of each glyph;
    # collapsing the row axis gives one total per column.
    row_totals = np.sum(matrices, axis=2)
    col_totals = np.sum(matrices, axis=1)

    edge_rows = row_totals[:, 0] + row_totals[:, -1]
    middle_rows = row_totals[:, 3] + row_totals[:, 4]
    fourth_column = col_totals[:, 3]

    return pd.DataFrame(
        {
            "f1": edge_rows,
            "f2": middle_rows,
            "f3": fourth_column,
        }
    )

In [4]:
# Parse every glyph from FILENAME into one stacked array and show its shape.
matrices = read_ascii_art()
matrices.shape

(26, 7, 10)

In [5]:
# Total number of set cells in each glyph (rows and columns reduced together).
np.sum(matrices, axis=(1, 2))

array([32, 40, 26, 36, 30, 24, 32, 33, 18, 22, 29, 20, 37, 36, 34, 30, 36,
       36, 30, 20, 31, 27, 40, 27, 20, 26])

In [6]:
# Build the per-glyph feature table. f4 folds f1..f3 into one value
# (their product plus their sum), which is then checked for collisions.
df = feature_extract(matrices).assign(
    f4=lambda t: t["f1"] * t["f2"] * t["f3"] + t["f1"] + t["f2"] + t["f3"],
    # One label per glyph, starting at "A"; NUM replaces the hard-coded 26 so
    # the glyph count lives in a single config constant.
    label=np.array([chr(ord("A") + i) for i in range(NUM)]),
)
# keep=False marks every row whose f4 value occurs more than once.
duplicates = df[df.duplicated(subset=["f4"], keep=False)]

In [7]:
# Display the per-glyph feature table.
df

Unnamed: 0,f1,f2,f3,f4,label
0,7,13,3,296,A
1,16,12,3,607,B
2,12,4,2,114,C
3,16,8,2,282,D
4,16,8,3,411,E
5,10,8,2,180,F
6,12,10,2,264,G
7,8,13,1,126,H
8,8,4,2,78,I
9,8,6,1,63,J


In [8]:
# Display rows whose f4 value is shared with at least one other row.
duplicates

Unnamed: 0,f1,f2,f3,f4,label
