In [None]:
# imports
import os, cv2
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap.umap_ as umap
import hdbscan
import pickle
from sklearn.cluster import DBSCAN
from umap import UMAP
from img2vec_pytorch import Img2Vec
from PIL import Image
import torch
import time
from scipy.spatial import distance_matrix

In [None]:
# functions
# preprocessing of images
def data_preprocess(paths):
    """Turn every image into a flat 0/1 mask followed by its path entry.

    Each image is read in grayscale, resized with size ``(100, 150)``,
    binarised with a Gaussian adaptive threshold and flattened, so a row
    holds 100 * 150 = 15000 mask values with the path as the last element.

    Parameters
    ----------
    paths : iterable of str
        Image paths. Newline characters (e.g. left by ``readlines``) are
        removed before the file is read, but the entry is stored in the
        row exactly as it was given.

    Returns
    -------
    list of list
        One row per image that was processed successfully. Entries that
        raise an error are appended to ``logs_dir.log`` and skipped.
    """
    data = []
    size = (100, 150)
    for img in paths:
        try:
            img_path = img.replace('\n', '')
            # read images in grayscale
            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # resize images to a unified size (in our case 100 X 150)
            image_small = cv2.resize(image, size, interpolation=cv2.INTER_AREA)
            # binarization: the threshold is a gaussian-weighted sum of the
            # neighbourhood values minus the constant C
            image_bin = cv2.adaptiveThreshold(
                image_small, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2)
            # 255 -> 1, everything else -> 0, done in one vectorised step
            # instead of a Python loop over every pixel
            mask = (image_bin == 255).astype(np.uint8).reshape(-1).tolist()
            # add the path entry to the end of the row
            data.append(mask + [img])
        except Exception:
            # Best effort: record the failing entry and carry on.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the run.
            with open('logs_dir.log', 'a', encoding='utf-8') as f:
                f.write(f'{img}\n')
            continue
    return data

# IMG2VEC
# get file paths from dataframe
def clear_paths(df, path_col=15000):
    """Return the image paths stored in ``df`` with newline characters removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per image and the path string in ``path_col``.
    path_col : hashable, default 15000
        Label of the column holding the path. The default matches rows made
        of 100 * 150 = 15000 mask values followed by the path, as built by
        ``data_preprocess``.

    Returns
    -------
    list of str
        The paths in row order, without ``'\\n'`` characters.
    """
    return [raw_path.replace('\n', '') for raw_path in df.loc[:, path_col].tolist()]

# get image embeddings from dataframe
def vec_img(paths):
    """Compute an img2vec embedding for every image path.

    Each image is read in grayscale, resized with size ``(100, 150)``,
    binarised with a Gaussian adaptive threshold, expanded to three
    channels and passed to ``Img2Vec.get_vec``.

    Parameters
    ----------
    paths : iterable of str
        Paths of the images to embed.

    Returns
    -------
    list
        One ``get_vec(..., tensor=True)`` result per path, in input order.
        Empty when ``paths`` is empty (the model is not built in that case).

    Raises
    ------
    OSError
        If an image cannot be read.
    """
    paths = list(paths)
    if not paths:
        # Nothing to embed: do not spend time building the model.
        return []

    # Use the GPU only when one is present instead of failing on
    # CPU-only machines.
    img2vec = Img2Vec(cuda=torch.cuda.is_available())
    size = (100, 150)
    vec_list = []
    for p in paths:
        # read images in grayscale
        image = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise OSError(f'Could not read image: {p}')
        # resize images to a unified size (in our case 100 X 150)
        image_small = cv2.resize(image, size, interpolation=cv2.INTER_AREA)
        # single-channel mask whose pixels are 0 or 255 (maxValue=255)
        image_bin = cv2.adaptiveThreshold(
            image_small, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2)
        # The mask has one channel, so it needs GRAY2RGB (BGR2RGB expects a
        # 3- or 4-channel input).
        img_rgb = cv2.cvtColor(image_bin, cv2.COLOR_GRAY2RGB)
        # The pixels are already 0/255; multiplying the uint8 data by 255
        # would wrap 255 around to 1 and yield an almost black image.
        pil_img = Image.fromarray(img_rgb)
        # img2vec takes a PIL image
        vec_list.append(img2vec.get_vec(pil_img, tensor=True))
    return vec_list

# get arrays from tensors
def tensor_to_array(vec_list):
    """Flatten every tensor in ``vec_list`` into a 1-D numpy array.

    Parameters
    ----------
    vec_list : iterable
        Objects exposing ``.numpy()`` (the tensors returned by ``vec_img``).

    Returns
    -------
    list of numpy.ndarray
        One flattened array per input element, in the same order.
    """
    return [tensor.numpy().reshape(-1) for tensor in vec_list]

# functions for data vis
# get resized images as numpy arrays
def img_reshape(img):
    """Load an image file as an RGB numpy array resized with size ``(200, 300)``.

    Parameters
    ----------
    img : str or file-like
        Anything accepted by ``PIL.Image.open``.

    Returns
    -------
    numpy.ndarray
        Pixel data of the converted and resized image.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied into the converted image.
    with Image.open(img) as source:
        resized = source.convert('RGB').resize((200, 300))
    return np.asarray(resized)

# get numpy arrays from image list
def get_array(random_set):
    """Load every image in ``random_set`` through ``img_reshape``.

    Parameters
    ----------
    random_set : iterable
        Image paths to load.

    Returns
    -------
    list
        The loaded arrays in input order. Paths that raise
        ``FileNotFoundError`` are dropped, so the result can be shorter
        than the input.
    """
    loaded = []
    for image_path in random_set:
        try:
            pixels = img_reshape(image_path)
        except FileNotFoundError:
            # missing file: skip it and keep going
            continue
        loaded.append(pixels)
    return loaded

# function for image visualization from random sample (any 10 images)
def show_sample(img_arr, path_list):
    """Draw up to 10 images on a 2 x 5 grid of axes.

    Parameters
    ----------
    img_arr : sequence
        Images accepted by ``Axes.imshow``; only the first 10 are drawn.
    path_list : sequence
        Labels; entry ``k`` becomes the y-label of the axis showing
        ``img_arr[k]``.

    Returns
    -------
    None
    """
    n_rows, n_cols = 2, 5
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20,10))

    # axes.flat walks the grid row by row, i.e. in the order the images
    # are placed
    for position, axis in enumerate(axes.flat):
        if position >= len(img_arr):
            break
        axis.imshow(img_arr[position])
        axis.set_ylabel(str(path_list[position]), fontsize = 5)

In [None]:
# MAIN
# clustering with UMAP + HDBSCAN
# Read one entry per line of the path list; readlines() keeps the line endings.
with open('your_image_paths.txt') as f:
    paths = f.readlines()
# One row per successfully processed image: flattened binary mask followed by the path entry.
data = data_preprocess(paths)
df = pd.DataFrame(data)
# Paths of the processed images with newline characters removed.
paths_for_embed = clear_paths(df)

# image feature extraction with img2vec and resnet
# The embedding step is timed and the duration (in seconds) is printed.
start_time = time.time()
vec_list = vec_img(paths_for_embed)
end_time = time.time()
elapsed_time = end_time - start_time
print('Elapsed time: ', elapsed_time)

# convert tensors to numpy arrays
arr_list = tensor_to_array(vec_list)

In [None]:
# UMAP with image vectorizing data (img2vec)
# MAIN
# Reduce the flattened image vectors in arr_list to 2 components using the
# correlation metric; random_state is fixed at 42.
vec_embedding = umap.UMAP(
    n_neighbors=5,
    min_dist=0.1,
    metric='correlation',
    n_components = 2,
    random_state=42,
).fit_transform(arr_list)

In [None]:
# MAIN
# Scatter plot of the two UMAP components for all images.
plt.figure(figsize = (10,10))
# No `cmap` here: without per-point values passed through `c`, matplotlib
# ignores the colormap and emits a warning.
plt.scatter(vec_embedding[:, 0], vec_embedding[:, 1])
# plt.savefig("cluster_umap_img2vec.png")  # if enabled, keep it before plt.show()
plt.show()

In [None]:
# clustering of all image data and assignment of classes
# MAIN
# Cluster the 2-component UMAP coordinates; labels_vec holds one label per
# row of vec_embedding (labels below 0 are treated as unclustered when plotting).
labels_vec = hdbscan.HDBSCAN(
    min_samples=50,
    min_cluster_size=50,
).fit_predict(vec_embedding)

In [None]:
# vec_embedding.shape

In [None]:
# save label list of sample images to file 'pickle'
# with open('labels_vec_data.pkl', 'wb') as f:
#     pickle.dump(labels_vec, f)

# save embeddings list of images to file 'pickle'
# with open('vec_embeddings_data.pkl', 'wb') as f:
#     pickle.dump(vec_embedding, f)

In [None]:
# MAIN
# Boolean mask: True for points that received a cluster label (>= 0).
clustered_vec = (labels_vec >= 0)
# Unclustered points in one dark grey. `color=` is used because a
# 3-element `c=` is ambiguous: it is read as values to colour-map whenever
# exactly three points are plotted, and matplotlib warns about it otherwise.
plt.scatter(vec_embedding[~clustered_vec, 0],
            vec_embedding[~clustered_vec, 1],
            color=(0.1, 0.1, 0.1),
            s=0.1,
            alpha=0.8)

# Clustered points coloured by their cluster label.
plt.scatter(vec_embedding[clustered_vec, 0],
            vec_embedding[clustered_vec, 1],
            c = labels_vec[clustered_vec],
            s=0.1,
            cmap='Spectral')
# plt.savefig("cluster_42272_dbscan_img2vec.png")

In [None]:
# count unique values of the cluster labels
# uniq_labels: the distinct labels in labels_vec; counts: how many points carry each label
uniq_labels, counts = np.unique(labels_vec, return_counts=True)

In [None]:
# print(np.asarray((uniq_labels, counts)).T)

In [None]:
# Table of cluster labels ("Values") and their sizes ("Counts").
sort_df = pd.DataFrame({"Values": uniq_labels, "Counts": counts})
# Display the table ordered from the largest to the smallest cluster;
# sort_df itself is left in its original order.
sort_df.sort_values(by = 'Counts', ascending=False)

In [None]:
# data visualisation: proportion of images per cluster (pie chart)
ax = (
    sort_df
    .sort_values(by='Counts', ascending=False)
    .groupby(['Values'])
    .sum()
    .plot(
        kind='pie',
        y='Counts',
        figsize=(20, 20),
        rotatelabels=True,
        title='Количество изображений в каждом кластере',
        cmap='Spectral',
    )
)
# plt.savefig("common_clusters_diagram.png", dpi = 100)
# we chose the biggest cluster with pages containing bibliographic records (number 20 in our sample data)

In [None]:
# final dataframe with cluster labels and the linked file paths
# MAIN
# Row k pairs the k-th path in paths_for_embed with the k-th label in labels_vec.
df_clusters = pd.DataFrame({"Filepath": paths_for_embed, "Cluster": labels_vec})

In [None]:
# look at cluster 20 (the biggest one) with bibliographic records
c_20 = df_clusters.loc[df_clusters['Cluster'] == 20]
# Never ask for more rows than the cluster holds (sample() raises otherwise),
# and fix the seed so that a re-run shows the same pages.
c_20_example = c_20.sample(n=min(10, len(c_20)), random_state=42)
c_20_list = c_20_example["Filepath"].tolist()
c_20_arr = get_array(c_20_list)
c_20_show = show_sample(c_20_arr, c_20_list)
# plt.savefig("cluster_20.jpg", dpi = 100)