<h1 align='center' style='border: 1px dotted blue; color: red'>Keep Babies Safe - Clustering</h1>

In [None]:
import sklearn

import os
import re
import sys

import matplotlib.pyplot as plt
import cv2
import pytesseract
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
from torchvision import models

import seaborn as sb

from sklearn.cluster import AgglomerativeClustering

<h2 align='center' style='border: 1px dotted blue; color: green'>Safety Dataset</h2>

In [None]:
class SafetyDataset:
    """Collect name, OCR text and CNN features for every image in a directory.

    Constructing an instance immediately walks ``path``: each image is read
    with OpenCV, passed through Tesseract OCR (on the original-size image)
    and through a pretrained ResNet-50 (on a resized, normalised copy).
    The three results are stored in parallel lists on ``self.data``.
    """

    # Per-channel statistics (RGB order, inputs scaled to [0, 1]) that the
    # torchvision documentation specifies for its ImageNet-pretrained models.
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)

    class Data:
        """Parallel lists; index ``i`` of each list describes the same image."""

        def __init__(self):
            self.photo_name = []     # file names as returned by os.listdir
            self.flatten_photo = []  # 1-D feature vector per image
            self.text = []           # raw OCR output per image

    def __init__(self, path):
        """Load the model and process every image found in ``path``.

        Args:
            path: Directory containing the image files.
        """
        self.data = self.Data()
        self.path = path
        # NOTE(review): Tesseract spells its engine-mode flag `--oem N`; the
        # space after `--` looks like a typo. Left byte-identical because
        # correcting it would change which OCR engine is selected — confirm
        # the intended mode before fixing.
        self.conf = r'-- oem 2'

        # use resnet50 model for prediction
        self.model = models.resnet50(pretrained=True)
        # Inference mode: BatchNorm must use its stored running statistics
        # instead of the statistics of the single image being processed.
        self.model.eval()

        # store the images
        self.store_images()

    @staticmethod
    def _to_model_input(rgb_img):
        """Turn an H x W x 3 RGB uint8 image into a normalised 1 x 3 x H x W float tensor."""
        chw = torch.from_numpy(np.ascontiguousarray(rgb_img)).permute(2, 0, 1).float() / 255.0
        mean = torch.tensor(SafetyDataset.IMAGENET_MEAN).view(3, 1, 1)
        std = torch.tensor(SafetyDataset.IMAGENET_STD).view(3, 1, 1)
        return ((chw - mean) / std).unsqueeze(0)

    def load_image(self, path, h, w):
        """Read an image and prepare a model-ready tensor from it.

        Args:
            path: File path of the image.
            h: Target height in pixels for the model input.
            w: Target width in pixels for the model input.

        Returns:
            Tuple ``(orig_img, tensor)``: the untouched BGR image as read by
            OpenCV, and a float tensor of shape ``(1, 3, h, w)``.

        Raises:
            OSError: If OpenCV cannot read the file as an image.
        """
        orig_img = cv2.imread(path)
        if orig_img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError(f'Could not read image file: {path}')

        # cv2.resize expects dsize as (width, height).
        resized = cv2.resize(orig_img, (w, h))
        # OpenCV decodes to BGR; the pretrained weights expect RGB.
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        return orig_img, self._to_model_input(rgb)

    def store_images(self):
        """Process every file in ``self.path`` and append its results to ``self.data``."""
        # Sorted so the row order is reproducible; os.listdir order is arbitrary.
        for mem in tqdm(sorted(os.listdir(self.path))):
            image_path = os.path.join(self.path, mem)
            if not os.path.isfile(image_path):
                # Sub-directories cannot be decoded as images.
                continue

            orig_img, tensor = self.load_image(image_path, 224, 224)
            features = self.predict_image(tensor)

            # Append only after loading and prediction succeeded, so the three
            # lists always stay the same length.
            self.append_text(orig_img)
            self.append_name(mem)
            self.append_tensor(features)

    def predict_image(self, tensor):
        """Run ``self.model`` on ``tensor`` and return its output as a NumPy array.

        The model is forced into evaluation mode and gradients are disabled,
        so the result is deterministic and no autograd graph is retained.
        """
        if self.model.training:
            self.model.eval()
        with torch.no_grad():
            features = self.model(tensor)
        # np.array copies, so the result never aliases tensor storage.
        return np.array(features.detach().numpy())

    def append_name(self, mem):
        """Record the file name of the current image."""
        self.data.photo_name.append(mem)

    def append_text(self, img):
        """Run Tesseract OCR on ``img`` with ``self.conf`` and record the text."""
        self.data.text.append(pytesseract.image_to_string(img, config=self.conf))

    def append_tensor(self, features):
        """Record ``features`` flattened to one dimension."""
        self.data.flatten_photo.append(features.flatten())

    def get_flatten_photos(self):
        """Return all stored feature vectors as a single float64 NumPy array."""
        return np.array(self.data.flatten_photo, dtype='float64')

In [None]:
# Relative location of the folder that holds the images.
path = '../input/keep-babies-safe/dataset/images'

# Constructing the dataset processes every image in that folder
# (read, OCR, feature extraction).
ds = SafetyDataset(path=path)

# One float64 feature row per processed image.
features = ds.get_flatten_photos()

<h2 align='center' style='border: 1px dotted blue; color: green'>Agglomerative Clustering</h2>

In [None]:
class Clustering:
    """Cluster image feature vectors and assemble a per-image result table."""

    class Data:
        """Holds the result table built by ``Clustering.prep_data``."""

        def __init__(self):
            self.df = None  # pandas DataFrame once set_data has been called

        def set_data(self, names, classes, brands):
            """Build the result DataFrame from three equally long sequences.

            Args:
                names: Image file names -> column ``Image``.
                classes: Cluster class names -> column ``Class_of_image``.
                brands: Brand strings -> column ``Brand_name``.
            """
            df = {
                'Image' : names,
                'Class_of_image' : classes,
                'Brand_name' : brands
            }
            self.df = pd.DataFrame(df)

    def __init__(self, n_clusters):
        """Create the estimator.

        Args:
            n_clusters: Number of clusters passed to AgglomerativeClustering.
        """
        self.data = self.Data()
        # The attribute is called `kmeans` but holds an agglomerative model;
        # the name is kept so existing attribute access keeps working.
        self.kmeans = AgglomerativeClustering(n_clusters=n_clusters)
        # NOTE(review): cluster indices are assigned by the estimator, so
        # which index corresponds to which real category is not guaranteed —
        # verify this mapping against the data.
        self.idx_to_classes = { 0: 'toys', 1: 'consumer_products'}

    def fit(self, features):
        """Fit the clustering model on ``features``."""
        self.kmeans.fit(features)

    def prep_data(self, photo_names, texts):
        """Turn cluster labels and OCR texts into the result table.

        Must be called after ``fit``. Labels without an entry in
        ``idx_to_classes`` (possible when more than two clusters are
        requested) are named ``cluster_<label>`` instead of raising KeyError.

        Args:
            photo_names: Image file names, one per fitted sample.
            texts: OCR strings, one per fitted sample.
        """
        classes = np.array([
            self.idx_to_classes.get(label, f'cluster_{label}')
            for label in self.kmeans.labels_
        ])

        # Collapse every run of two or more whitespace characters to one space.
        subs = [re.sub(r"\s\s+", " ", text) for text in texts]
        # Whitespace-only OCR output (e.g. a lone newline or form feed)
        # carries no brand information and is treated like an empty string.
        brands = ['Unamed' if not sub.strip() else sub for sub in subs]

        self.data.set_data(photo_names, classes, brands)

    def get_data(self):
        """Return the result DataFrame, or None if ``prep_data`` has not run."""
        return self.data.df

In [None]:
# Two clusters, matching the two entries of Clustering.idx_to_classes.
cluster = Clustering(n_clusters=2)
cluster.fit(features=features)

# Combine the fitted labels with the stored file names and OCR texts.
cluster.prep_data(photo_names=ds.data.photo_name, texts=ds.data.text)
cluster_df = cluster.get_data()

<h2 align='center' style='border: 1px dotted blue; color: green'>Draw Cluster</h2>

In [None]:
def draw(features, cluster_df):
    """Scatter-plot the first two feature columns, coloured by cluster class.

    Args:
        features: 2-D array-like with at least two columns; columns 0 and 1
            become the x and y coordinates.
        cluster_df: DataFrame with a ``Class_of_image`` column holding one
            entry per row of ``features``.
    """
    # NOTE(review): no dimensionality reduction happens here — only columns
    # 0 and 1 of `features` are plotted; all other columns are ignored.
    plot_frame = (
        pd.DataFrame(features)
        .rename(columns={0: 'V1', 1: 'V2'})
        .assign(Category=list(cluster_df['Class_of_image']))
    )

    _, ax = plt.subplots(figsize=(10, 5))
    sb.scatterplot(data=plot_frame, x='V1', y='V2', hue='Category', ax=ax)
    ax.grid(True)
    plt.show()

In [None]:
# Visualise the clustering result on the first two feature columns.
draw(features=features, cluster_df=cluster_df)