In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os,path
from PIL import Image, ImageDraw
from zipfile import ZipFile
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from concurrent.futures import ProcessPoolExecutor
import cv2 as cv
np.random.seed(1337)  # seed NumPy's global RNG, which the np.random.choice calls below draw from

In [None]:
# Input locations for the kuzushiji-recognition dataset, all relative to ../input.
kaggle_input_folder = path.Path("../input")
kuzushiji_folder = kaggle_input_folder/"kuzushiji-recognition"
unicode_translation = kuzushiji_folder/"unicode_translation.csv"
train_images_zip = kuzushiji_folder/"train_images.zip"
train_csv = kuzushiji_folder/"train.csv"
# Scratch folder that the training archive is extracted into further down.
train_images_folder = path.Path("../temp/train")

In [None]:
# Training labels as read from train.csv.
df_train_labels = pd.read_csv(train_csv)
# Lookup from the translation CSV's first column to its second column
# (presumably unicode code -> Japanese character; confirm against the file header).
# Plain (col0, col1) tuples are used because DataFrame.iterrows() yields
# (row index, row Series) pairs, which would key the dict by row number instead.
map_unicode_translation = dict(
    pd.read_csv(unicode_translation).iloc[:, :2].itertuples(index=False, name=None)
)
# Open handle on the training image archive; a later cell extracts from it.
train_zip_interface = ZipFile(train_images_zip)

In [None]:
def get_file_name(x):
    """Return ``x`` formatted as a file name with the ``.jpg`` extension appended."""
    return "{}.jpg".format(x)
def get_label_array(x):
    """Parse a whitespace-separated label string into a DataFrame.

    ``x`` is read as consecutive groups of five tokens: one character code
    followed by four integers.

    Returns a DataFrame with columns ``unicode_char`` (the first token of each
    group) and ``bboxes`` (a list with the four integers of each group).
    A missing (None/NaN) or blank label yields an empty frame with the same
    two columns.

    Raises ValueError when the number of tokens is not a multiple of five or a
    box token is not an integer.
    """
    if pd.isna(x) or not x.strip():
        return pd.DataFrame({'unicode_char':[], 'bboxes':[]})
    # split() without an argument tolerates repeated or surrounding whitespace.
    fields = np.array(x.split()).reshape(-1,5)
    labels = fields[:,0]
    bboxes = fields[:,1:].astype(int)
    return pd.DataFrame.from_dict({'unicode_char':labels.tolist(), 'bboxes':bboxes.tolist()})
def get_japanese_char(x):
    """Look up ``x`` in ``map_unicode_translation``; raises KeyError if it is absent."""
    return map_unicode_translation[x]
def get_image_as_np_array_from_file(file_handler,file_name):
    """Decode the image stored under ``file_name`` and return it as a NumPy array.

    ``file_handler`` must provide ``open(file_name)`` returning a context
    manager that yields a file object PIL can read.
    """
    with file_handler.open(file_name) as stream:
        image = Image.open(stream)
        # Image.open is lazy; force the pixel data to be read while the stream is open.
        image.load()
    pixels = np.asarray(image)
    return pixels

def get_sample_file(file_dir):
    """Return the name of one randomly chosen entry of ``file_dir`` (NumPy global RNG)."""
    candidates = os.listdir(file_dir)
    return np.random.choice(candidates)
def convert_to_gray_scale(image):
    """Collapse the channel axis of an image array into a single float channel.

    For input with a trailing channel axis, the first three channels are
    combined with the weights 0.299, 0.587 and 0.114 (these match the
    ITU-R BT.601 luma coefficients for R, G, B); any further channel, such as
    alpha, is ignored. A 2-D input has no channel axis and is returned as a
    float array with unchanged values.
    """
    image = np.asarray(image)
    if image.ndim == 2:
        # Already single-channel: slicing `[..., :3]` here would cut the width
        # axis rather than a channel axis.
        return image.astype(float)
    return np.dot(image[...,:3], [0.299, 0.587, 0.114])

In [None]:
# Unpack the whole training archive into train_images_folder so images can be read from disk.
train_zip_interface.extractall(train_images_folder)

In [None]:
# Path of one randomly picked entry of the extracted training folder.
sample_file = train_images_folder/get_sample_file(train_images_folder)

In [None]:
class FileHandler:
    """Context manager that opens a file on entry and closes it on exit.

    The ``open`` classmethod lets the class itself be passed wherever an
    object with an ``open(file_name)`` method returning a context manager is
    expected.
    """

    def __init__(self,file_name,mode):
        # Only record the arguments; the file is opened in __enter__.
        self.file_name = file_name
        self.mode=mode

    def __enter__(self):
        """Open the file and return the underlying file object."""
        self.file_obj = open(self.file_name,self.mode)
        return self.file_obj

    def __exit__(self,exc_type, exc_value, exc_traceback):
        """Close the file and report any exception raised by the ``with`` body.

        The file is always closed, including when the body raised. Returns
        True, so an exception from the body is printed and suppressed rather
        than propagated.
        """
        # NOTE(review): suppressing every exception hides failures from callers;
        # kept because existing callers may rely on it.
        try:
            if exc_type is not None:
                print(exc_value)
        finally:
            self.file_obj.close()
        return True

    @classmethod
    def open(cls,file_name,mode='rb'):
        """Build a handler for ``file_name``; binary read mode by default."""
        return cls(file_name,mode)

In [None]:
# Decode the sampled file into a NumPy array, using FileHandler to open it from disk.
image_arr = get_image_as_np_array_from_file(FileHandler,sample_file)


### Get Image Sizes

In [None]:
def get_image_size_dimension(image):
    """Return ``(size, dimension)`` for an image array.

    ``size`` is the first two entries of ``image.shape``; ``dimension`` is the
    last entry of the shape when there are more than two axes, otherwise 1.
    """
    shape = image.shape
    channels = shape[-1] if len(shape) > 2 else 1
    return shape[:2], channels

In [None]:
def get_images_from_dir(dir_folder):
    """Lazily yield every entry of ``dir_folder`` as a decoded NumPy image array.

    Entries are visited in ``os.listdir`` order; ``dir_folder`` must support
    the ``/`` operator for joining an entry name.
    """
    for entry in os.listdir(dir_folder):
        image_path = dir_folder/entry
        yield get_image_as_np_array_from_file(FileHandler,image_path)

In [None]:
# Lazy generator over every extracted training image; nothing is read until it is iterated.
images = get_images_from_dir(train_images_folder)

In [None]:
# Compute (size, dimension) for every image in the main process.
# The per-image work is only a `.shape` lookup, so handing each decoded image to a
# worker process costs more (the full array has to be pickled) than it saves, and
# Executor.map collects its input iterable up front. A plain loop keeps a single
# decoded image alive at a time and produces the same results in the same order.
future = [get_image_size_dimension(image) for image in images]

In [None]:
# Materialise the per-image results as a list of (size, dimension) tuples.
# NOTE(review): this rebinds `future` to a different kind of object; a distinct name would be clearer.
future = list(future)

In [None]:
def get_df_from_shape_dim(array):
    """Build a DataFrame with columns ``width``, ``height`` and ``dimension``.

    ``array`` is an iterable of ``(size, dimension)`` pairs where ``size[0]``
    is the height and ``size[1]`` the width.
    """
    rows = [[size[1], size[0], channels] for size, channels in array]
    return pd.DataFrame(rows,columns=["width","height","dimension"])

In [None]:
# One row per image: width, height and dimension (channel count).
df_shape_dim = get_df_from_shape_dim(future)

In [None]:
# How many images there are for each value of `dimension`.
df_shape_dim["dimension"].value_counts()

### All images are color images: every image has three channels

In [None]:
# Scatter matrix of image widths and heights.
pd.plotting.scatter_matrix(df_shape_dim[["width","height"]],figsize=(15,10))

#### The images come in different sizes, and the width and height histograms are right-skewed

In [None]:
# Summary statistics of image widths and heights.
df_shape_dim[["width","height"]].describe()

### The image sizes appear to be centered around 2000 × 3000, with larger variation in height

In [None]:
def visualize_images(images,gray_scale=False):
    """Show the given images in a two-row grid, filled column by column.

    Parameters
    ----------
    images : iterable
        Image arrays accepted by ``Axes.imshow``; the iterable is consumed once.
    gray_scale : bool, default False
        When True, each image is passed through ``convert_to_gray_scale`` and
        drawn with the ``'gray'`` colormap.

    Nothing is drawn when ``images`` is empty.
    """
    images = list(images)
    n_images = len(images)
    if n_images == 0:
        # A grid with zero columns cannot be created, so there is nothing to show.
        return
    cols = (n_images+1)//2  # ceil(n_images / 2)
    # squeeze=False keeps `axs` two-dimensional even with a single column, so the
    # axs[row,col] indexing below also works for one or two images.
    fig,axs = plt.subplots(2,cols,figsize=(10,20),squeeze=False)
    for i_image, image in enumerate(images):
        col = i_image//2
        row = i_image%2
        if gray_scale:
            axs[row,col].imshow(convert_to_gray_scale(image),cmap='gray')
        else:
            axs[row,col].imshow(image)
    if n_images%2:
        # An odd count leaves the bottom cell of the last column unused; hide its empty frame.
        axs[1,cols-1].axis('off')
    plt.show()

In [None]:
def get_sample_images_from_dir(file_dir,n_samples=6):
    """Lazily yield ``n_samples`` randomly chosen images from ``file_dir`` as NumPy arrays.

    Entries are drawn without replacement, so the same file is not yielded
    twice. Only when ``n_samples`` exceeds the number of entries does the draw
    fall back to sampling with replacement, so ``n_samples`` images are still
    yielded. Uses NumPy's global RNG.
    """
    file_names = os.listdir(file_dir)
    allow_repeats = n_samples > len(file_names)
    chosen = np.random.choice(file_names,n_samples,replace=allow_repeats)
    for name in chosen:
        yield get_image_as_np_array_from_file(FileHandler,file_dir/name)

In [None]:
# Rebind `images` to a fresh generator over randomly sampled training images (default n_samples).
images = get_sample_images_from_dir(train_images_folder)

In [None]:
# Display the sampled images in grayscale.
visualize_images(images,gray_scale=True)

### Since we are focusing only on the characters, convert the images to grayscale