In [29]:
# import libraries
import os 
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# Root folder for all data used by this notebook. The path is relative, so it
# is resolved against the working directory the kernel was started in.
data_dir = '../data'

In [31]:
# The two sub-folders of data_dir: 'raw' receives the downloaded dataset,
# 'processed' receives the files this notebook generates.
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ('raw', 'processed')
)

In [32]:
# Create both data folders up front; exist_ok=True makes this cell safe to re-run.
for _folder in (raw_data_dir, processed_data_dir):
    os.makedirs(_folder, exist_ok=True)

In [33]:
# Location of the local checkout of the wichtounet/sudoku_dataset repository.
sudoku_dataset_dir = os.path.join(raw_data_dir, 'sudoku_dataset')
# One-time setup: uncomment and run the line below to download the dataset.
# The clone target is hard-coded and matches sudoku_dataset_dir only while
# data_dir is '../data'.
# ! git clone https://github.com/wichtounet/sudoku_dataset.git ../data/raw/sudoku_dataset

In [34]:
# Output folder for the row images written by create_ocr_dataset.
sudoku_ocr_dataset_dir = os.path.join(processed_data_dir, 'sudoku_ocr_dataset')
# exist_ok=True keeps this cell from failing when the folder already exists.
os.makedirs(sudoku_ocr_dataset_dir, exist_ok=True)

In [35]:
# Show the top-level entries of the dataset checkout (fails if it has not been cloned yet).
os.listdir(sudoku_dataset_dir)

['.git',
 '.gitignore',
 'baptiste.sh',
 'datasets',
 'images',
 'jean.sh',
 'mixed',
 'mixed_incomplete',
 'mixed_natural',
 'original',
 'outlines_sorted.csv',
 'README.rst',
 'tools',
 'wip']

In [36]:
# Folder inside the checkout from which create_ocr_dataset reads each photo
# and the '.dat' file that shares its base name.
image_dir = os.path.join(sudoku_dataset_dir, 'images')

In [37]:
# Load the grid outline annotations: one row per photo with its relative file
# path and the x/y pixel coordinates of the four outline corners p1..p4.
# The CSV's first column has no header, so pandas names it 'Unnamed: 0'.
annotation_path = os.path.join(sudoku_dataset_dir, 'outlines_sorted.csv')
annotation_df = pd.read_csv(annotation_path)
annotation_df.head()

Unnamed: 0,filepath,p1_x,p1_y,p2_x,p2_y,p3_x,p3_y,p4_x,p4_y
0,./images/image32.jpg,112,35,583,35,600,435,105,444
1,./images/image1082.jpg,101,270,885,272,872,1060,105,1053
2,./images/image125.jpg,13,11,409,0,423,415,10,427
3,./images/image50.jpg,41,10,552,4,568,443,30,446
4,./images/image188.jpg,14,10,534,9,538,447,20,457


In [38]:
# Rotate codes for the photos that need turning, keyed by image file name
# without extension. create_ocr_dataset hands the value to cv2.rotate; in
# OpenCV's RotateFlags enum 1 is ROTATE_180 and 2 is ROTATE_90_COUNTERCLOCKWISE.
# Images that are not listed here are left unrotated.
orientation_dict = {
    "image1083": 1,
    **dict.fromkeys(
        (
            "image1024",
            "image1031",
            "image1036",
            "image1037",
            "image1039",
            "image1040",
        ),
        2,
    ),
}

In [40]:
def create_ocr_dataset():
    """Cut every annotated sudoku photo into nine labelled row images.

    For each row of the module-level ``annotation_df`` the function:

    1. reads the photo named by ``filepath`` from ``image_dir`` and the
       ``.dat`` file with the same base name,
    2. crops the photo to the axis-aligned bounding box of the four outline
       corners ``p1`` .. ``p4``,
    3. rotates the crop when ``orientation_dict`` has an entry for the image
       (the value is passed to ``cv2.rotate`` as the rotate code),
    4. slices the crop into nine horizontal strips of equal height and writes
       each strip to ``sudoku_ocr_dataset_dir`` as ``<digits>_<ann_idx>.jpg``,
       where ``<digits>`` is the matching line of the ``.dat`` file with its
       spaces removed and ``<ann_idx>`` is the annotation's index label.

    Annotations that cannot be processed (unreadable image, missing ``.dat``
    file, fewer than nine label lines, or an empty crop) are reported with
    ``print`` and skipped so that one bad entry does not abort the whole run.

    Returns:
        None. The only result is the set of JPEG files written to disk.
    """
    x_cols = ('p1_x', 'p2_x', 'p3_x', 'p4_x')
    y_cols = ('p1_y', 'p2_y', 'p3_y', 'p4_y')
    n_rows = 9  # a sudoku grid has nine rows

    for ann_idx, row in annotation_df.iterrows():
        base_name = os.path.basename(row['filepath'])
        stem = os.path.splitext(base_name)[0]
        image_path = os.path.join(image_dir, base_name)
        label_path = os.path.join(image_dir, f'{stem}.dat')

        # cv2.imread does not raise on a missing or undecodable file; it
        # returns None, which would otherwise fail later with a TypeError.
        image = cv2.imread(image_path)
        if image is None:
            print(f'skipping {image_path}: image could not be read')
            continue
        if not os.path.isfile(label_path):
            print(f'skipping {image_path}: label file {label_path} not found')
            continue

        # The first two lines of the .dat file are skipped; every following
        # line becomes one string. dtype=str keeps a line without spaces from
        # being parsed as a number, which has no .replace method.
        labels = pd.read_csv(
            label_path, delimiter='\t', skiprows=2, names=['data'], dtype=str
        )['data']
        if len(labels) < n_rows:
            print(f'skipping {image_path}: expected {n_rows} label lines, found {len(labels)}')
            continue

        # Bounding box of the outline. Clamp at 0 because a negative slice
        # bound would count from the far edge and silently crop the wrong area.
        x_min = max(0, int(min(row[col] for col in x_cols)))
        x_max = max(0, int(max(row[col] for col in x_cols)))
        y_min = max(0, int(min(row[col] for col in y_cols)))
        y_max = max(0, int(max(row[col] for col in y_cols)))

        grid = image[y_min:y_max, x_min:x_max]
        if grid.size == 0:
            print(f'skipping {image_path}: cropped region is empty')
            continue

        rotate_code = orientation_dict.get(stem)
        if rotate_code is not None:
            grid = cv2.rotate(grid, rotate_code)

        # Integer division: any leftover pixel rows at the bottom are dropped.
        strip_height = grid.shape[0] // n_rows
        if strip_height == 0:
            print(f'skipping {image_path}: cropped region is shorter than {n_rows} pixels')
            continue

        # NOTE(review): the file name depends only on the digits and ann_idx,
        # so two strips of one image with identical digit strings would
        # overwrite each other. Kept as is to preserve the existing naming.
        for idx in range(n_rows):
            digits = labels.iloc[idx].replace(' ', '')
            strip_path = os.path.join(sudoku_ocr_dataset_dir, f'{digits}_{ann_idx}.jpg')
            strip = grid[idx * strip_height:(idx + 1) * strip_height, :]
            cv2.imwrite(strip_path, strip)
        

In [11]:
def get_width_height_stat(df=None):
    """Add outline bounding-box sizes to the annotations and print their means.

    For every row the width is the largest minus the smallest of the four
    corner x coordinates, and the height is the same for the y coordinates.

    Args:
        df: DataFrame holding the corner columns ``p1_x`` .. ``p4_x`` and
            ``p1_y`` .. ``p4_y``. Defaults to the module-level
            ``annotation_df``. The frame is modified in place: its ``width``
            and ``height`` columns are added, or overwritten on a re-run.

    Returns:
        None. The mean width and mean height are printed.
    """
    if df is None:
        df = annotation_df

    x_corners = df[['p1_x', 'p2_x', 'p3_x', 'p4_x']]
    y_corners = df[['p1_y', 'p2_y', 'p3_y', 'p4_y']]

    # Row-wise extent of the corners, computed for all rows at once.
    df['width'] = x_corners.max(axis=1) - x_corners.min(axis=1)
    df['height'] = y_corners.max(axis=1) - y_corners.min(axis=1)

    print('average width:', df['width'].mean())
    print('average height:', df['height'].mean())