# question

## pdf

In [1]:
import os
from glob import glob 
import pandas as pd
from pdf2image import convert_from_path
from datetime import datetime

class QuePdf:
    """Rasterise question PDFs into per-page PNG images and log which PDFs are done.

    The log is a pandas DataFrame persisted as a pickle with one row per
    converted PDF (columns: ``pdf_path``, ``created_time``).  ``path_set`` mirrors
    the ``pdf_path`` column so already-converted PDFs can be skipped quickly.

    Parameters
    ----------
    log_path : str, optional
        Pickle file holding the conversion log.  Defaults to ``DEFAULT_LOG_PATH``.
    dataset_root : str, optional
        Directory that contains ``<year>/<subject>/<press>/question/pdf/*.pdf``.
        Defaults to ``DEFAULT_DATASET_ROOT``.
    """

    DEFAULT_LOG_PATH = '/home/ryh/embedding-match/ocr/datasets/quePdf.pickle'
    DEFAULT_DATASET_ROOT = '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press'

    def __init__(self, log_path=None, dataset_root=None):
        self.key = 'pdf_path'
        self.log_path = log_path or self.DEFAULT_LOG_PATH
        self.dataset_root = dataset_root or self.DEFAULT_DATASET_ROOT
        self.setup_log()

    def setup_log(self):
        """Load the log from ``self.log_path``, or start an empty one if the file is missing."""
        self.log_columns = [self.key, 'created_time']
        try:
            # NOTE: read_pickle executes arbitrary code from the file; only point
            # log_path at files this notebook wrote itself.
            self.log = pd.read_pickle(self.log_path)
        except FileNotFoundError:
            # Only a missing file means "first run".  Any other failure (corrupt
            # pickle, permissions) is raised so save_log() cannot silently
            # overwrite an existing log with an empty one.
            self.log = pd.DataFrame(columns=self.log_columns)
        self.path_set = set(self.log[self.key].values)

    def update_log(self, pdf_path):
        """Append one row for ``pdf_path`` (timestamped now) to the in-memory log."""
        row = pd.DataFrame(
            {self.key: [pdf_path], 'created_time': [datetime.now()]},
            columns=self.log_columns,
        )
        if self.log.empty:
            self.log = row
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            self.log = pd.concat([self.log, row], ignore_index=True)
        self.path_set.add(pdf_path)

    def save_log(self):
        """Sort the log by path, renumber its index and write it to ``self.log_path``."""
        self.log = self.log.sort_values(by=self.key, ascending=True).reset_index(drop=True)
        self.log.to_pickle(self.log_path)

    def get_pdf_path_list(self, year='*', subject='*', press='*'):
        """Return the sorted PDF paths matching the given year/subject/press (glob patterns)."""
        pattern = os.path.join(self.dataset_root, year, subject, press, 'question', 'pdf', '*.pdf')
        return sorted(glob(pattern))

    @staticmethod
    def _paper_dir(pdf_dir):
        """Map a ``.../pdf`` directory to its sibling ``.../paper`` directory."""
        parent, leaf = os.path.split(pdf_dir)
        if leaf == 'pdf':
            # Swap only the final folder; a blanket str.replace would also
            # rewrite any other "pdf" substring higher up in the path.
            return os.path.join(parent, 'paper')
        # Unexpected layout: keep the historical substring replacement.
        return pdf_dir.replace('pdf', 'paper')

    def pdf2papers(self, pdf_path):
        """Convert every page of ``pdf_path`` to ``<name>---NN.png`` and log the PDF.

        Does nothing (returns None) when the PDF is already in the log.
        """
        if pdf_path in self.path_set:
            # print('already exist', '---', pdf_path)
            return

        images = convert_from_path(pdf_path)

        old_dirname, old_basename = os.path.split(pdf_path)
        new_dirname = self._paper_dir(old_dirname)
        os.makedirs(new_dirname, exist_ok=True)

        total = len(images)
        for i, image in enumerate(images):
            print(i+1, total, end='\r')
            new_basename = old_basename.replace('.pdf', '') + '---%02d'%(i+1) + '.png'
            image.save(os.path.join(new_dirname, new_basename), 'PNG')

        self.update_log(pdf_path)
        
    
        
    

### steps

In [2]:
# Create the PDF converter; its constructor loads the conversion log (or starts an empty one).
quePdf = QuePdf()

In [3]:
# Inspect the converter state. Only the last expression of a cell is rendered,
# so the output below is path_set; the bare `quePdf.log` line displays nothing.
quePdf.log
quePdf.path_set

{'/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/pdf/108上[南一]國中試卷-(三)數學-B卷-(中上)-(題).pdf',
 '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/pdf/108上[南一]國中試卷-數學五A卷-(優等)-3上（題）.pdf',
 '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/pdf/108上[南一]國中試卷-數學五B卷-(中上)-3上（題）.pdf',
 '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/pdf/108下-南一國中試卷數學(4)A卷(優等)題-2下.pdf'}

In [4]:
# Collect the question PDFs for one year / subject / press and show how many were found.
pdf_path_list = quePdf.get_pdf_path_list(year='108', subject='數學', press='南一')
len(pdf_path_list)


18

In [5]:
# Pick a single PDF (the fourth in sorted order) to convert, and display its path.
pdf_path = pdf_path_list[3]
pdf_path

'/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/pdf/108上[南一]國中試卷-數學五B卷-(中上)-3上（題）.pdf'

In [6]:
# Rasterise the chosen PDF into page PNGs; returns immediately if it is already in the log.
quePdf.pdf2papers(pdf_path)

In [7]:
# Persist the in-memory log (sorted by path, index renumbered) to the pickle file.
quePdf.save_log()

In [8]:
# Display the saved conversion log.
quePdf.log


Unnamed: 0,pdf_path,created_time
0,/home/ryh/embedding-match/ocr/datasets/taiwan/...,2021-03-05 13:25:53.115240
1,/home/ryh/embedding-match/ocr/datasets/taiwan/...,2021-03-05 14:16:49.106101
2,/home/ryh/embedding-match/ocr/datasets/taiwan/...,2021-03-05 14:19:27.114984
3,/home/ryh/embedding-match/ocr/datasets/taiwan/...,2021-03-05 13:20:09.422361


## paper

### class

In [23]:
from google.cloud import vision
import io
from glob import glob 
import os
import re
from datetime import datetime
import pandas as pd
pd.set_option('display.max_rows', 500)

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches

class QuePaper:
    """Locate question numbers on scanned exam pages and log the result per page.

    Each page image ("paper") is sent to Google Cloud Vision text detection.
    Annotations that look like the start of a numbered question ("1.", "12.", ...)
    in the left or right column become entries of ``q_list`` together with the
    rectangle ("coor") that question is expected to occupy.  One log row is kept
    per page (columns in ``log_columns``) and persisted as a pickle.

    Manual corrections are stored in a second pickle ("special"): annotations that
    must always be kept ('include') or always be dropped ('exclude'), matched by
    their exact ``(xy_float, text)`` pair.

    Parameters
    ----------
    log_path, special_path : str, optional
        Pickle files for the page log and the include/exclude overrides.
    dataset_root : str, optional
        Directory containing ``<year>/<subject>/<press>/question/paper/*.png``.
    """

    DEFAULT_LOG_PATH = '/home/ryh/embedding-match/ocr/datasets/quePaper.pickle'
    DEFAULT_SPECIAL_PATH = '/home/ryh/embedding-match/ocr/datasets/special.pickle'
    DEFAULT_DATASET_ROOT = '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press'

    # Page marker such as "3-1", "(12-2" or "{7–1" at the start of the text.
    _PAGE_PATTERN = re.compile(r'^(\(|\{)?([1-9]|1[0-9])(-|–)([1-2])')
    # Question number 1..29, a dot, then at least two more characters.
    _QUESTION_PATTERN = re.compile(r'^([1-9]|[1-2][0-9])(\.)(.)(.)')

    def __init__(self, log_path=None, special_path=None, dataset_root=None):
        self.key = 'paper_path'
        self.log_path = log_path or self.DEFAULT_LOG_PATH
        self.special_path = special_path or self.DEFAULT_SPECIAL_PATH
        self.dataset_root = dataset_root or self.DEFAULT_DATASET_ROOT
        self.page = None
        self.setup_log()
        self.setup_special()
        self.vision_client = vision.ImageAnnotatorClient()

    # ------------------------------------------------------------------ log

    @staticmethod
    def _append_row(frame, row):
        """Return ``frame`` with the single-row DataFrame ``row`` appended."""
        if frame.empty:
            return row.reset_index(drop=True)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([frame, row], ignore_index=True)

    def setup_log(self):
        """Load the page log from ``self.log_path``, or start an empty one if it is missing."""
        self.log_columns = [self.key, 'created_time', 'q_list', 'page', 'check_number', 'check_page', 'is_split']
        try:
            # NOTE: read_pickle executes arbitrary code from the file; only point
            # log_path at files this notebook wrote itself.
            self.log = pd.read_pickle(self.log_path)
        except FileNotFoundError:
            # Only "no log yet" starts fresh; a corrupt file must not be
            # silently replaced by an empty log on the next save.
            self.log = pd.DataFrame(columns=self.log_columns)
        self.path_set = set(self.log[self.key].values)

    def update_log(self, path, q_list, page, check_number):
        """Append a row for one processed page; review flags start at 0."""
        row = pd.DataFrame(
            {
                self.key: [path],
                'created_time': [datetime.now()],
                'q_list': [q_list],
                'page': [page],
                'check_number': [check_number],
                'check_page': [0],
                'is_split': [0],
            },
            columns=self.log_columns,
        )
        self.log = self._append_row(self.log, row)
        self.path_set.add(path)

    def save_log(self, log):
        """Write ``log`` (sorted by path, index renumbered) to disk and reload state from it."""
        log = log.sort_values(by=self.key, ascending=True).reset_index(drop=True)
        log.to_pickle(self.log_path)
        self.setup_log()

    def get_paper_path_list(self, year='*', subject='*', press='*', pdf_path=''):
        """Return sorted page-image paths, optionally limited to the pages of one PDF.

        ``year``/``subject``/``press`` are glob patterns.  When ``pdf_path`` is
        given, only images whose name starts with that PDF's base name match.
        """
        from glob import escape  # the file imports only the glob() function

        stem = os.path.basename(pdf_path).replace('.pdf', '')
        # Escape glob metacharacters so names containing "[...]" match literally.
        # Turning the brackets into "*" (the previous approach) also matched
        # unrelated files.
        pattern = os.path.join(
            self.dataset_root, year, subject, press, 'question', 'paper',
            escape(stem) + '*.png',
        )
        return sorted(glob(pattern))

    # ------------------------------------------------------------ pipeline

    def paper2segments(self, paper_path):
        """OCR one page image, derive its question list and add it to the log.

        Does nothing when ``paper_path`` is already in the log.
        """
        if paper_path in self.path_set:
            # print('already exist', '---', paper_path)
            return

        with open(paper_path, 'rb') as f:
            self.paper_byte = f.read()
        self.paper_arr = mpimg.imread(paper_path)
        self.paper_height, self.paper_width = self.paper_arr.shape[0], self.paper_arr.shape[1]

        # Reset per page: setup_page() only assigns when it finds a marker, so
        # without this a page with no marker would inherit the previous page's
        # value (or raise AttributeError on the very first page).
        self.page = None

        ann_list = self.detect_text(self.paper_byte)
        q_ann_list = self.filter_ann_list(ann_list)
        q_list = self.get_q_list(q_ann_list)
        check_number = self.get_check_number(q_list)
        self.update_log(paper_path, q_list, self.page, check_number)

    def detect_text(self, paper_byte):
        """Run Cloud Vision text detection and return the per-token annotations.

        The first annotation is skipped (Vision puts the full-page text there).

        Raises
        ------
        RuntimeError
            If the API response carries an error message.
        """
        image = vision.Image(content=paper_byte)
        response = self.vision_client.text_detection(image=image)
        if response.error.message:
            # Otherwise a failed request would be logged as a processed page
            # with no questions and never retried.
            raise RuntimeError('Vision text_detection failed: %s' % response.error.message)
        return response.text_annotations[1:]

    def filter_ann_list(self, ann_list):
        """Keep the annotations that start a question; also record the page marker.

        An annotation is kept when it passes all heuristic checks and is not on
        the exclude list, or when it is on the include list.
        """
        result = []
        for i, ann in enumerate(ann_list):
            _, xy_float, _, _, text = self.get_rec_info(ann)
            # Join this token with the next four so "1", ".", "xx" split across
            # tokens can still be matched as one string.
            words = ''.join([x.description for x in ann_list[i:i+5]])
            passes_heuristics = (
                self.is_valid_words(words)
                and self.is_valid_x_position(xy_float)
                and self.pre_is_valid(i, ann_list)
                and not self.is_exclude(xy_float, text)
            )
            if passes_heuristics or self.is_include(xy_float, text):
                result.append(ann)
            self.setup_page(xy_float[1], words)
        return result

    def pre_is_valid(self, i, text_annotation_list):
        """Return False when annotation ``i`` sits right next to the previous token.

        A number that directly follows another token on the same line is treated
        as part of running text rather than a question number, unless that
        previous token is ")".
        """
        if i == 0:
            return True
        _, cur_xy_float, _, _, _ = self.get_rec_info(text_annotation_list[i])
        pre = text_annotation_list[i-1]
        _, pre_xy_float, _, _, _ = self.get_rec_info(pre)
        cur_x, cur_y = cur_xy_float
        pre_x, pre_y = pre_xy_float

        # Both tokens are on the same side (column) of the page ...
        same_side = (cur_x < 0.3 and pre_x < 0.3) or (cur_x > 0.5 and pre_x > 0.5)
        # ... and close together; then the previous token must be ")".
        adjacent = abs(cur_x - pre_x) < 0.05 and abs(cur_y - pre_y) < 0.03
        if same_side and adjacent and pre.description != ')':
            return False
        return True

    def setup_page(self, y_float, words):
        """Store the page marker (e.g. "3-1") if ``words`` in the top/bottom 20% starts with one."""
        if y_float < 0.2 or y_float > 0.8:
            found = self._PAGE_PATTERN.search(words)
            if found is not None:
                self.page = found.group(0).replace('–', '-').replace('(', '').replace('{', '')

    # ------------------------------------------------- include/exclude list

    def is_include(self, xy_float, text):
        """True when ``(xy_float, text)`` was manually marked as a question start."""
        return (xy_float, text) in self.include_list

    def is_exclude(self, xy_float, text):
        """True when ``(xy_float, text)`` was manually marked as not a question start."""
        return (xy_float, text) in self.exclude_list

    def setup_special(self):
        """Load the manual overrides and rebuild ``include_list`` / ``exclude_list``."""
        try:
            # NOTE: read_pickle executes arbitrary code from the file; trusted files only.
            special = pd.read_pickle(self.special_path)
        except FileNotFoundError:
            special_columns = ['is_include', 'xy_float', 'text', 'paper_path']
            special = pd.DataFrame(columns=special_columns)
        else:
            # The most recent decision for a given (position, text) wins.
            special = special.drop_duplicates(subset=['xy_float', 'text'], keep='last', inplace=False)
        self.special = special

        df_include = self.special[self.special.is_include == 'include']
        df_exclude = self.special[self.special.is_include == 'exclude']

        self.include_list = list(zip(df_include.xy_float, df_include.text))
        self.exclude_list = list(zip(df_exclude.xy_float, df_exclude.text))

    def update_special(self, is_include='exclude', xy_float=(0,0), text='', paper_path=''):
        """Record one manual override, save the overrides file and reload the lists."""
        row = pd.DataFrame({
            'is_include': [is_include],
            'xy_float': [xy_float],
            'text': [text],
            'paper_path': [paper_path],
        })
        self.special = self._append_row(self.special, row)
        self.special.to_pickle(self.special_path)
        self.setup_special()

    # ------------------------------------------------------------ heuristics

    def is_valid_words(self, words):
        """True when ``words`` starts with a number 1-29, a dot and two more characters."""
        return self._QUESTION_PATTERN.search(words) is not None

    def is_valid_x_position(self, xy_float):
        """True when the relative x position lies in the left (0, 0.15) or right (0.49, 0.6) band."""
        x_float = xy_float[0]
        return (0.15 > x_float > 0) or (0.6 > x_float > 0.49)

    def get_rec_info(self, ann):
        """Return ``(xy, xy_float, rec_width, rec_height, text)`` for one annotation.

        ``xy`` is the first vertex in pixels, ``xy_float`` the same point divided
        by the page width/height.  Needs ``self.paper_width`` / ``self.paper_height``.
        """
        vertices = ann.bounding_poly.vertices
        # Cloud Vision documents the vertex order as top-left, top-right,
        # bottom-right, bottom-left.
        top_left = vertices[0]
        top_right = vertices[1]
        bottom_left = vertices[3]
        xy = (top_left.x, top_left.y)
        xy_float = (top_left.x / self.paper_width, top_left.y / self.paper_height)
        rec_width = top_right.x - top_left.x
        rec_height = bottom_left.y - top_left.y
        text = ann.description
        return xy, xy_float, rec_width, rec_height, text

    def get_q_list(self, q_ann_list):
        """Build the question list (left column first, each column top to bottom).

        Every entry gets a ``coor`` rectangle reaching from slightly above its
        own number down to just above the next number in the same column (or
        the bottom margin for the last one).
        """
        up_space = 40       # pixels added above the question number
        down_space = 10     # pixels removed above the next question number
        bottom_space = 100  # bottom margin left out (original note: covers "xyz88.net")

        left, right = [], []
        for ann in q_ann_list:
            xy, xy_float, rec_width, rec_height, text = self.get_rec_info(ann)
            q = {
                'xy': xy, 'xy_float': xy_float,
                'rec_width': rec_width, 'rec_height': rec_height,
                'text': text, 'num': self.get_num(text),
            }
            (left if xy_float[0] < 0.45 else right).append(q)

        # Column boundaries.  With no right-column question the left column
        # extends to the page edge; previously the placeholder string 'xxx'
        # ended up in coor['x2'] and broke any arithmetic done on it.
        left_x = min(q['xy'][0] for q in left) if left else None
        right_x = min(q['xy'][0] for q in right) if right else self.paper_width

        # Within each column, order the questions from top to bottom.
        left = sorted(left, key=lambda q: q['xy'][1])
        right = sorted(right, key=lambda q: q['xy'][1])

        page_bottom = self.paper_height - bottom_space

        def assign_coor(column, x1, x2):
            for i, cur in enumerate(column):
                next_top = column[i+1]['xy'][1] if i + 1 < len(column) else page_bottom
                cur['coor'] = {'x1': x1, 'x2': x2, 'y1': cur['xy'][1] - up_space, 'y2': next_top - down_space}

        assign_coor(left, left_x, right_x)
        assign_coor(right, right_x, self.paper_width)

        return left + right

    def get_num(self, text):
        """Parse the integer before the first '.', or the whole text; 999 if unparseable."""
        try:
            return int(text.split('.', 1)[0])
        except (TypeError, ValueError, AttributeError):
            return 999

    def get_edgecolor(self, text='6.2'):
        """'b' when ``text`` contains a '.' that is not its last character, otherwise 'r'."""
        if '.' in text and text[-1] != '.':
            return 'b'
        return 'r'

    def show_paper(self, paper_path='', q_list=[], show_border=False, show_rec=True):
        """Plot the page with the detected number boxes and/or the question rectangles.

        ``show_rec`` draws each annotation box (colour from get_edgecolor);
        ``show_border`` draws each question's ``coor`` rectangle in green.
        Entries that cannot be drawn are reported and skipped.
        """
        img = mpimg.imread(paper_path)
        img_height, img_width = img.shape[0], img.shape[1]
        d = 50
        # figsize is (width, height) in inches.
        fig, ax = plt.subplots(figsize=(img_width/d, img_height/d), dpi=d)

        for q in q_list:
            try:
                if show_rec:
                    rect = patches.Rectangle(
                        q['xy'], q['rec_width'], q['rec_height'],
                        linewidth=2, edgecolor=self.get_edgecolor(q['text']), facecolor='none',
                    )
                    ax.add_patch(rect)

                if show_border:
                    coor = q['coor']
                    x1, x2, y1, y2 = coor['x1'], coor['x2'], coor['y1'], coor['y2']
                    rect = patches.Rectangle(
                        (x1, y1), abs(x2 - x1), abs(y2 - y1),
                        linewidth=2, edgecolor='g', facecolor='none',
                    )
                    ax.add_patch(rect)
            except (KeyError, TypeError) as err:
                # Best effort: rows logged by older runs may hold incomplete
                # or placeholder coordinates.  Say so instead of hiding it.
                print('show_paper: skipped entry %r (%r)' % (q.get('text') if isinstance(q, dict) else q, err))

        ax.imshow(img, interpolation='none')
        plt.tight_layout()
        plt.show()

    # -------------------------------------------------------- manual review

    def update_check_number(self, log_index=0, check_number='delete', specical_list=[]):
        """Resolve a page flagged for numbering review, then save the log.

        With ``check_number='delete'`` the overrides in ``specical_list`` are
        stored, the log row is dropped and the page is processed again.
        Any other value is simply written to the row's ``check_number``.
        """
        if check_number == 'delete':
            for special in specical_list:
                self.update_special(
                    is_include=special['is_include'],
                    xy_float=special['xy_float'],
                    text=special['text'],
                    paper_path=special['paper_path'],
                )
            paper_path = self.log.loc[log_index, self.key]
            self.log = self.log.drop(index=log_index)
            self.path_set.discard(paper_path)
            self.paper2segments(paper_path)
        else:
            self.log.loc[log_index, 'check_number'] = check_number
        self.save_log(self.log)

    def get_check_number(self, q_list):
        """'ok' when numbers go up by one (a restart at 1 is allowed), else 'to_check'."""
        for pre, cur in zip(q_list, q_list[1:]):
            if cur['num'] - pre['num'] != 1 and cur['num'] != 1:
                return 'to_check'
        return 'ok'

    def get_df_check_number(self):
        """Copy of the log rows whose ``check_number`` is 'to_check'."""
        return self.log[self.log.check_number == 'to_check'].copy()

    def get_df_check_page(self):
        """Copy of the log rows whose ``check_page`` is still 0."""
        return self.log[self.log.check_page == 0].copy()

    def update_df_check_page(self, df_check_page, item_list=[]):
        """Mark the rows of ``df_check_page`` as reviewed and apply page corrections.

        Each item is ``{'index': <log index>, 'page': <new page or 'delete'>}``.
        'delete' drops the row and removes the image file from disk.
        """
        self.log.loc[df_check_page.index, 'check_page'] = 1
        for item in item_list:
            index = item['index']
            page = item['page']
            if page == 'delete':
                paper_path = self.log.loc[index, self.key]
                self.log = self.log.drop(index=index)
                try:
                    os.remove(paper_path)
                except FileNotFoundError:
                    # Already removed by hand; still drop the row and save.
                    pass
            else:
                self.log.at[index, 'page'] = page
        self.save_log(self.log)
    
    
    

In [24]:
# text = '1.xxx'
# text = '23'
# int(text[:text.index('.')])

### steps

In [25]:
# Create the page processor: loads the page log and the include/exclude overrides,
# and constructs a Cloud Vision ImageAnnotatorClient.
quePaper = QuePaper()


In [26]:
# Collect every page image (no year/subject/press/PDF restriction) and show the count.
paper_path_list = quePaper.get_paper_path_list(year='*', subject='*', press='*', pdf_path='')
len(paper_path_list)

144

In [29]:
# Process every page image; pages already in the log are skipped inside paper2segments.
# The progress counter overwrites itself on one line via '\r'.
# The log is written to disk once, after the loop has finished.
for i, paper_path in enumerate(paper_path_list):
    print(i+1, len(paper_path_list), end='\r')
    quePaper.paper2segments(paper_path)
quePaper.save_log(quePaper.log)

144 144

### debug

In [None]:
# for i in [43, 58, 73, 94, 98, 115]:
#     print('#'*80)
#     q_list = quePaper.log.q_list.values[i]
#     paper_path = quePaper.log.paper_path.values[i]
#     page = quePaper.log.page.values[i]
#     check_number = quePaper.log.check_number.values[i]
    
#     quePaper.paper2segments(paper_path)
#     quePaper.show_paper(paper_path=paper_path, q_list=q_list, show_border=False, show_rec=True)
    
#     print('log_index', i)
#     print(paper_path)
#     print(check_number)
#     for j, q in enumerate(q_list):
#         [xy_float, text] = [q['xy_float'], q['text']]
#         print(j, (xy_float, text))
    

In [None]:
# 7, 8, 1, '8', 1, 2
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, '9', 3, 4, 
# 8, 9, '6', 10, 1, 1, 2, 3, 4
# 1, 2, '6', '8', 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, '6', 2, 3
# 10, 1, 1, 2, 3, '8', 4,

### check number

In [76]:
# Pages whose question numbering was flagged 'to_check' and needs manual review.
df_check_number = quePaper.get_df_check_number()
df_check_number

Unnamed: 0,paper_path,created_time,q_list,page,check_number,check_page,is_split


In [77]:
# Review the first flagged page: draw it with boxes and question rectangles, then
# list each detected (xy_float, text) pair so it can be copied into an override.
# NOTE(review): `i` is an index label from df_check_number but is used positionally
# via `.values[i]`; this assumes the log index is 0..n-1 (as after save_log) — confirm.
for i in df_check_number.index[:1]:
    print('#'*80)
    q_list = quePaper.log.q_list.values[i]
    paper_path = quePaper.log.paper_path.values[i]

    quePaper.show_paper(paper_path=paper_path, q_list=q_list, show_border=True, show_rec=True)
    print('log_index', i)
    print(paper_path)
    for j, q in enumerate(q_list):
        [xy_float, text] = [q['xy_float'], q['text']]
        print(j, (xy_float, text))
    

#### not delete

In [63]:
# log_index = 87
# check_number='include'
# specical_list=[]

# quePaper.update_check_number(log_index=log_index, check_number=check_number, specical_list=specical_list)


#### delete

In [None]:
# Pasted output from the "check number" review loop, kept for reference.
# It is not Python; left uncommented, this cell raises a SyntaxError under Run All.
#
# log_index 
# /home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/paper/108下-南一國中試卷數學(4)A卷(優等)題-2下---04.png
# 0 ((0.06625357483317446, 0.04261548471047495), '10.')
# 1 ((0.06291706387035272, 0.26284970722186074), '1.')
# 2 ((0.11630123927550047, 0.7260897852960312), '1')
# 3 ((0.5738798856053384, 0.048471047495120365), '2')
# 4 ((0.5724499523355576, 0.20722186076772933), '3')
# 5 (, )
# 6 ((0.5181124880838894, 0.3428757319453481), '4.')

In [75]:
# log_index = 115
# check_number='delete'

# s1 = {}
# s1['xy_float'] = (0.5938989513822688, 0.22706571242680545)
# s1['text'] = '8.'
# s1['paper_path'] = '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/paper/108下-南一國中試卷數學(4)A卷(優等)題-2下---04.png'
# s1['is_include'] = 'exclude'

# # s2 = {}
# # s2['xy_float'] = (0.1404602109300096, 0.2659188386671066)
# # s2['text'] = '8.'
# # s2['paper_path'] = '/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/paper/108上[南一]國中試卷-數學五B卷-(中上)-3上（題）---07.png'
# # s2['is_include'] = 'exclude'

# specical_list = [
#     s1, 
# #     s2
# ]

# quePaper.update_check_number(log_index=log_index, check_number=check_number, specical_list=specical_list)


In [79]:
# quePaper.special

### check page

In [None]:
# Pages whose detected page marker has not been reviewed yet (check_page == 0).
df_check_page = quePaper.get_df_check_page()
df_check_page

In [None]:
# for i in [83]:
#     print('#'*80)
#     q_list = quePaper.log.q_list.values[i]
#     paper_path = quePaper.log.paper_path.values[i]

#     quePaper.show_paper(paper_path=paper_path, q_list=q_list, show_border=True, show_rec=True)
#     print('log_index', i)
#     print(paper_path)
#     for j, q in enumerate(q_list):
#         [xy_float, text] = [q['xy_float'], q['text']]
#         print(j, (xy_float, text))
    

In [None]:
# %%bash 
# rm "/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/paper/108上[南一]國中試卷-(三)數學-B卷-(中上)-(題)---33.png"
# rm "/home/ryh/embedding-match/ocr/datasets/taiwan/13to15/press/108/數學/南一/question/paper/108上[南一]國中試卷-(三)數學-B卷-(中上)-(題)---34.png"


In [None]:
# Page corrections to apply, keyed by log index: a new page string, or 'delete' to
# drop the row and remove the image file. Use an empty list to accept every page as-is.
item_list = [
    {'index': 83, 'page': '10-2'},
#     {'index': 64, 'page': 'delete'},
#     {'index': 65, 'page': 'delete'},
]

# item_list = []

# Marks every row of df_check_page as reviewed, applies the corrections and saves the log.
quePaper.update_df_check_page(df_check_page, item_list)

In [None]:
# Display the page log after the review step.
quePaper.log

## segment

In [None]:
class QueSeg:
    """Placeholder for the question-segment stage; no behaviour is implemented yet."""

    def __init__(self):
        pass

In [None]:
# Same index list as the commented-out loop in the "debug" section above —
# presumably the log rows that needed manual inspection; TODO confirm or remove.
[43, 58, 73, 94, 98, 115]

# answer

## pdf

## paper

## segment