In [2]:
import json
from tqdm import tqdm
import pandas as pd
import requests
import os

def extract_data(df, i):
    """Flatten row ``i`` of the raw export frame into a list of plain fields.

    Args:
        df: DataFrame with ``data_row``, ``media_attributes`` and ``projects``
            columns whose cells are nested dict-like objects.
        i: Index label of the row to read (looked up as ``df[column][i]``).

    Returns:
        ``[Data_row_id, global_key, img_link, height, width, feature_id,
        annotator_email, annotations]``. ``height``/``width`` fall back to
        ``''`` together when either is missing, and so do
        ``feature_id``/``annotations``.

    Raises:
        KeyError, IndexError, TypeError: when the mandatory entries
            (``data_row`` fields, first project, first label, ``created_by``)
            are absent. Only the optional entries are defaulted.
    """
    # Lookup failures that mean "this optional field is not present".
    # A bare ``except:`` would also swallow KeyboardInterrupt and real bugs.
    missing = (KeyError, IndexError, TypeError)

    # get image info
    data_row = df['data_row'][i]
    Data_row_id = data_row['id']
    global_key = data_row['global_key']
    img_link = data_row['row_data']

    media_attributes = df['media_attributes'][i]
    try:
        height = media_attributes['height']
        width = media_attributes['width']
    except missing:
        height = ''
        width = ''

    # get annotation info (only the first project and its first label are read)
    project = df['projects'][i]
    key = list(project.keys())[0]
    info = project[key]
    first_label = info['labels'][0]

    annotator_email = first_label['label_details']['created_by']

    try:
        classification = first_label['annotations']['classifications'][0]
        feature_id = classification['feature_id']
        annotations = classification['text_answer']['content']
    except missing:
        feature_id = ''
        annotations = ''

    return [Data_row_id, global_key, img_link, height, width, feature_id, annotator_email, annotations]



In [3]:
# Load the NDJSON export (one JSON document per line). The context manager closes
# the file handle; blank lines are skipped so they cannot break json.loads.
with open('./export_ndjson/export-result (1).ndjson', encoding='utf-8') as ndjson_file:
    records = [json.loads(line) for line in ndjson_file if line.strip()]
df = pd.DataFrame.from_records(records)
n_row = df.shape[0]
values = []
start, end = 0, 10
# Never index past the last exported row.
end = min(end, n_row)
print(f'ndjson has {n_row} rows of data.')

# Make sure the download folder exists and list it once instead of once per row.
os.makedirs('./img', exist_ok=True)
downloaded = set(os.listdir('./img'))

for i in tqdm(range(start, end), desc=f'processing rows from {start} to {end}', total=end-start, ncols=100):

    # extract info
    [Data_row_id, global_key, img_link, height, width, feature_id, annotator_email, annotations] = extract_data(df, i)
    values.append([Data_row_id, global_key, img_link, height, width, feature_id, annotator_email, annotations])

    # download image using the URL (skipped when the file is already on disk)
    file_name = f'{Data_row_id}.jpg'
    if file_name in downloaded:
        continue

    try:
        # The timeout keeps a stalled server from hanging the loop; raise_for_status
        # prevents HTTP error pages from being saved as if they were images.
        response = requests.get(img_link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report and move on, the row is still kept in `values`.
        tqdm.write(f'download failed for {Data_row_id}: {exc}')
        continue

    with open(f'./img/{file_name}', 'wb') as handler:
        handler.write(response.content)
    downloaded.add(file_name)

# create a new data frame
df_simplify = pd.DataFrame(data=values, columns=['Data_row_id', 'global_key', 'img_link',
                                            'height', 'width', 'feature_id', 
                                            'annotator_email', 'annotations'])

print('showing first 5 rows of the simplified dataframe:')
print(df_simplify[:5])

ndjson has 6702 rows of data.


processing rows from 0 to 10: 100%|███████████████████████████████| 10/10 [00:00<00:00, 5997.86it/s]

showing first 5 rows of the simplified dataframe:
                 Data_row_id     global_key  \
0  clj89jr2p08ts0720cx160wxu   batch-3:7689   
1  clj89jr2p08tw0720dw722u8u   batch-3:7691   
2  clj89jr2p1uni078pggej5npj  batch-3:11014   
3  clj89jr2p1unm078pa7evh5rp  batch-3:11015   
4  clj89jr2p1unq078p59b4aq0e  batch-3:11016   

                                            img_link  height  width  \
0  https://images-na.ssl-images-amazon.com/images...    1856   1229   
1  https://www.litres.ru/static/bookimages/40/62/...    1960   1400   
2  https://images-na.ssl-images-amazon.com/images...    2444   1642   
3  https://image.isu.pub/180226193328-0cd01d7ef1d...    1496   1156   
4  https://dqzrr9k4bjpzk.cloudfront.net/images/12...    2818   2818   

                  feature_id          annotator_email  \
0  cljlhjhgw000f356lhji83sow  caparisonsoft@gmail.com   
1  cljlhr6zd000i356mmtrih7i8        kgtieku@gmail.com   
2  cljli159g0001356mpv1yd0jt        kgtieku@gmail.com   
3  cljliy98i




In [4]:
import easyocr
import numpy as np
import imageio
# NOTE(review): hard-coded local proxy; adjust or remove on other machines.
# The https_proxy value uses an https:// scheme for a local proxy — confirm that
# the proxy really speaks TLS, otherwise it should probably be http://.
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'https://127.0.0.1:7890'
import pickle as pk

# Detect English only. (easier to do text matching)
reader = easyocr.Reader(['en'], gpu=True)
# Detect Chinese too. However, the labeler might not be able to write chinese characters.
# reader = easyocr.Reader(['ch_sim', 'en'], gpu=True)

# Maps image file name -> [[width, height], easyocr result list].
ocr_dict = dict()
file_list = sorted(os.listdir('./img/'))
for name in tqdm(file_list, desc='Doing OCR', total=len(file_list), ncols=100):
    if not name.endswith('jpg'):
        continue

    try:
        img = np.asarray(imageio.v2.imread(f'./img/{name}'))
        # Image arrays are laid out (rows, columns, ...) = (height, width, ...).
        # Reading them as (w, h) would make ocr_process compare a box height
        # against the image width.
        h, w = img.shape[0], img.shape[1]
        result = reader.readtext(f'./img/{name}', paragraph=False, detail=True)
        ocr_dict[name] = [[w, h], result]

    except Exception as exc:
        # Best effort: unreadable/corrupt images are skipped, but the reason is
        # reported instead of being silently discarded.
        print(f'ERROR!! skip: {name} ({exc!r})')
        continue


Doing OCR: 100%|████████████████████████████████████████████████████| 11/11 [00:22<00:00,  2.01s/it]


In [5]:
# Two functions for text quality filtering and text matching
def ocr_process(ocr):
    """Return the OCR strings that pass the confidence, length and size filters.

    ``ocr`` is ``[[w, h], entries]`` where every entry is indexed as
    ``(bbox, text, confidence)`` and ``bbox`` is a sequence of 2-D points.
    An entry is dropped when its confidence is below 0.1, its text is shorter
    than 3 characters, or the extent of its box along the second coordinate is
    less than 5% of ``h``. Surviving texts keep their original order.
    """
    _, h = ocr[0]
    kept = []

    for entry in ocr[1]:
        confidence = float(entry[2])
        text = entry[1]
        corners = np.array(entry[0])

        # Per-axis span of the box; only the second-coordinate span is used.
        _, hb = np.max(corners, axis=0) - np.min(corners, axis=0)
        h_ratio = hb / h

        rejected = confidence < 0.1 or len(text) < 3 or h_ratio < 0.05
        if not rejected:
            kept.append(text)

    return kept


# dynamic programming for approximate text matching. (based on edit distance)
def text_match(detected_text, annotation):
    """Score how well ``detected_text`` is found somewhere inside ``annotation``.

    Both inputs are converted with ``str()`` and lower-cased. A dynamic-programming
    table counts the minimum number of single-character insertions/deletions
    needed to turn ``detected_text`` into some substring of ``annotation``
    (the first row is all zeros, so the match may start anywhere; the minimum
    is taken over the last row, so it may end anywhere).

    Returns:
        ``1 - edits / len(detected_text)`` as a float: 1.0 for an exact
        substring match, 0.0 when nothing matches. An empty ``detected_text``
        returns 1.0 (nothing to match, zero edits) instead of dividing by zero.
    """
    detected_text = str(detected_text).lower()
    annotation = str(annotation).lower()

    l1 = len(detected_text)
    l2 = len(annotation)

    # Guard the 0/0 case, which previously produced NaN with a runtime warning.
    if l1 == 0:
        return 1.0

    # Only the previous row is needed, so keep two rows instead of a full table.
    prev_row = [0] * (l2 + 1)
    for i in range(1, l1 + 1):
        row = [i] + [0] * l2  # column 0: deleting the first i detected characters
        ch = detected_text[i - 1]
        for j in range(1, l2 + 1):
            if ch == annotation[j - 1]:
                row[j] = prev_row[j - 1]
            else:
                row[j] = min(prev_row[j], row[j - 1]) + 1
        prev_row = row

    return 1 - min(prev_row) / l1


In [6]:
# Do text filtering and matching
# For every row: 'good' when the length-weighted match score between the selected
# OCR strings and the annotation is at least 0.8, 'no' when it is lower, and
# 'error' when no usable OCR text exists for the row.

OCR_review = []
ocr_selected = []
for name, annotation in tqdm(zip(df_simplify['Data_row_id'], df_simplify['annotations']), total=df_simplify.shape[0], ncols=100):

    # No OCR entry: the image was not downloaded or OCR failed for it.
    ocr = ocr_dict.get(f'{name}.jpg')
    if ocr is None:
        OCR_review.append('error')
        ocr_selected.append('')
        continue

    try:
        detected_text = ocr_process(ocr)
        total_l = sum(len(text) for text in detected_text)
        match_score_sum = sum(text_match(text, annotation) * len(text) for text in detected_text)
    except Exception as exc:
        # Best effort: keep going, but say why this row could not be scored.
        tqdm.write(f'OCR matching failed for {name}: {exc!r}')
        OCR_review.append('error')
        ocr_selected.append('')
        continue

    # Nothing survived the OCR filters, so there is no score to compute.
    if total_l == 0:
        OCR_review.append('error')
        ocr_selected.append('')
        continue

    r = match_score_sum / total_l
    OCR_review.append('no' if r < 0.8 else 'good')
    ocr_selected.append(detected_text)


df_simplify['ocr_selected'] = ocr_selected

# Insert on the first run, overwrite on re-runs so the cell stays re-runnable.
if 'OCR_review' not in df_simplify.columns:
    df_simplify.insert(loc=0, column='OCR_review', value=OCR_review)
else:
    df_simplify['OCR_review'] = OCR_review

print('showing first 5 rows of the simplified dataframe:')
print(df_simplify[:5])


100%|██████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 209.84it/s]

showing first 5 rows of the simplified dataframe:
  OCR_review                Data_row_id     global_key  \
0       good  clj89jr2p08ts0720cx160wxu   batch-3:7689   
1       good  clj89jr2p08tw0720dw722u8u   batch-3:7691   
2       good  clj89jr2p1uni078pggej5npj  batch-3:11014   
3       good  clj89jr2p1unm078pa7evh5rp  batch-3:11015   
4       good  clj89jr2p1unq078p59b4aq0e  batch-3:11016   

                                            img_link  height  width  \
0  https://images-na.ssl-images-amazon.com/images...    1856   1229   
1  https://www.litres.ru/static/bookimages/40/62/...    1960   1400   
2  https://images-na.ssl-images-amazon.com/images...    2444   1642   
3  https://image.isu.pub/180226193328-0cd01d7ef1d...    1496   1156   
4  https://dqzrr9k4bjpzk.cloudfront.net/images/12...    2818   2818   

                  feature_id          annotator_email  \
0  cljlhjhgw000f356lhji83sow  caparisonsoft@gmail.com   
1  cljlhr6zd000i356mmtrih7i8        kgtieku@gmail.com   
2  




In [7]:
import clip
import imageio
from PIL import Image
import torch
import torch.optim as optim

# get clip text embedding for sentences
def get_text_embeds(sentences, model):
    """Encode ``sentences`` with the CLIP text encoder of ``model``.

    Args:
        sentences: A string or list of strings passed to ``clip.tokenize``.
        model: Object exposing ``parameters()`` and ``encode_text``.

    Returns:
        The output of ``model.encode_text`` cast to ``torch.float``.
    """
    # Take the device from the model itself instead of relying on a notebook
    # global named `device` that is only defined in a later cell.
    model_device = next(model.parameters()).device

    text = clip.tokenize(sentences).to(model_device)
    text_features = model.encode_text(text).to(torch.float)

    return text_features

# Learn the linear combination of sentence embeddings that best approximates the image embedding.
def select_sentence(text_features, image_features):
    """Fit one non-negative weight per sentence embedding and return the final loss.

    A weight column of ones (one row per row of ``text_features``) is optimised
    with Adam for 1000 steps. Each step combines the sentence embeddings as a
    ReLU-weighted sum and minimises
    ``1 - cosine_similarity(image_features, combination) + L1(weights)``,
    with the gradient norm clipped to 1.0. Both feature tensors are detached,
    so only the weights are trained.

    Returns:
        ``loss.item()`` from the last iteration, i.e. the loss evaluated before
        the final optimiser step. This is the full loss including the L1 term,
        not the cosine similarity itself. The learned weights are not returned.
    """
    n_sentences = text_features.shape[0]
    weights = torch.nn.Parameter(torch.ones([n_sentences, 1]).to(text_features.device))
    adam = optim.Adam([weights], lr=0.001, betas=(0.9, 0.999), eps=1e-08,
                      weight_decay=0.0, amsgrad=False)

    sentence_embeds = text_features.detach()
    target = image_features.detach()

    for _step in range(1000):
        adam.zero_grad()

        combined = (sentence_embeds * torch.relu(weights)).sum(dim=0)
        similarity = torch.nn.functional.cosine_similarity(target, combined, dim=1, eps=1e-8)
        loss = 1 - similarity + torch.norm(weights, p=1)

        loss.backward()
        torch.nn.utils.clip_grad_norm_([weights], 1.0)
        adam.step()

    return loss.item()

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, preprocess = clip.load('ViT-B/32', device, jit=False)

# One score per row. NOTE: select_sentence returns its final loss
# (1 - cosine similarity + L1 penalty), so lower values mean a closer match
# even though the column is called 'CLIP_cosine'.
cos_sim = []
for name, annotation in tqdm(zip(df_simplify['Data_row_id'], df_simplify['annotations']), total=df_simplify.shape[0], ncols=150):

    # split('.') yields empty fragments (e.g. after a trailing period); encoding
    # them would add meaningless "sentences" to the combination.
    if isinstance(annotation, str):
        sentences = [part.strip() for part in annotation.split('.') if part.strip()]
    else:
        sentences = []

    # Nothing to compare against the image: record a missing score.
    if not sentences:
        cos_sim.append(torch.nan)
        continue

    try:
        # Close the image file as soon as it has been preprocessed.
        with Image.open(f'./img/{name}.jpg') as pil_image:
            image = preprocess(pil_image).unsqueeze(0).to(device)

        # The CLIP encoders are not trained here (select_sentence detaches both
        # feature tensors), so skip building their autograd graphs.
        with torch.no_grad():
            image_features = model.encode_image(image).to(torch.float)
            text_features = get_text_embeds(sentences, model)

        d = select_sentence(text_features, image_features)
        cos_sim.append(d)

    except Exception as exc:
        # Best effort: a missing/corrupt image or an encoding failure gives NaN,
        # but the reason is reported instead of being silently discarded.
        tqdm.write(f'CLIP scoring failed for {name}: {exc!r}')
        cos_sim.append(torch.nan)

# Insert on the first run, overwrite on re-runs so the cell stays re-runnable.
if 'CLIP_cosine' not in df_simplify.columns:
    df_simplify.insert(loc=0, column='CLIP_cosine', value=cos_sim)
else:
    df_simplify['CLIP_cosine'] = cos_sim

print('showing first 5 rows of the simplified dataframe:')
print(df_simplify[:5])

df_simplify.to_csv('./review.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.25it/s]

showing first 5 rows of the simplified dataframe:
   CLIP_cosine OCR_review                Data_row_id     global_key  \
0     0.700348       good  clj89jr2p08ts0720cx160wxu   batch-3:7689   
1     0.664635       good  clj89jr2p08tw0720dw722u8u   batch-3:7691   
2     0.660056       good  clj89jr2p1uni078pggej5npj  batch-3:11014   
3     0.667256       good  clj89jr2p1unm078pa7evh5rp  batch-3:11015   
4     0.668083       good  clj89jr2p1unq078p59b4aq0e  batch-3:11016   

                                            img_link  height  width  \
0  https://images-na.ssl-images-amazon.com/images...    1856   1229   
1  https://www.litres.ru/static/bookimages/40/62/...    1960   1400   
2  https://images-na.ssl-images-amazon.com/images...    2444   1642   
3  https://image.isu.pub/180226193328-0cd01d7ef1d...    1496   1156   
4  https://dqzrr9k4bjpzk.cloudfront.net/images/12...    2818   2818   

                  feature_id          annotator_email  \
0  cljlhjhgw000f356lhji83sow  caparison


