In [4]:
import pandas as pd
import sys
sys.path.insert(0, "../scripts")
from prediction_utils import *
import os
from sklearn import metrics
import unidecode

# add Cloud Vision API key to environment
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../ServiceAccountToken_VisionAPI.json'


test_dir = '../data/test_text'
test_imgs = os.listdir(test_dir)

# Get cereal info from DB
passwd = open('../db_info','r').readlines()[1].split()[0]
username = open('../db_info','r').readlines()[0].split()[0]
dbname = 'cereals'
db = sqlalchemy.create_engine(f'mysql+pymysql://{username}:{passwd}@localhost/{dbname}')
conn = db.connect()
s = 'select * from cereals2'
df = pd.read_sql(s, connect_to_db())

In [5]:
def modified_jaccard_similarity(ocr_words, ocr_areas, set2):
    """ Calculate Jaccard similarity between two sets of strings"""

    total_area = np.sum(ocr_areas)
    
    intersect = set(ocr_words).intersection(set2)
    fractional_area = 0
    for word in intersect:
        indices = [i for i in range(len(ocr_words)) if ocr_words[i] == word]
        fractional_area += np.max(ocr_areas[indices] / total_area) # take max in case word occurs multiple times
    #print(f"Fractional area: {fractional_area}")
    
    return fractional_area * len(intersect) / (len(ocr_words) + len(set2) - len(intersect))
    

def get_cereal2(df, ocr_words, ocr_areas):
    # add jaccard column to dataframe
    df["jaccard"] = np.nan

    for ix, row in df.iterrows():
        # pre-process cereal name
        cereal = row['cereal_name'] + " " + row['company']
        cereal = process_string_for_comparison(cereal)
        cereal = set(cereal.split())
        # Get jaccard and add to dataframe
        jaccard = modified_jaccard_similarity(ocr_words, ocr_areas, cereal)
        df.loc[ix, "jaccard"] = jaccard

    # if no jaccard greater than zero OR multiple cereals share max jaccard, return empty string
    if df['jaccard'].max() == 0 or len(df.loc[df['jaccard']==df['jaccard'].max()]) > 1:
        predicted_cereal = ''
    else:
        predicted_cereal = df['cereal_name'][df['jaccard'].idxmax()]

    
    confidence = 1
    
    # only return positive identification if company name in OCR words
    predicted_company = df['company'][df['jaccard'].idxmax()]
    predicted_company = set(process_string_for_comparison(predicted_company).split())
    if len(predicted_company.intersection(set(ocr_words))) < len(predicted_company):
        confidence = 0
    
    predicted_cereal_set = set(process_string_for_comparison(predicted_cereal).split())
    if len(predicted_cereal_set.intersection(set(ocr_words))) < len(predicted_cereal_set):
        confidence = 0
        
    return predicted_cereal, confidence

def PolygonArea(x,y):
    """Calculate area of a polygon given vertices"""
    return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))

In [42]:
#stop_words = {'of', 'with', 'net', 'wt', 'oz', 'per', 'serving', 'cup'}

gt_label_ids = []
gt_labels = []

labels_1 = []
labels_2 = []
labels_1_id = []
labels_2_id = []
    
np.random.seed(17)

# main loop
for iteration in range(100):
    
    # get random file ids
    indicies = np.random.randint(len(test_imgs), size=(1,np.random.randint(10, 20))).tolist()[0]
    print(f"Running iteration {iteration}")

    # load images, get labels
    imgs = []
    widths = []
    heights = []
    for index in indicies:
        test_img = test_imgs[index]
        gt_label_ids.append(int(test_img[:3]))
        gt_labels.append(df.loc[df['label_id']==gt_label_ids[-1]]['cereal_name'].to_string(index=False).strip())

        img = Image.open(os.path.join(test_dir, test_img))
        imgs.append(img)
        heights.append(img.size[1])
        widths.append(img.size[0])
    stacked_image = Image.new('RGB', (max(widths), sum(heights)))
    stacked_img_edges = np.concatenate([np.array([0]), np.cumsum(heights)[:-1], np.array([sum(heights)])])

    # make stacked image
    for img, y_top in zip(imgs, stacked_img_edges[:-1]):
        stacked_image.paste(img, (0,y_top))

    # Detect text with Cloud Vision API and parse results
    OCR_words_all, OCR_vertices = Vision_API_OCR(stacked_image)

    for ix, index in enumerate(imgs):
        OCR_words = []
        OCR_areas = np.empty((0))
        for word, vertices in zip(OCR_words_all, OCR_vertices):
            if (vertices['y'] > stacked_img_edges[ix]).all() & (vertices['y'] < stacked_img_edges[ix+1]).all():
                #if word not in stop_words:
                word = unidecode.unidecode(word)
                OCR_words.append(word)
                OCR_areas = np.append(OCR_areas, PolygonArea(vertices['x'], vertices['y']))
        
        # strip any whitespace
        OCR_words = list(map(str.strip, OCR_words))
    
        if len(OCR_words) > 0:
            labels_1.append( get_cereal(df, set(OCR_words)) )
            label, confidence = get_cereal2(df, OCR_words, OCR_areas)
            if confidence == 0: label = ''
            labels_2.append( label )
            labels_1_id.append(df.loc[df['cereal_name']==labels_1[-1]]['label_id'].to_numpy())
            labels_2_id.append(df.loc[df['cereal_name']==labels_2[-1]]['label_id'].to_numpy())

Running iteration 0
Running iteration 1
Running iteration 2
Running iteration 3
Running iteration 4
Running iteration 5
Running iteration 6
Running iteration 7
Running iteration 8
Running iteration 9
Running iteration 10
Running iteration 11
Running iteration 12
Running iteration 13
Running iteration 14
Running iteration 15
Running iteration 16
Running iteration 17
Running iteration 18
Running iteration 19
Running iteration 20
Running iteration 21
Running iteration 22
Running iteration 23
Running iteration 24
Running iteration 25
Running iteration 26
Running iteration 27
Running iteration 28
Running iteration 29
Running iteration 30
Running iteration 31
Running iteration 32
Running iteration 33
Running iteration 34
Running iteration 35
Running iteration 36
Running iteration 37
Running iteration 38
Running iteration 39
Running iteration 40
Running iteration 41
Running iteration 42
Running iteration 43
Running iteration 44
Running iteration 45
Running iteration 46
Running iteration 47
Ru

In [43]:
true1 = []
predict1 = []
true2 = []
predict2 = []

for gt, l1, l2 in zip(gt_labels, labels_1, labels_2):
    if l1 != '':
        true1.append(gt)
        predict1.append(l1)
    if l2 != '':
        true2.append(gt)
        predict2.append(l2)

# Print the precision and recall, among other metrics
print(metrics.classification_report(true1, predict1, digits=3))
print(metrics.classification_report(true2, predict2, digits=3))     

                                             precision    recall  f1-score   support

                              All Bran Buds      1.000     1.000     1.000        27
                          All Bran Original      1.000     1.000     1.000        28
                    Apple Cinnamon Cheerios      1.000     1.000     1.000         9
                                Apple Jacks      1.000     1.000     1.000        27
                               Cap'n Crunch      1.000     1.000     1.000        33
                                   Cheerios      1.000     0.080     0.148        25
                                       Chex      0.000     0.000     0.000         0
               Chocolate Peanut Butter Pops      1.000     1.000     1.000        28
    Chocolate Toast Crunch cocoa & cinnamon      0.929     1.000     0.963        13
                      Cinnamon Toast Crunch      1.000     1.000     1.000        34
                               Classic Trix      1.000     0.769