In [1]:
# import packages
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import imutils
from PIL import Image

from PIL import Image, ImageEnhance, ImageFilter
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

import re
import glob
import pandas as pd
from PIL import ExifTags

# Helper Functions

In [5]:
# display images
def show(image):
    """Display `image` in a new matplotlib figure.

    Opens a fresh figure, draws `image` with ``plt.imshow`` and then calls
    ``plt.show()``. Nothing is returned.
    """
    plt.figure()
    plt.imshow(image)
    plt.show()

    
# Regex patterns, keyed by brand, used to extract style codes from text.
# Raw string literals keep the regex escapes (\s, \d, \., ...) away from
# Python's own string-escape processing; the pattern text itself is unchanged.
patterns = {
    'nike': [
        r'[\s_]([a-z]{1,2}[0-9]{4,5}(-|_|\.|\s)[0-9]{3})[\s\_\-\,]|[\s_]([0-9]{6}(-|_|\.|\s)[0-9]{3})[\s\_\-\,]',
    ],
    'adidas': [
        r'art\s?([a-z\d]{1,2}\d{4,6})|art\s?(\d{5,6})|art\s?no\.\s?([a-z\d]{1,2}\d{4,6})|art\s?no\.\s?(\d{5,6})',
    ],
}


def find_stylecode(d, brand):
    """Return the first style-code match for `brand` found in text `d`.

    The text is lower-cased and searched with each pattern registered for
    `brand` in `patterns`, in order.

    Parameters
    ----------
    d : str
        Text to search.
    brand : str
        Key into `patterns` ('nike' or 'adidas').

    Returns
    -------
    str
        The whole matched span (``m[0]``), which still contains the
        surrounding delimiter characters or the 'art' prefix matched by the
        pattern, or the literal string "None" when no pattern matches.

    Raises
    ------
    KeyError
        If `brand` is not a key of `patterns`.
    """
    brand_patterns = patterns[brand]
    # Lower-case once instead of once per pattern.
    text = d.lower()
    for p in brand_patterns:
        m = re.search(p, text)
        if m is not None:
            return m[0]
    # Callers compare against this sentinel string, so it is not the None object.
    return "None"


# calculate accuracy for the model
def get_result(sneaker_data, col):
    """Report recognition metrics overall and then per brand.

    Keeps only the rows whose 'condition' equals the string '1', restricted
    to the columns 'brand', 'style-code' and `col`, and passes them to
    `calculate`. The same is then done for the 'adidas' rows and the 'nike'
    rows, each preceded by a printed brand header.

    Parameters
    ----------
    sneaker_data : pandas.DataFrame
        Must contain 'condition', 'brand', 'style-code' and `col`.
    col : str
        Name of the column holding the recognised codes.
    """
    is_selected = sneaker_data['condition'] == '1'
    data = sneaker_data.loc[is_selected, ['brand', 'style-code', col]]

    # Overall figures first, then one report per brand.
    calculate(data, col)
    for brand in ('adidas', 'nike'):
        brand_rows = data.loc[data['brand'] == brand]
        print(brand, ": ")
        calculate(brand_rows, col)
def calculate(df, col):
    """Print coverage and accuracy of the recognised codes in `df[col]`.

    Printed values, in order: coverage rate, accuracy rate, number of
    covered rows, number of exact matches.

    * coverage  -- rows where `df[col]` is not missing.
    * accuracy  -- rows where `df[col]` equals the 'style-code' value with
      '-' removed and lower-cased, divided by the number of covered rows.

    An empty frame prints ``0 0 0 0``. When no row is covered the accuracy
    rate is reported as 0.0 instead of dividing by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'style-code' and `col`.
    col : str
        Name of the column holding the recognised codes.
    """
    total_num = df.shape[0]
    if total_num == 0:
        print(0, 0, 0, 0)
        return

    coverage = int(df[col].notna().sum())
    # The .str accessor propagates missing style codes as NaN (which never
    # compare equal) instead of raising on them.
    codes = df['style-code'].str.replace('-', '', regex=False).str.lower()
    acc_n = int((codes == df[col]).sum())

    coverage_rate = coverage / total_num
    # Accuracy is undefined when nothing was recognised; report 0.0.
    accuracy_rate = acc_n / coverage if coverage else 0.0
    print(coverage_rate, accuracy_rate, coverage, acc_n)
        





# image recognition workflow -- Tesseract ocr engine

In [7]:
# df = pd.read_csv('spreedsheet_add.csv')

# Tesseract command-line options for the two page segmentation modes tried.
# NOTE(review): only `preserve_interword_spaces=1` is preceded by `-c`; the
# `tessedit_char_whitelist=...` token has no `-c` of its own, so Tesseract may
# not be applying the whitelist (which also repeats "EFG"). Left byte-identical
# here because changing it would alter recognition results -- confirm against
# the Tesseract command-line documentation.
_TESSERACT_PSM11_CONFIG = "--psm 11 --oem 1 -c preserve_interword_spaces=1 tessedit_char_whitelist=ABCDEFGEFGHIJKLMNOPQRSTUVWXYZ0123456789-/"
_TESSERACT_PSM6_CONFIG = "--psm 6 --oem 1 -c preserve_interword_spaces=1 tessedit_char_whitelist=ABCDEFGEFGHIJKLMNOPQRSTUVWXYZ0123456789-/"


def _binarize(bgr):
    """Convert a BGR image to grayscale and apply the fixed binary threshold."""
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 120, 200, 0)
    return binary


def _contours_by_area(binary):
    """Return every contour found in `binary`, largest contour area first."""
    found = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # The contour list sits at index 1 when imutils reports OpenCV 3,
    # otherwise at index 0.
    contours = found[1] if imutils.is_cv3() else found[0]
    return sorted(contours, key=cv2.contourArea, reverse=True)


def _clean_stylecode(raw):
    """Strip spaces, the 'art' prefix and separator characters from a match."""
    # Same removals, in the same order, as the original chained .replace calls.
    for junk in (' ', 'art', '-', '\n', '_', '.', ','):
        raw = raw.replace(junk, '')
    return raw


def _ocr_stylecode(bgr, brand, engine_configs):
    """Blur, denoise and OCR `bgr`; return the first style code found.

    Each Tesseract configuration in `engine_configs` is tried in order.
    Returns the cleaned style code, or the string "None" if none matched.
    """
    # Bilateral blur: remove noise but preserve the edges.
    blurred = cv2.bilateralFilter(bgr, 9, 50, 50)
    # `denoise` is not defined in the visible part of this notebook; only its
    # first return value is used.
    out_gray, _ = denoise(blurred)
    for engine_config in engine_configs:
        text = pytesseract.image_to_string(out_gray, lang='eng', config=engine_config)
        stylecode = _clean_stylecode(find_stylecode(text, brand))
        if stylecode != "None":
            return stylecode
    return "None"


def image_recognition_model(df):
    """Run the Tesseract OCR workflow on every row and store the result.

    For each row the image ``'./data/' + df['file']`` is loaded, resized to
    a width of 500 px, thresholded, and cropped to the bounding box of its
    largest contour. The crop is enlarged by 1.2x and OCR'd with page
    segmentation modes 11 then 6. If no style code is found, the bounding
    boxes of the 2nd and 3rd largest contours of the crop are blanked out in
    turn (one at a time) and OCR is retried with modes 6 then 11.

    Changes compared with the previous implementation:
    * 'recognized' is created as an object column filled with ``np.nan``
      (``np.NaN`` no longer exists in NumPy 2.0, and writing strings into a
      float column is a deprecated silent upcast in pandas).
    * An unreadable image is reported and the row skipped, instead of
      failing on ``image.shape``.
    * If no contour is found the whole resized image is used, instead of
      reusing the previous row's bounding box.
    * Rows are addressed through ``df.index`` rather than ``range(n)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'file', 'brand', 'style-code' and 'condition'.
        Assumes unique index labels.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, modified in place: column 'recognized' holds
        the cleaned style code, or NaN where nothing was recognised.
    """
    df['recognized'] = pd.Series(np.nan, index=df.index, dtype=object)

    for i in df.index:
        filename = './data/' + df.loc[i, 'file']
        brand = df.loc[i, 'brand']
        code = df.loc[i, 'style-code']
        condition = df.loc[i, 'condition']
        print('provided info: '+filename, brand, code, condition, i)

        image = cv2.imread(filename)
        if image is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            print('could not read image: ' + filename)
            continue

        # Resize to a fixed width, keeping the aspect ratio.
        width = 500
        width_scale = image.shape[1] / width
        height = max(1, int(image.shape[0] / width_scale))
        image = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)

        # Cropping sneaker tongue labels: keep the largest contour's bounding box.
        cnts = _contours_by_area(_binarize(image))
        if cnts:
            x, y, w, h = cv2.boundingRect(cnts[0])
            # Kept from the original: the outline is drawn before cropping, so
            # it is part of the cropped pixels.
            cv2.rectangle(image, (x, y), (x + w, y + h), 255, 0)
            new_img = image[y:y+h, x:x+w]
        else:
            new_img = image
        new_img = cv2.resize(new_img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)

        # First attempt: apply two different page segmentation modes.
        stylecode = _ocr_stylecode(
            new_img, brand, [_TESSERACT_PSM11_CONFIG, _TESSERACT_PSM6_CONFIG])

        if stylecode == "None":
            # Second attempt: removing QR code -- blank out the 2nd and 3rd
            # largest contours of the crop, one at a time, and retry.
            for c in _contours_by_area(_binarize(new_img))[1:3]:
                x, y, w, h = cv2.boundingRect(c)
                new_img_qrcode = new_img.copy()
                new_img_qrcode[y:y+h, x:x+w] = 255
                new_img_qrcode = cv2.resize(new_img_qrcode, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
                stylecode = _ocr_stylecode(
                    new_img_qrcode, brand, [_TESSERACT_PSM6_CONFIG, _TESSERACT_PSM11_CONFIG])
                if stylecode != "None":
                    break

        if stylecode != "None":
            df.loc[i, 'recognized'] = stylecode
    return df

In [8]:
# Maps a character to the characters substituted for it when generating
# similar style codes (see impute_code).
# NOTE(review): the previous literal listed the key 'r' twice (['a', 'b'] and
# later ['b']). Python keeps the last value for a repeated key, so the
# effective entry was ['b']; that effective value is kept here. Confirm
# whether ['a', 'b'] was the intended mapping.
similar_database = {
    '6': ['8', 'g'],
    '5': ['s', '0', '6'],
    '0': ['9', 'q', '1', 'o', 'c', '5'],
    '7': ['1', '2'],
    '8': ['9', '5', '3'],
    '1': ['t', '7', '0'],
    'a': ['4'],
    'g': ['q'],
    'e': ['6'],
    '9': ['5'],
    'r': ['b'],
    '2': ['3'],
    '3': ['9'],
    '4': ['7'],
}

In [9]:
# helper function for generating similar style codes
def impute_code(code):
    """Generate single-character variants of `code`.

    Parameters
    ----------
    code : list of str
        The code as a list of single characters.

    Returns
    -------
    list of str
        The unchanged code first, followed by one string per substitution:
        positions are visited left to right, and for each position whose
        character is a key of `similar_database` every listed replacement
        is tried in order. Only one position differs from the input in any
        variant, and `code` itself is not modified.
    """
    variants = [''.join(code)]
    for position, character in enumerate(code):
        # Work on a fresh copy so only this position differs from the input.
        candidate = code.copy()
        for lookalike in similar_database.get(character, ()):
            candidate[position] = lookalike
            variants.append(''.join(candidate))
    return variants
# use regex pattern to predict the brand
def recognize_brand(stylecode):
    """Guess the brand from the shape of a cleaned style code.

    Parameters
    ----------
    stylecode : object
        A style code with separators already removed.

    Returns
    -------
    'nike'
        One or two lowercase letters followed by 7-8 digits, or exactly
        9 digits.
    'adidas'
        One or two lowercase letters followed by 4-6 digits, or 5-6 digits.
    None
        A string that matches neither shape.
    float('nan')
        `stylecode` is not a string (e.g. a missing value).
    """
    if not isinstance(stylecode, str):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    # The nike shape is tested first.
    if re.fullmatch(r'[a-z]{1,2}[0-9]{4,5}[0-9]{3}|[0-9]{6}[0-9]{3}', stylecode) is not None:
        return 'nike'
    if re.fullmatch(r'[a-z]{1,2}\d{4,6}|\d{5,6}', stylecode) is not None:
        return 'adidas'
    return None
        

# generating similar style codes

In [4]:

def impute_prod_code(stylecode):
    """Generate up to 10 style codes that look similar to `stylecode`.

    The brand is inferred with `recognize_brand`. For 'nike' the last three
    characters are treated as the colour code and the rest as the product
    code; variants of both parts (from `impute_code`) are combined. For any
    other recognised brand the whole code is varied. Only variants that
    still match the brand's pattern are kept.

    Parameters
    ----------
    stylecode : object
        A cleaned style code (no separators).

    Returns
    -------
    list of str
        At most 10 codes; the unmodified code comes first when it matches.
    float('nan')
        `stylecode` is not a string, or its brand is not recognised.
    """
    if not isinstance(stylecode, str):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    brand = recognize_brand(stylecode)
    if brand is None:
        return np.nan

    # Full-code shapes used to validate generated variants. Named differently
    # from the module-level `patterns` dict so that it is not shadowed.
    fullmatch_patterns = {
        'nike': r'[a-z]{1,2}[0-9]{4,5}-[0-9]{3}|[0-9]{6}-[0-9]{3}',
        'adidas': r'[a-z]{1,2}\d{4,6}|\d{5,6}',
    }

    if brand == 'nike':
        # nike: product code + 3-character colour code
        prod_code = list(stylecode[:-3])
        color_code = list(stylecode[-3:])
    else:
        # adidas: no separate colour code
        prod_code = list(stylecode)
        color_code = None

    prod_code_ls = impute_code(prod_code)
    if color_code is not None:
        color_code_ls = impute_code(color_code)
        # Colour variants form the outer loop, so all product variants with
        # the original colour come first; this ordering decides which codes
        # survive the 10-item cut below.
        imputed_stylecode_ls = [
            pd_code + cl_code
            for cl_code in color_code_ls
            for pd_code in prod_code_ls
            if re.fullmatch(fullmatch_patterns[brand], pd_code + '-' + cl_code) is not None
        ]
    else:
        imputed_stylecode_ls = [
            x for x in prod_code_ls
            if re.fullmatch(fullmatch_patterns[brand], x) is not None
        ]

    # get at most 10 generated similar style codes
    return imputed_stylecode_ls[:10]

In [None]:
#apply generating similar style code model to all data in dataframe
def generate_similar_stylecode_in_df(df):
    """Add similar-code and brand columns derived from df['recognized'].

    Writes two columns into `df` in place and returns the same object:
    'imputed stylecode' (``impute_prod_code`` applied to each recognised
    code) and 'recognized brand' (``recognize_brand`` applied to each
    recognised code).
    """
    recognized = df['recognized']
    derived_columns = (
        ('imputed stylecode', impute_prod_code),
        ('recognized brand', recognize_brand),
    )
    for column, transform in derived_columns:
        df[column] = recognized.apply(transform)
    return df

