### Importing Dependencies

In [760]:
import re
from math import ceil

### Top Level Functions

In [551]:
def fix_letters_into_numbers(str):
    """Lower-case the text and swap digit look-alike letters for digits.

    After lower-casing, ``o`` becomes ``0``, ``i`` and ``l`` become ``1``,
    ``b`` becomes ``8`` and ``s`` becomes ``5``. Every other character is
    returned as-is (in its lower-cased form).
    """
    lookalikes = {
        ord('o'): '0',
        ord('i'): '1',
        ord('l'): '1',
        ord('b'): '8',
        ord('s'): '5',
    }
    return str.lower().translate(lookalikes)

def remove_whitespaces(str):
    """Return the text with every space character (``' '``) removed.

    Only the plain space is dropped; tabs and newlines are kept.
    """
    pieces = str.split(' ')
    return ''.join(pieces)

def remove_doublespaces(str):
    """Collapse every run of two or more spaces into a single space.

    A single ``str.replace('  ', ' ')`` pass only halves each run, so three or
    more consecutive spaces would still leave a doubled space behind. The
    regular expression squeezes a whole run in one go. Only the plain space
    character is affected; tabs and newlines are left alone.
    """
    return re.sub(r' {2,}', ' ', str)

def remove_spaces_between_num_sep(str):
    """Strip at most one whitespace character on each side of every comma.

    ``'4 , 19'`` becomes ``'4,19'``. The pattern is applied to every comma in
    the text, not only to commas that sit between digits.
    """
    padded_comma = re.compile(r'\s?,\s?')
    return padded_comma.sub(',', str)
    
def jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens.

    Each string is split on whitespace and turned into a set; the result is
    ``|A & B| / |A | B|`` as a float between 0.0 and 1.0.

    When neither string contains a token the union is empty and the ratio is
    undefined. Two empty token sets are identical, so 1.0 is returned in that
    case instead of raising ZeroDivisionError.
    """
    tokens1 = set(str1.split())
    tokens2 = set(str2.split())
    union = tokens1 | tokens2
    if not union:
        return 1.0
    return len(tokens1 & tokens2) / len(union)

def apply_price_sim(str):
    """Score how closely a line resembles the template ``d,d (d,d) = d,d``.

    Every run of digits is replaced by a single ``d`` and the text is
    lower-cased; the whitespace-separated tokens of the result are then
    compared with the template's tokens via ``jaccard_sim``.

    Returns the Jaccard score as a float.
    """
    # `\d+` collapses a whole digit run at once. Replacing each digit with 'd'
    # and then doing one replace('dd', 'd') pass only halved the run, so
    # "123,45" turned into "dd,d" and could never equal the "d,d" token.
    value = re.sub(r'\d+', 'd', str).lower()
    return jaccard_sim(value, 'd,d (d,d) = d,d')

### Loading Data

In [678]:
# Read all input lines up front. The context manager closes the handle as soon
# as the lines are in memory instead of leaving it open for the kernel's life.
with open('./data.txt') as file:
    original_data = file.readlines()

# Item line: 3-digit sequence number, whitespace, 13-digit code, one
# whitespace character, then the description up to the end of the line.
sku_reg = re.compile(r'^(?P<seq>\d{3})\s+(?P<sku>\d{13})\s(?P<desc>.*)$')
# A comma-decimal number wrapped in parentheses, e.g. "(4,40)".
price_reg = re.compile(r'\(\d+,\d+\)')
# Any comma-decimal number, e.g. "16,79".
prices_reg = re.compile(r'\d+,\d+')

### Data Preparation

In [613]:
# Normalise every raw line: trim it, squeeze doubled spaces, discard lines
# that end up empty, then remove the padding around commas.
data_prep = [
    remove_spaces_between_num_sep(squeezed)
    for squeezed in (remove_doublespaces(raw.strip()) for raw in original_data)
    if squeezed
]

#### Get Skus

Create a list of every line that matches the SKU pattern.

In [630]:
def build_sku(item):
    """Flatten a matched line into a plain dictionary.

    ``item['match']`` must be a regex match exposing the named groups ``seq``,
    ``sku`` and ``desc``; ``item['data']`` is copied through unchanged.
    Returns a new dict with the keys ``seq``, ``sku``, ``desc`` and ``data``.
    """
    found = item['match']
    sku = {field: found.group(field) for field in ('seq', 'sku', 'desc')}
    sku['data'] = item['data']
    return sku

# Keep only the prepared lines that match the SKU pattern and turn each match
# into a flat dictionary.
sku_list = [
    build_sku({'match': found, 'data': line})
    for line, found in ((text, sku_reg.match(text)) for text in data_prep)
    if found
]

# Show the first five entries, separated by blank lines.
print(*sku_list[:5], sep='\n\n\n')

{'seq': '001', 'sku': '7891172432019', 'desc': 'NEVE P. HIG NEUTRO F. DUPL', 'data': '001 7891172432019 NEVE P. HIG NEUTRO F. DUPL'}


{'seq': '002', 'sku': '7898367980010', 'desc': 'EISENBAHN CERU UBT 26 79 (4,40) = 16,79', 'data': '002 7898367980010 EISENBAHN CERU UBT 26 79 (4,40) = 16,79'}


{'seq': '003', 'sku': '7896051111764', 'desc': 'ITAMBE LEITE 2. LACT"SEMI', 'data': '003 7896051111764 ITAMBE LEITE 2. LACT"SEMI'}


{'seq': '009', 'sku': '7896051128076', 'desc': 'ITAMBE LEITE Z. LACT DESN', 'data': '009 7896051128076 ITAMBE LEITE Z. LACT DESN'}


{'seq': '005', 'sku': '7896051128076', 'desc': 'ITAMBE LEITE 2 LACT DESN', 'data': '005 7896051128076 ITAMBE LEITE 2 LACT DESN'}


Clean data with prices

##### Cost

In [860]:
def find_units(item):
    """Tag ``item`` with the first unit marker found in ``item['data']``.

    The pattern matches ``un`` preceded by a whitespace character or ``kg``
    followed by one; the surrounding whitespace is stripped from the hit.
    When nothing matches, the unit defaults to ``'un'``. The same dict is
    updated in place (key ``'unit'``) and returned.

    NOTE(review): the alternation reads as ``\\sun`` OR ``kg\\s`` rather than
    ``\\s(un|kg)\\s`` - confirm that this asymmetry is intended.
    """
    hits = [hit.strip() for hit in re.findall(r'\sun|kg\s', item['data'])]
    item['unit'] = hits[0] if hits else 'un'
    return item

def find_costs(item):
    """Extract the decimal amounts from ``item['data']`` and derive a quantity.

    Reads ``item['data']`` (amounts written with ``.`` as decimal separator)
    and ``item['unit']``. Stores two keys on ``item`` and returns the same
    dict:

    * ``'costs'`` - every ``digits.digits`` amount found, as floats, in order.
    * ``'qtd'``   - ``ceil(total / unit_price)``, where the total is the last
      amount. The unit price is the first amount, unless the item is sold by
      ``'kg'`` or fewer than three amounts were found; in those cases the
      total itself is used, which gives a quantity of 1. ``None`` when no
      amount was found or the unit price is zero, because no quantity can be
      derived then.
    """
    costs = [float(amount) for amount in re.findall(r'\d+\.\d+', item['data'])]
    if not costs:
        item.update({'costs': costs, 'qtd': None})
        return item

    total = costs[-1]
    sold_by_weight = item['unit'] == 'kg'
    # The previous condition `len(costs) or kg < 3` parsed as
    # `len(costs) or (kg < 3)`, which is always truthy, so the quantity was
    # always 1. The comparison belongs to the number of amounts.
    unit_price = total if sold_by_weight or len(costs) < 3 else costs[0]

    if unit_price == 0:
        qtd = None
    else:
        # Round before ceil so float noise just above an exact multiple
        # does not bump the quantity up by one.
        qtd = ceil(round(total / unit_price, 6))

    item.update({'costs': costs, 'qtd': qtd})
    return item
    
def fix_cost_separator(item):
    """Rewrite comma-decimal numbers in ``item['data']`` with a dot separator.

    Every ``digits,digits`` occurrence becomes ``digits.digits`` (for example
    ``16,79`` -> ``16.79``). The dict is modified in place and returned.
    """
    # Raw string: `\g` is not a valid string escape, so the non-raw literal
    # triggers an invalid-escape warning on modern Python versions.
    item['data'] = re.sub(r'(\d+),(\d+)', r'\g<1>.\g<2>', item['data'])
    return item


def tokenize(x):
    """Wrap a line in a record: 'token' is the line without spaces, 'data' the line itself."""
    return {'token': remove_whitespaces(x), 'data': x}


def filter_prices(x):
    """Return a match object (truthy) when the record's token contains a parenthesised price."""
    return price_reg.search(x['token'])


def remove_kg(x):
    """Return True for records whose unit is not 'kg' (predicate for the optional filter below)."""
    return x['unit'] != 'kg'


# Build the cost records and materialise them as a list. A bare map/filter
# chain is a one-shot iterator: the print loop below would exhaust it and
# leave `list_cost` empty for every later cell.
list_cost = map(tokenize, data_prep)
list_cost = map(find_units, list_cost)
# list_cost = filter(remove_kg, list_cost)  # optional: drop items sold by weight
list_cost = filter(filter_prices, list_cost)
list_cost = map(fix_cost_separator, list_cost)
list_cost = list(map(find_costs, list_cost))

seq = 0  # stays 0 when no line matched
for seq, i in enumerate(list_cost, start=1):
    print(seq, '\t', i['qtd'],'\t', i['unit'], '\t', i['costs'], '\t', i['data'])

# print(*[x['data'] for x in list_cost], sep='\n')


1 	 1 	 un 	 [4.4, 16.79] 	 002 7898367980010 EISENBAHN CERU UBT 26 79 (4.40) = 16.79
2 	 1 	 un 	 [3.89, 1.39, 3.89] 	 1 un X 3.89 (1.39) -< 3.89
3 	 1 	 un 	 [4.19, 0.18, 4.19] 	 1 un X 4.19 (0.18) = 4.19
4 	 1 	 un 	 [4.19, 0.18, 4.19] 	 1 un * 4.19 (0.18) = 4.19
5 	 1 	 un 	 [4.19, 0.18, 4.19] 	 un & 4.19 (0.18) = 4.19
6 	 1 	 un 	 [4.19, 0.18, 4.19] 	 I un X 4.19 (0.18) * 4.19
7 	 1 	 un 	 [2.79, 0.32, 2.79] 	 1 un 2 2.79 (0.32) = 2.79
8 	 1 	 un 	 [6.79, 1.51, 6.79] 	 1 un X 6.79 (1.51) = 6.79
9 	 1 	 un 	 [6.79, 1.51, 6.79] 	 1 un X 6.79 (1.51) = 6.79
10 	 1 	 un 	 [6.79, 1.51, 6.79] 	 1 un X 6.79 (1.51) = 6.79
11 	 1 	 un 	 [1.69, 0.41, 1.69] 	 1 un X 1.69 (0.41) = 1.69
12 	 1 	 un 	 [3.79, 1.34, 3.7] 	 1 un * 3.79 (1.34 ) = 3.7
13 	 1 	 un 	 [3.79, 1.34, 3.79] 	 1 un X 3.79 (1.34) = 3.79
14 	 1 	 un 	 [3.79, 1.34, 3.75] 	 I un X 3.79 (1.34) = 3.75
15 	 1 	 un 	 [3.79, 1.34, 3.79] 	 1 un X 3.79 (1.34 ) = 3.79
16 	 1 	 un 	 [3.39, 1.21, 3.39] 	 un X 3.39 (1.21) = 3.39
17 	 1 	 u

### Find CNPJ

In [155]:



# Score every line in `str_lines` against the template
# 'd un x d,dd (d,dd) = d,dd' and keep the score next to the line; the
# score is truncated to an int after multiplying by 100.
# NOTE(review): `apply_price_similarity`, `str_lines` and `print_list` are not
# defined in the visible part of this notebook (only the one-argument
# `apply_price_sim` is). Confirm they are defined elsewhere, otherwise this
# cell fails on a fresh kernel.
prices_list = list(map(lambda x: {'score': int(apply_price_similarity(x, 'd un x d,dd (d,dd) = d,dd') * 100), 'data': x}, str_lines))

print_list(prices_list)

# Disabled CNPJ extraction draft, kept for reference.
# NOTE(review): in the character class [\d.-/] below, '.-/' is parsed as a
# range from '.' to '/', so a literal '-' is not matched - verify before
# re-enabling.
# filtered_cnpj = filter_list_by_regex(r'CNPJ', str_lines)
# filtered_cnpj = map_list(remove_whitespaces, str_lines)
# filtered_cnpj = map_list(lambda x: x.replace(',','.'), filtered_cnpj)

# cnpj_store = filtered_cnpj[0]

# cnpj_store = re.search(r'CNPJ:([\d.-/]*)', cnpj_store)

# cnpj_store = cnpj_store[1] if len(cnpj_store.groups()) == 1 else None

# print(filtered_cnpj)

{'score': 0, 'data': 'Supermercado Mirassol Ltda'}
{'score': 0, 'data': 'R. Joao Mirassol, 509 - VL. PESTANA - OSASCO - SP'}
{'score': 0, 'data': 'CNPJ:60. 708.245/0001-08 TE :492195864111'}
{'score': 0, 'data': 'Extrato No. 63772'}
{'score': 0, 'data': 'CUPOM FISCAL ELETRONICO - SAT'}
{'score': 0, 'data': 'CPF/CNPJ do Consumidor : 333.599.318-74'}
{'score': 5, 'data': '# ICOD IDESC IQTD JUN IVL UN RS ICUL TR R$)* IVL ITEM RS'}
{'score': 0, 'data': '001 7891172432019 NEVE P. HIG NEUTRO F. DUPL'}
{'score': 15, 'data': '002 7898367980010 EISENBAHN CERU UBT 26 79 (4,40) = 16,79'}
{'score': 71, 'data': '1 un X 3,89 (1,39) -< 3,89'}
{'score': 0, 'data': '003 7896051111764 ITAMBE LEITE 2. LACT"SEMI'}
{'score': 62, 'data': '1 un X 4,19 (0, 18) = 4,19'}
{'score': 0, 'data': '009 7896051128076 ITAMBE LEITE Z. LACT DESN'}
{'score': 55, 'data': '1 un * 4, 19 (0,18) = 4,19'}
{'score': 8, 'data': '005 7896051128076 ITAMBE LEITE 2 LACT DESN'}
{'score': 0, 'data': '006 7896051128076 ITAMBE LEITE Z.LA