In [2]:
import pandas as pd
from pathlib import Path
import re

In [3]:
file_path = Path('.').resolve() / 'data' / 'recipt_content.xlsx'
recipe_content = pd.read_excel(file_path, index_col=0).rename({0:'content'},axis = 1).dropna().reset_index(drop=True)
recipe_content.head(15).iloc[10].content
print(recipe_content.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   content     681 non-null    object 
 1   is_product  681 non-null    float64
dtypes: float64(1), object(1)
memory usage: 10.8+ KB
None


In [4]:
def extract_price(text_content:str):
    t =  text_content.replace(':','0')
    pattern = r'[^.,-]\.?(\s?\d+[,.]+?\d\d\s?[0OABC846]\s)'
    if re.search(pattern,t) and len(t) > 25:
        price = re.findall(pattern, t)
        return price[0]
    return None

recipe_content['price'] = recipe_content['content'].apply(extract_price)

In [5]:
only_products = recipe_content.loc[recipe_content['is_product'] == 1]

In [6]:
print(only_products.loc[only_products['price'].isnull()])
only_products

                       content  is_product price
85         KIWI 1 x1,79 1,790          1.0  None
87      LIMONKA 3 x1,79 5,37C          1.0  None
405  CCHLEB 500g 1*2.69 2.690          1.0  None
502      1szt. x10,99 10,9: A          1.0  None


Unnamed: 0,content,is_product,price
10,VICHY Dercos szapp/łup wrazliwy. 17841A 1op 44...,1.0,"44,99 0"
32,"BLOK BIUROWY A4 100K TOP COLOR 2.0 1SZT x5,20 ...",1.0,"5,20A"
33,"ZESZYT AS 60K KRAFT 1SZT X3,00 3,00A",1.0,"3,00A"
34,"ZESZYT A5 32K PP TOP 1SZT X2,80 2,80A",1.0,"2,80A"
35,"KOPERTY DO ZAPROSZEN METALICZNE 2SZT 1,00 2,00A",1.0,"2,00A"
...,...,...,...
615,"TYMBARK SOK MULTIVEJ 1 x2,49 2,490",1.0,2490
617,"EKA SCHMITT PŁYTK\AX 1 X24,99 24.99A",1.0,24.99A
619,"KUPIEC WAFLE RYZOVEJ 1 x2,19 2,190",1.0,2190
641,"2004714399999 WODA TOALETO 1*59,90= 59,90 A",1.0,"59,90 A"


In [7]:
only_recognised = recipe_content.loc[recipe_content['price'].notnull()]

In [8]:
print(only_recognised.value_counts('is_product'))
print(only_recognised.loc[only_recognised['is_product'] == 0])
only_recognised
# can be fixed by specifying demanded length

is_product
1.0    178
dtype: int64
Empty DataFrame
Columns: [content, is_product, price]
Index: []


Unnamed: 0,content,is_product,price
10,VICHY Dercos szapp/łup wrazliwy. 17841A 1op 44...,1.0,"44,99 0"
32,"BLOK BIUROWY A4 100K TOP COLOR 2.0 1SZT x5,20 ...",1.0,"5,20A"
33,"ZESZYT AS 60K KRAFT 1SZT X3,00 3,00A",1.0,"3,00A"
34,"ZESZYT A5 32K PP TOP 1SZT X2,80 2,80A",1.0,"2,80A"
35,"KOPERTY DO ZAPROSZEN METALICZNE 2SZT 1,00 2,00A",1.0,"2,00A"
...,...,...,...
615,"TYMBARK SOK MULTIVEJ 1 x2,49 2,490",1.0,2490
617,"EKA SCHMITT PŁYTK\AX 1 X24,99 24.99A",1.0,24.99A
619,"KUPIEC WAFLE RYZOVEJ 1 x2,19 2,190",1.0,2190
641,"2004714399999 WODA TOALETO 1*59,90= 59,90 A",1.0,"59,90 A"


In [9]:
temp = pd.concat([only_recognised,only_products]).drop_duplicates()

In [13]:
temp['content'].str.strip().to_list()

['VICHY Dercos szapp/łup wrazliwy. 17841A 1op 44,99 : 44,99 A',
 'BLOK BIUROWY A4 100K TOP COLOR 2.0 1SZT x5,20 5,20A',
 'ZESZYT AS 60K KRAFT 1SZT X3,00 3,00A',
 'ZESZYT A5 32K PP TOP 1SZT X2,80 2,80A',
 'KOPERTY DO ZAPROSZEN METALICZNE 2SZT 1,00 2,00A',
 'KOPERTA WIZYTOWA 2SZT X0,80 1,60A',
 'VICHY Dercos Silip p/lup wrazliwy. 17841A 1op * 44,99 44,99 A',
 'LIPTON ICE TEA GREEN 1.5L 1 x4,79 4,79A',
 'PIHO AMBER PSZENICZNIAK 1 x4,99 4.99A',
 'HIND SE CARLO ROSSI ROSE 1 x19,99 19,99A',
 'ZOZOLE HELLO ZELO RAIN.75 1 x2,69 2,69A',
 'BON RI RYZ BASMATI 1 KG 1 x8,99 8.990',
 'MONINI OLIWA Z OLIW.500ML 1 x19,99 19.990',
 'PAPRYKA SŁODKA MIEL. 20 G 1 x0,99 0,99B',
 'PAPRYKA OSTRA MIELONA 20G 1 x0,99 0,99B',
 'REKLAMOWKA Z USZAMI 1 x0,69 0,69A',
 'KAMIS PAPRYKA WEDZ. 20G 1 x1,79 1,798',
 'CHEDDAR MATURE PL 160G 1 x6,39 6,39C',
 'KAMIS KHIN RZYM 15 G 1 x1,79 1,798',
 'KUKURYDZA 340G 1 x2,49 2,49C',
 'FASOLA CZER. KIDNEY 410 G 1 x2.29 2,290',
 'MANDARYNKI - KG 0,28 x7,99 2.24C',
 'ZIOŁA W DONICZ

In [14]:
temp['price'].str.strip().to_list()


['44,99 0',
 '5,20A',
 '3,00A',
 '2,80A',
 '2,00A',
 '1,60A',
 '44,99 A',
 '4,79A',
 '4.99A',
 '19,99A',
 '2,69A',
 '8.990',
 '19.990',
 '0,99B',
 '0,99B',
 '0,69A',
 '1,798',
 '6,39C',
 '1,798',
 '2,49C',
 '2,290',
 '2.24C',
 '5,990',
 '6,99C',
 '92,790',
 '1,69B',
 '0,86C',
 '0,186',
 '0,650',
 '1,29A',
 '4.99C',
 '5,990',
 '2,190',
 '2,99C',
 '1,690',
 '2.99A',
 '7.70A',
 '8,00C',
 '13,69 A',
 '3,69 C',
 '4,19 C',
 '4,49 C',
 '5,99 C',
 '9,49 C',
 '9,79 C',
 '8,98 A',
 '5.690',
 '2.890',
 '2.596',
 '2.10C',
 '1.990',
 '3.99C',
 '4.990',
 '0.49A',
 '24,99 A',
 '2,89C',
 '5,16C',
 '6,98C',
 '4,990',
 '26,95 B',
 '3,50B',
 '5,00B',
 '49,99 A',
 '21.40A',
 '94.80A',
 '4,78C',
 '3,27C',
 '5,49A',
 '2,15A',
 '5,29A',
 '7,780',
 '2.99C',
 '4.990',
 '4,780',
 '8,98C',
 '3,99A',
 '1,99A',
 '11,97C',
 '14.99A',
 '2,40A',
 '0.900',
 '13.49A',
 '11.798',
 '4.194',
 '3,99A',
 '0,49A',
 '17,970',
 '14,990',
 '5.980',
 '4.980',
 '14,490',
 '5.98C',
 '1,65C',
 '3,38B',
 '3,250',
 '5.18B',
 '3.790',

In [15]:
for product, price in zip(temp['content'].str.strip().to_list(), temp['price'].str.strip().to_list()):
    print(product.removesuffix(price))


VICHY Dercos szapp/łup wrazliwy. 17841A 1op 44,99 : 44,99 A
BLOK BIUROWY A4 100K TOP COLOR 2.0 1SZT x5,20 
ZESZYT AS 60K KRAFT 1SZT X3,00 
ZESZYT A5 32K PP TOP 1SZT X2,80 
KOPERTY DO ZAPROSZEN METALICZNE 2SZT 1,00 
KOPERTA WIZYTOWA 2SZT X0,80 
VICHY Dercos Silip p/lup wrazliwy. 17841A 1op * 44,99 
LIPTON ICE TEA GREEN 1.5L 1 x4,79 
PIHO AMBER PSZENICZNIAK 1 x4,99 
HIND SE CARLO ROSSI ROSE 1 x19,99 
ZOZOLE HELLO ZELO RAIN.75 1 x2,69 
BON RI RYZ BASMATI 1 KG 1 x8,99 
MONINI OLIWA Z OLIW.500ML 1 x19,99 
PAPRYKA SŁODKA MIEL. 20 G 1 x0,99 
PAPRYKA OSTRA MIELONA 20G 1 x0,99 
REKLAMOWKA Z USZAMI 1 x0,69 
KAMIS PAPRYKA WEDZ. 20G 1 x1,79 
CHEDDAR MATURE PL 160G 1 x6,39 
KAMIS KHIN RZYM 15 G 1 x1,79 
KUKURYDZA 340G 1 x2,49 
FASOLA CZER. KIDNEY 410 G 1 x2.29 
MANDARYNKI - KG 0,28 x7,99 
ZIOŁA W DONICZCE 1 x5,99 
POMIDORY SUSZ W OLEJU280G 1 x6,99 
SHIETANA 182 400 G 1 x2,7
PRZYPRAWA DO KAWY MIX 20G 1 x1,69 
BANANY - KG 0,172 x4,99 
PAPRYKA CZERWONA KG 0,186 x6,99 1,30C
CEBULA ZOŁTA KG 0,162 x3,99 

TypeError: removesuffix() argument must be str, not None