In [1]:
def reddit_link(row):
    """Build the reddit.com URL for an advert.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing a
            ``'reddit_id'`` key.

    Returns:
        The string ``https://reddit.com/<reddit_id>``.
    """
    post_id = row['reddit_id']
    return f"https://reddit.com/{post_id}"

In [2]:
import sys

# Make the local `advert_parsing` package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# not find the package on another checkout unless it is adjusted.
ADVERT_PARSING_DIR = '/home/sapristi/dev/fleebmarket/advert_parsing/'

# Guarded so that re-running the cell does not append the same entry again.
if ADVERT_PARSING_DIR not in sys.path:
    sys.path.append(ADVERT_PARSING_DIR)

In [8]:
from pydantic import BaseModel, validator
import pandas as pd
import more_itertools
from enum import Enum
from typing import Optional
import re
import ipywidgets as ipw
from collections import defaultdict
from dataclasses import dataclass

from advert_parsing.markdown_parser import Text, Heading, Listing, Table,  Paragraph, md_to_ast
from advert_parsing.table_classification import extract_tables, combined_classif

In [4]:
# Reuse the cached frame of parsed adverts when the pickle is present;
# otherwise build it from the CSV export and write the cache for next time.
# NOTE(review): pickle files should only be loaded when they were produced
# locally by this notebook.
try:
    data_full = pd.read_pickle('adverts_parsed.pickle')
except FileNotFoundError:
    data_full = (
        pd.read_csv('adverts.csv')
        # Adverts with identical text are collapsed, keeping the last one.
        .drop_duplicates(subset=['full_text'], keep='last')
        # Parse each advert's markdown text into its AST.
        .assign(ast=lambda adverts: adverts['full_text'].apply(md_to_ast))
    )
    data_full.to_pickle('adverts_parsed.pickle')

data_full['ast']

0       children=[Paragraph(children=[Text(text='GMK D...
1       children=[Paragraph(children=[Text(text='Hello...
2       children=[Paragraph(children=[Text(text='Times...
3       children=[Paragraph(children=[Text(text='Times...
4       children=[Paragraph(children=[Text(text='TIMES...
                              ...                        
9995    children=[Paragraph(children=[Text(text='Times...
9996    children=[Paragraph(children=[Text(text='Times...
9997    children=[Paragraph(children=[Text(text='Howdy...
9998    children=[Paragraph(children=[Text(text='Times...
9999    children=[Paragraph(children=[Text(text='times...
Name: ast, Length: 9899, dtype: object

In [6]:
def _tables_to_frames(tables):
    """Build one DataFrame per extracted table, from the table's ``rows``."""
    return [pd.DataFrame(table.rows) for table in tables]


# Tables found in each advert's AST, then the same tables as DataFrames.
data_full['tables'] = data_full['ast'].apply(extract_tables)
data_full['tables_df'] = data_full['tables'].apply(_tables_to_frames)

# Peek at the first table of the first advert.
data_full['tables_df'].iloc[0][0]

Unnamed: 0,0,1
0,text='Price' styles=set(),text='Item' styles=set()
1,text='30 Eur' styles=set(),text='Cherry G81-3000 new keyboard ISO-UK Desc...
2,"children=[Text(text='280 Eur', styles={<StyleV...",text='GMK Oblivion V2 Git base + split spaceba...
3,text='115 Eur' styles=set(),"text=""GMK DMG 60% arrows layout keys Descripti..."
4,text='10 Eur/each keycap' styles=set(),text='GMK Novelties keys Description: New cond...
5,text='70 Eur' styles=set(),text='GMK Esc keys collection Description: New...
6,text='16 Eur' styles=set(),text='GMK MX logo keys kit Description: New' s...
7,text='35 Eur' styles=set(),text='GMK Geekhack logo kit Description: New' ...
8,text='30 Eur' styles=set(),text='Hot Keys Project 6u Topre Caution Bar De...
9,text='20 Eur' styles=set(),"text=""Hector's cable Beige sleeve UsbC Descrip..."


In [7]:
# Total number of tables across all adverts.
total_tables = sum(len(advert_tables) for advert_tables in data_full['tables_df'])
total_tables
    

3323

In [10]:
# Flatten the per-advert lists of tables into one list, keeping advert order.
tables_array = []
for advert_tables in data_full['tables_df']:
    tables_array.extend(advert_tables)
len(tables_array)

3323

In [11]:
# One row per table, with the table itself stored as an object in `table`.
# Built from a dict of one column: passing the list of DataFrames positionally
# makes pandas coerce it through np.array(...), which warns about ragged
# nested sequences (and raises on recent NumPy versions).
classif_df = pd.DataFrame({'table': tables_array})

# Classification produced by the library implementation, one per table.
classif_df['classif'] = classif_df['table'].apply(combined_classif)

  values = np.array([convert(v) for v in values])


In [34]:
from advert_parsing.table_classification import classify_with_header, make_bool_df, classify_table_simple, ItemsTable, Failure, ArtisanTable, NotRelevant
from advert_parsing.classification_utils.prices import  find_sold_token_in_text, find_prices_in_text, find_price_wo_curr_in_text
def combined_classif_py310(df):
    header_classif = classify_with_header(df)
    sold_bool_df = make_bool_df(df, find_sold_token_in_text)

    price_classif = classify_table_simple(df, find_prices_in_text)
    if not isinstance(price_classif, ItemsTable):
        price_classif_wocurr = classify_table_simple(df, find_price_wo_curr_in_text)
        if isinstance(price_classif_wocurr, ItemsTable):
            price_classif = price_classif_wocurr

    match header_classif, price_classif:
        case ItemsTable(price_cols=price_cols_1, has_header=_), ItemsTable(price_cols=price_cols_2):
            common_cols = set(price_cols_1) & set(price_cols_2)
            if len(common_cols) == 0:
                return Failure(reason="No common col between header and price classif")
            return ItemsTable(price_cols=common_cols, has_header=True)
        
        case ItemsTable(price_cols=price_cols, has_header=_), Failure(reason=_):
            return ItemsTable(price_cols=price_cols, has_header=True)
        
        case ItemsTable(price_cols=price_cols, has_header=_), ArtisanTable() | NotRelevant():
            return Failure(reason="different_classif")
        
        case Failure(reason=_), ItemsTable(price_cols=price_cols):
            return ItemsTable(price_cols=price_cols, has_header=False)
            
        case Failure(reason=_), _:
            return price_classif
        
        case _, _:
            print(header_classif, price_classif)
            raise 

In [35]:
# Classify every table again with the match-statement implementation and
# store the result next to the reference `classif` column.
py310_results = classif_df['table'].apply(combined_classif_py310)
classif_df['classif_py310'] = py310_results

In [36]:
# Show only the tables on which the two implementations disagree.
classif_differs = classif_df['classif'] != classif_df['classif_py310']
classif_df[classif_differs]

Unnamed: 0,table,classif,classif_py310
37,...,"price_cols=[1, 2] has_header=None","price_cols=[1, 2] has_header=False"
256,...,"price_cols=[2, 1] has_header=None","price_cols=[2, 1] has_header=False"
295,...,"price_cols=[0, 2] has_header=None","price_cols=[0, 2] has_header=False"
410,0 ...,"price_cols=[1, 2] has_header=None","price_cols=[1, 2] has_header=False"
446,0 ...,"price_cols=[2, 1] has_header=None","price_cols=[2, 1] has_header=False"
531,...,"price_cols=[0, 2] has_header=None","price_cols=[0, 2] has_header=False"
619,0 ...,"price_cols=[4, 2, 3] has_header=None","price_cols=[4, 2, 3] has_header=False"
648,0 ...,"price_cols=[0, 4] has_header=None","price_cols=[0, 4] has_header=False"
729,...,"price_cols=[0, 2] has_header=None","price_cols=[0, 2] has_header=False"
778,...,"price_cols=[1, 2] has_header=None","price_cols=[1, 2] has_header=False"


In [None]:
def _class_name(result):
    """Return the name of the class of a classification result."""
    return result.__class__.__name__


# Label each row with the class name of its reference classification.
classif_df['class'] = classif_df['classif'].apply(_class_name)

In [None]:
# Per classification class, count the non-null entries of each other column.
classif_df.groupby('class').count()