In [None]:
def reddit_link(row):
    """Build the reddit.com URL of an advert.

    `row` is any subscriptable object exposing a 'reddit_id' key
    (for instance a row yielded by `DataFrame.iterrows`).
    """
    reddit_id = row['reddit_id']
    return "https://reddit.com/{}".format(reddit_id)

In [None]:
import sys
# Bare expression left over from interactive inspection of the import path.
sys.path
# Make the local `advert_parsing` package importable.
# NOTE(review): hard-coded absolute path to one developer's checkout; the
# notebook will not find `advert_parsing` on another machine unless this is adapted.
sys.path.append('/home/sapristi/dev/fleebmarket/advert_parsing/')

In [None]:

from pydantic import BaseModel, validator
import pandas as pd
import more_itertools
from enum import Enum
from typing import Optional
import re
import ipywidgets as ipw
from collections import defaultdict
from dataclasses import dataclass

from advert_parsing.markdown_parser.custom_ast import Text, Heading, Listing, Table,  Paragraph
from advert_parsing.markdown_parser import md_to_ast

In [None]:
# Load the parsed adverts from the pickle cache when it exists; otherwise
# build them from the CSV export (parsing every advert) and write the cache.
# NOTE(review): unpickling executes arbitrary code; only load a cache file
# produced by this notebook.
try:
    data_full = pd.read_pickle('adverts_parsed.pickle')
except FileNotFoundError:
    data_full = (
        pd.read_csv('adverts.csv')
        # Keep only the last occurrence of adverts sharing the same text.
        .drop_duplicates(subset=['full_text'], keep='last')
        # Parse the markdown of each advert.
        .assign(ast=lambda adverts: adverts['full_text'].apply(md_to_ast))
    )
    data_full.to_pickle('adverts_parsed.pickle')

data_full['ast']

In [None]:
def clean_table(table: Table):
    """Return a new Table keeping only the rows with at least one truthy cell.

    Rows whose cells are all falsy (e.g. all `None`) are dropped; the order
    of the remaining rows is preserved. The input table is not modified.
    """
    non_empty_rows = [cells for cells in table.rows if any(cells)]
    return Table(rows=non_empty_rows)

In [None]:
# Quick check: the second row only holds None cells, so it should be dropped.
clean_table(Table(rows=[
    [Text(text='ok'), None],
    [None, None]
]))

In [None]:
def extract_tables(item: Item):
    """Collect the tables found in `item` and, recursively, in its children.

    - A `Table` is returned as a one-element list, unchanged.
    - For an `Ast`, `Listing` or `Paragraph`, every child is searched and each
      table found is passed through `clean_table` (so a deeply nested table is
      cleaned once per nesting level).
    - Anything else (including `None`) yields an empty list.
    """
    # NOTE(review): `Item` and `Ast` are not imported in the cells visible
    # here; confirm they are brought into scope before this cell runs.
    if isinstance(item, Table):
        return [item]
    if not isinstance(item, (Ast, Listing, Paragraph)):
        return []

    found = []
    for child in item.children:
        for table in extract_tables(child):
            found.append(clean_table(table))
    return found
    

In [None]:
# extract_tables(Ast(children=asts[0]))

In [None]:
# For each advert: the tables extracted from its AST, then the same tables
# converted to DataFrames (one DataFrame cell per table cell).
data_full['tables'] = data_full['ast'].apply(extract_tables)
data_full['tables_df'] = data_full['tables'].apply(
    lambda tables: [pd.DataFrame(table.rows) for table in tables]
)
# Peek at the first table of the first advert.
# NOTE(review): this indexing fails if the first advert has no table.
data_full['tables_df'].iloc[0][0]

In [None]:
# Total number of tables over all adverts.
total_tables = sum(len(advert_tables) for advert_tables in data_full['tables_df'])
total_tables
    

# capture prices in a table

We have to find which column is most likely to contain prices.

Problems: 

 - different units
 - sometimes no units
 - sometimes a shipping price, which is not very relevant

## objectives

 1. Find which column contains the prices
 2. In case multiple price tags are present, find the most likely one:
    -> biggest price, not struck through
    -> in case of multiple currencies, take ??


## way to go

 1. We first try to capture prices with units in all cells. 
   - A column should contain most of the prices.
 2. If no column has prices, we try to capture unitless prices, and apply the same check


In [None]:
# Matches an unsigned number with an optional fractional part, using either
# `.` or `,` as the decimal separator (non-capturing group).
# Written as a raw string: `\d` is not a valid Python string escape, and
# non-raw usage triggers an invalid-escape-sequence warning.
number_regex = r'\d+(?:[.,]\d+)?'

class Currency(str, Enum):
    """Currencies recognised by the price parser.

    Mixes in `str`, so every member is also a string equal to its value.
    """
    GBP = "GBP"
    CAD = "CAD"
    AUD = "AUD"
    SGD = "SGD"
    EUR = "EUR"
    USD = "USD"

# Regex fragments identifying each currency in free text. They are inserted
# verbatim into larger patterns, so regex metacharacters must be escaped here:
# an unescaped `$` is an end-of-string anchor and would never match a literal
# dollar sign followed by an amount.
# The lookbehind on `s\$` prevents reading the tail of `US$` as a Singapore
# dollar marker.
# NOTE: the bare `\$` USD fragment also matches the dollar sign inside
# `sg$` / `s$`, so such prices are additionally reported as USD.
currencies = {
    Currency.GBP: ["£", "gbp"],
    Currency.CAD: ['CAD'],
    Currency.AUD: ['AUD'],
    Currency.SGD: ['sgd', r'sg\$', r'(?<![a-z])s\$'],
    Currency.EUR: ['eur', '€'],
    Currency.USD: [r'\$', 'usd'],
}

def generate_curr_regexes(curr_exprs):
    """Build the price regexes for one currency.

    For every currency fragment in `curr_exprs`, two patterns are produced, in
    this order: fragment before the amount, then fragment after the amount
    (an optional single space is allowed between them). Each pattern has one
    capturing group: the amount.
    """
    return [
        pattern
        for expr in curr_exprs
        for pattern in (
            f"{expr} ?({number_regex})",
            f"({number_regex}) ?{expr}",
        )
    ]


# Maps each Currency to the list of regexes capturing an amount in that currency.
price_regexes = {
    curr: generate_curr_regexes(curr_exprs)
    for curr, curr_exprs in currencies.items()
}

# Patterns capturing an amount that has no currency marker but is followed by
# a shipping mention ("40 shipped", "40 + shipping").
# Raw f-strings: `\+` is not a valid Python string escape, and non-raw usage
# triggers an invalid-escape-sequence warning.
no_curr_price_regexes = [
    rf"({number_regex}) ?shipped",
    rf"({number_regex}) ?\+ ?shipping"
]

In [None]:
class PriceTag(BaseModel):
    """A price found in a piece of text."""
    # Currency of the price; None for a price captured without a currency marker.
    currency: Optional[Currency]
    # Numeric amount; strings using `,` as decimal separator are accepted.
    amount: float
    # Whether the text the price was found in is struck through.
    striked: bool

    @validator('amount', pre=True)
    def replace_comma(cls, value):
        """Normalise a `,` decimal separator to `.` before float parsing.

        Runs before pydantic's own validation (`pre=True`), so `value` is the
        raw input. Non-string inputs (e.g. an amount already given as a
        number) are passed through untouched instead of failing on `.replace`.
        """
        if isinstance(value, str):
            return value.replace(',', '.')
        return value

# Sanity check: '.' and ',' decimal separators should give the same amount.
PriceTag(amount='5.5', currency=None, striked=False), PriceTag(amount='5,5', currency=None, striked=False)

In [None]:
def find_prices_in_text(text: Text) -> list[PriceTag]:
    """Find the prices mentioned in `text`.

    Every regex of `price_regexes` is tried (case-insensitively), currency by
    currency; each match becomes a PriceTag carrying that currency. Only when
    no such price is found are the currency-less patterns of
    `no_curr_price_regexes` tried, producing tags with `currency=None`.
    Each tag copies `text.striked`.
    """
    def scan(regexes, currency):
        # One PriceTag per regex match, in regex order then match order.
        return [
            PriceTag(currency=currency, amount=amount, striked=text.striked)
            for regex in regexes
            for amount in re.findall(regex, text.text, flags=re.IGNORECASE)
        ]

    with_currency = [
        tag
        for currency, regexes in price_regexes.items()
        for tag in scan(regexes, currency)
    ]
    if with_currency:
        return with_currency
    return scan(no_curr_price_regexes, None)

def find_price_wo_curr_in_text(text: Text, min_amount=10):
    """Find bare numbers in `text` and treat the large enough ones as unitless prices.

    A number is kept when its amount is at least `min_amount`. The returned
    PriceTags have `currency=None` and copy `text.striked`.
    """
    # The number must not be preceded by a letter, digit or hyphen, nor
    # followed by a letter, digit, `.` or `,`.
    number_only_regex = f"(?<![a-z0-9-])({number_regex})(?![a-z.,0-9])"
    candidates = (
        PriceTag(currency=None, amount=raw_amount, striked=text.striked)
        for raw_amount in re.findall(number_only_regex, text.text, flags=re.IGNORECASE)
    )
    return [tag for tag in candidates if tag.amount >= min_amount]

def find_sold_token_in_text(text: Text):
    """Return a one-element list telling whether `text` mentions 'sold' or 'traded'.

    The check is a case-insensitive substring search. The boolean is wrapped
    in a list so the function can be used with `find_in_cell`.
    """
    lowered = text.text.lower()
    return [any(token in lowered for token in ('sold', 'traded'))]

def find_price_token_in_text(text: Text):
    """Return a one-element list telling whether `text` contains a price keyword.

    The check is a case-insensitive substring search. The boolean is wrapped
    in a list so the function can be used with `find_in_cell`.
    """
    keywords = ('price', 'want', 'asking', 'usd', 'cost', 'pricing')
    text_lower = text.text.lower()
    return [any(keyword in text_lower for keyword in keywords)]


def find_in_cell(find_function):
    """Lift a finder working on `Text` nodes to one working on a whole table cell.

    The returned function accepts a cell and returns:
    - `[]` for a `None` cell;
    - `find_function(cell)` when the cell is a `Text`;
    - otherwise the results obtained on each of `cell.children`, flattened by
      one level into a single list.
    """
    def inner(cell):
        if cell is None:
            return []
        if isinstance(cell, Text):
            return find_function(cell)
        per_child = [inner(child) for child in cell.children]
        return list(more_itertools.collapse(per_child, levels=1))
    return inner

# technique de classification 

si des prix sont trouvés:

 - classification avec uniquement des prix
 - si échec, classification avec prix + tag sold/traded
 
si aucun prix n'est trouvé, même procédure avec uniquement les nombres

# algo de classification

on cherche si une colonne contient l'essentiel des cellules avec les prix ;
classification en table d'artisan si les prix sont répartis à peu près uniformément

In [None]:
class FoundPrices(BaseModel):
    """Number of matching cells found in one column of a table."""
    # Positional index of the column.
    col_index: int
    # Number of cells of that column with at least one match.
    nb_found: int

    
@dataclass
class ArtisanTable:
    """Classification result: prices are spread over (nearly) all columns of the table."""
    pass
    
class ItemsTable(BaseModel):
    """Classification result: a table of items with identified price column(s)."""
    # Positional indices of the columns holding prices.
    price_cols: list[int]
    # True/False when it is known whether the first row is a header;
    # None when this was not determined.
    has_header: Optional[bool] = None

@dataclass
class Failure:
    '''Classification result: the table could not be classified.'''
    # Human-readable explanation of why classification failed.
    reason: str
    
@dataclass
class NotRelevant:
    '''Classification result: the table does not contain anything useful.'''
    # Human-readable explanation of why the table is considered irrelevant.
    reason: str

In [None]:
def header_cell_with_price(cell):
    """Tell whether `cell` looks like the header of a price column.

    True when the cell contains at least one price keyword (see
    `find_price_token_in_text`) and no actual price (see
    `find_prices_in_text`). A `None` cell is never a price header.
    """
    if cell is None:
        return False
    keyword_hits = find_in_cell(find_price_token_in_text)(cell)
    price_tags = find_in_cell(find_prices_in_text)(cell)
    if not any(keyword_hits):
        return False
    return not price_tags

# TODO: we could also check for striked text, which would indicate it is not a header
def classify_with_header(df) -> ItemsTable | Failure:
    """Classify a table from its first row only.

    Every cell of the first row is tested with `header_cell_with_price`.

    Returns:
        ItemsTable(price_cols=<positions of the price headers>, has_header=True)
        when at least one price header is found; a Failure otherwise,
        including when the table has no row at all.
    """
    # A table whose rows were all dropped has no first row; `df.iloc[0]`
    # would raise IndexError.
    if len(df.index) == 0:
        return Failure("Cannot classify from header: empty table")

    price_header_indices = [
        position
        for position, cell in enumerate(df.iloc[0])
        if header_cell_with_price(cell)
    ]
    if price_header_indices:
        return ItemsTable(price_cols=price_header_indices, has_header=True)
    return Failure("Cannot classify from header")
        
# Try the header classifier on the first table of the first advert.
classify_with_header(data_full['tables_df'].iloc[0][0])

In [None]:
def make_bool_df(df, find_function):
    """Return a boolean DataFrame shaped like `df`: True where `find_function` found something.

    `find_function` is applied to each cell through `find_in_cell`; a cell is
    True when the resulting list is non-empty.
    """
    # NOTE(review): `DataFrame.applymap` is deprecated in recent pandas
    # releases in favour of `DataFrame.map`; kept as is for compatibility.
    return df.applymap(find_in_cell(find_function)).applymap(bool)

In [None]:
# Scratch experiment: element-wise `DataFrame.add` on two boolean frames.
df1 = pd.DataFrame([
    [True, False],
    [True, False]
])
df2 = pd.DataFrame([
    [False, True],
    [True, False]
])
df1.add(df2)

In [None]:
def generate_repartion(bool_df) -> list[FoundPrices]:
    """Count the True cells of each column of `bool_df`.

    Returns one FoundPrices per column having at least one True cell, sorted
    by increasing count. `col_index` is the position of the column.
    """
    per_column_hits = bool_df.sum()
    non_empty = [
        FoundPrices(nb_found=hits, col_index=position)
        for position, hits in enumerate(per_column_hits)
        if hits != 0
    ]
    non_empty.sort(key=lambda found: found.nb_found)
    return non_empty

# Price distribution per column for the first table of the first advert.
bool_df = make_bool_df(data_full['tables_df'].iloc[0][0], find_prices_in_text)
generate_repartion(bool_df)

### Classification à partir du header

 - détection d'un des mots clés (price, etc)
 - ne contient pas de prix
 
Piste à explorer: cellule simple sans ast; ou nombre de mots

In [None]:
def check_header_keywords():
    """Print the header cells of price columns that the keyword check misses.

    For every table in which prices are found in exactly one column, look at
    the first-row cell of that column: when it holds no price and is not
    recognised by `header_cell_with_price`, print it with the advert's reddit link.
    """
    for _, row in data_full.iterrows():
        for table in row['tables_df']:
            price_bool_df = make_bool_df(table, find_prices_in_text)
            repartition = generate_repartion(price_bool_df)
            if len(repartition) != 1:
                continue
            col_index = repartition[0].col_index
            # A first row that already holds a price is not a header.
            if price_bool_df.at[0, col_index]:
                continue
            header_cell = table.at[0, col_index]
            if header_cell_with_price(header_cell):
                continue
            print(header_cell, reddit_link(row))
                        
# check_header_keywords()

### Classification à partir de la répartition des prix

1. Classification différente suivant classification du header 
 OU
2. Classification indépendante puis merge des résultats ?


Remarques:
 - La solution 2. passe plus facilement à l'échelle, mais il faut faire attention aux tables d'artisan.
 - De toute façon il faut savoir classifier sans header ; mais en fait ce n'est pas la même chose de classifier sans header et de classifier en sachant qu'aucun header n'a été trouvé
 

-> implémenter les deux solutions et comparer les résultats


####

Ce qu'il en ressort:

ça a l'air de marcher plutôt pas trop mal. Ce qui pourrait être mieux pour combiner:

 - les classifiers retournent uniquement les colonnes avec des prix, et éventuellement si un header est détecté
 - la classification en tant que table d'artisan par ex se fait par la suite lors de l'étape de combinaison

#### Classification avec infos header

#### Classification combinée

In [None]:
def classify_table_simple(df, find_function) -> NotRelevant | Failure | ItemsTable | ArtisanTable:
    """Classify a table from the distribution of matches over its columns.

    `find_function` is the finder applied to every cell (through
    `make_bool_df`). A column is "relevant" when at least half of the table
    rows have a match in it.

    Returns:
        NotRelevant: no cell matched at all.
        Failure: some cells matched but no column is relevant.
        ItemsTable: exactly one relevant column, or several relevant columns
            that do not cover (nearly) the whole table.
        ArtisanTable: the relevant columns are all the columns, all but one
            (tables with more than 3 columns) or all but two (more than 4 columns).
    """
    nb_rows = len(df.index)
    nb_cols = len(df.columns)

    repartition = generate_repartion(make_bool_df(df, find_function))
    if not repartition:
        return NotRelevant(reason="No price")

    relevant_columns = [found for found in repartition if found.nb_found >= nb_rows / 2]
    if not relevant_columns:
        return Failure("Price: Not enough rows with price")

    nb_relevant = len(relevant_columns)
    # Checked before the artisan rule so that a single-column table with
    # prices stays an ItemsTable.
    if nb_relevant == 1:
        return ItemsTable(price_cols=[relevant_columns[0].col_index], has_header=None)

    if (
        nb_relevant == nb_cols
        or (nb_cols > 3 and nb_relevant >= nb_cols - 1)
        or (nb_cols > 4 and nb_relevant >= nb_cols - 2)
    ):
        return ArtisanTable()

    return ItemsTable(price_cols=[found.col_index for found in relevant_columns])

In [None]:
def combined_classif(df):
    header_classif = classify_with_header(df)
    sold_bool_df = make_bool_df(df, find_sold_token_in_text)

    price_classif = classify_table_simple(df, find_prices_in_text)
    if not isinstance(price_classif, ItemsTable):
        price_classif_wocurr = classify_table_simple(df, find_price_wo_curr_in_text)
        if isinstance(price_classif_wocurr, ItemsTable):
            price_classif = price_classif_wocurr

    match header_classif, price_classif:
        case ItemsTable(price_cols=price_cols_1, has_header=_), ItemsTable(price_cols=price_cols_2):
            common_cols = set(price_cols_1) & set(price_cols_2)
            if len(common_cols) == 0:
                return Failure(reason="No common col between header and price classif")
            return ItemsTable(price_cols=common_cols, has_header=True)
        
        case ItemsTable(price_cols=price_cols, has_header=_), Failure(reason=_):
            return ItemsTable(price_cols=price_cols, has_header=True)
        
        case ItemsTable(price_cols=price_cols, has_header=_), ArtisanTable() | NotRelevant():
            return Failure(reason="different_classif")
        
        case Failure(reason=_), ItemsTable(price_cols=price_cols):
            return ItemsTable(price_cols=price_cols, has_header=False)
            
        case Failure(reason=_), _:
            return price_classif
        
        case _, _:
            print(header_classif, price_classif)
            raise 

In [None]:
# Flatten the per-advert lists of table DataFrames into a single list.
tables_array = [
    table_df
    for advert_tables in data_full['tables_df']
    for table_df in advert_tables
]
len(tables_array)

In [None]:
# One row per table, then the combined classification of each table.
# NOTE(review): this passes a list of DataFrames straight to the DataFrame
# constructor; confirm it yields a single object column named 'table' with
# the pandas version in use.
classif_df = pd.DataFrame(tables_array, columns=['table'])
classif_df['classif'] = classif_df['table'].apply(combined_classif)

In [None]:
# Name of the result type (ItemsTable, ArtisanTable, Failure, NotRelevant) for grouping.
classif_df['class'] = classif_df['classif'].apply(lambda result: type(result).__name__)

In [None]:
# Number of tables per classification outcome.
classif_df.groupby(['class']).count()