In [None]:
libs = ["kagglehub[pandas-datasets]"]
try:
  !pip install -q libs
except ValueError:
  print("Already installed, ", ValueError)

In [None]:
import kagglehub
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from transformers import BertForTokenClassification, DistilBertTokenizerFast, pipeline


In [None]:
# Register tqdm with pandas so `progress_apply` (used in later cells) is available.
tqdm.pandas()

In [None]:
import requests
import zipfile
import os

def baixar_e_extrair_folha_dataset(dest_dir='./folha_dataset', timeout=60):
    """Download the Folha UOL news archive and extract it into ``dest_dir``.

    The archive is streamed to a temporary zip file in the current working
    directory, extracted, and the zip file is then deleted.

    Args:
        dest_dir: Directory that receives the extracted files. It is created
            when it does not exist.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot hang the notebook.

    Returns:
        ``dest_dir``, unchanged.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    url = "https://www.kaggle.com/api/v1/datasets/download/marlesson/news-of-the-site-folhauol"
    zip_path = "./news-of-the-site-folhauol.zip"

    print("⬇️  Baixando dataset...")
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"❌ Erro {response.status_code} ao baixar dataset.")

        try:
            with open(zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # Do not leave a truncated archive behind after a failed download.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            raise
    print(f"✅ Download concluído: {zip_path}")

    try:
        print("📦 Extraindo arquivos...")
        os.makedirs(dest_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)
        print(f"✅ Extração concluída para: {dest_dir}")
    finally:
        # The zip is only an intermediate artifact; remove it even when
        # extraction fails.
        os.remove(zip_path)

    return dest_dir


In [None]:
# Download and extract the dataset; `caminho` holds the extraction directory.
caminho = baixar_e_extrair_folha_dataset()

In [None]:
# Show the directory returned by the download step.
caminho

In [None]:
# Load the articles table from the directory the download step extracted to.
# Building the path from `caminho` keeps the cell portable; a hard-coded
# backslash path only resolves on Windows.
df = pd.read_csv(os.path.join(caminho, 'articles.csv'))

print("First 5 records: \n")
display(df.head())

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus that `stopwords.words` reads below.
nltk.download('stopwords')

from nltk.corpus import stopwords

# Portuguese stop-word list, stored as a set
stopwords_pt = set(stopwords.words('portuguese'))

In [None]:
# Drop rows that have no value in the 'text' column.
df = df.dropna(axis=0, subset=['text'])


In [None]:
# Number of rows per 'subcategory' value.
df.subcategory.value_counts()

In [None]:
# Parse the 'date' column into datetime objects and store them in a new
# 'timestamp' column (the 'date' column itself is left as is).
df['timestamp'] = pd.to_datetime(df['date'])
df.shape

In [None]:

# Keep only rows whose 'category' or 'subcategory' mentions 'mercado',
# ignoring case; missing values are treated as non-matches.
df = df.loc[
    lambda frame: frame['category'].str.contains('mercado', case=False, na=False)
    | frame['subcategory'].str.contains('mercado', case=False, na=False)
]

df.shape

In [None]:
# Restrict the frame to articles dated in the first quarter of 2015.
df = df.loc[
    lambda frame: frame['timestamp'].dt.year.eq(2015)
    & frame['timestamp'].dt.quarter.eq(1)
]
df.shape

In [None]:
# Token-classification (NER) model plus the cased Portuguese BERT vocabulary.
# `model_max_length=512` caps the tokenizer's sequence length and
# `do_lower_case=False` keeps the original casing of the text.
model = BertForTokenClassification.from_pretrained('monilouise/ner_pt_br')
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    model_max_length=512,
    do_lower_case=False,
)
# `aggregation_strategy="simple"` is the supported spelling of the deprecated
# `grouped_entities=True`: adjacent tokens with the same tag are merged into
# one entry carrying 'entity_group', 'word' and 'score'.
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
def run_nlp(text):
    """Run the NER pipeline on ``text``; an empty or falsy result becomes ``[]``."""
    return nlp(text) or []

# Run NER over every article text (with a progress bar); results go to 'doc'.
df['doc'] = df['text'].progress_apply(run_nlp)

In [None]:
# Peek at the second entry (index 1) of the first row's NER output.
# NOTE(review): raises IndexError when that row has fewer than two entries.
df.head(1).doc.values[0][1]

In [None]:
def extract_words(text):
    """Return the ``'word'`` value of every entry in ``text``.

    Args:
        text: Iterable of mappings that each contain a ``'word'`` key (here the
            per-row NER output stored in the 'doc' column), or ``None``.

    Returns:
        A list of words in input order, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    return [item['word'] for item in text]

# One list of entity words per article.
df['words'] = df['doc'].progress_apply(extract_words)

In [None]:
def extract_entities(text):
    """Return the ``'entity_group'`` label of every entry in ``text``.

    Args:
        text: Iterable of mappings that each contain an ``'entity_group'`` key
            (here the per-row NER output stored in the 'doc' column), or
            ``None``.

    Returns:
        A list of labels in input order, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    labels = []
    for item in text:
        labels.append(item['entity_group'])
    return labels

# One list of entity labels per article.
df['entities'] = df['doc'].progress_apply(extract_entities)

In [None]:
def extract_entities_classifications(text):
    """Reduce each entry of ``text`` to its word, label and score.

    Args:
        text: Iterable of mappings with ``'word'``, ``'entity_group'`` and
            ``'score'`` keys, or ``None``.

    Returns:
        A list of dicts holding only those three keys, with the score
        converted to a built-in ``float``; ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    slimmed = []
    for item in text:
        slimmed.append({
            'word': item['word'],
            'entity_group': item['entity_group'],
            'score': float(item['score']),
        })
    return slimmed

# Per article: list of {'word', 'entity_group', 'score'} dicts.
df['entities_classifications'] = df['doc'].progress_apply(extract_entities_classifications)

In [None]:
# Inspect the slimmed-down entity dicts of the first row.
df.head(1).entities_classifications.values[0]

In [None]:
# Each cell of 'entities' is a list, and lists are unhashable, so calling
# value_counts() on the column directly raises TypeError. Explode to one label
# per row first, then count how often each entity label occurs.
df['entities'].explode().value_counts()

In [None]:
# NER output of the first article whose text mentions 'bradesco' (any case).
# NOTE(review): raises IndexError when no article matches.
df[df.text.str.contains('bradesco',case=False)].head(1).doc.values[0]

In [None]:
# Stop words used when rebuilding entities: the Portuguese list plus two
# extra tokens.
sw = stopwords_pt | {'Nu', 'CN'}

In [None]:
def reconstruir_entidades(ents, tipo='ORG', stop_words=None):
    """Rebuild entity strings of one type from grouped NER output.

    Entries whose ``'entity_group'`` differs from ``tipo`` are ignored. Among
    the remaining entries, a word starting with ``'##'`` is glued (without the
    marker) onto the entity being built; any other word closes the current
    entity and starts a new one. A finished entity is kept only when it is
    longer than two characters and is not a stop word. The same rule now
    applies to the last entity, which previously skipped the filter.

    Args:
        ents: Iterable of mappings with ``'word'`` and ``'entity_group'`` keys.
            ``None`` or an empty iterable yields ``[]``.
        tipo: Entity label to keep.
        stop_words: Container of strings to discard (exact, case-sensitive
            match). When ``None``, the notebook-level ``sw`` set is used.

    Returns:
        List of rebuilt entity strings, in order of appearance.
    """
    if not ents:
        return []
    if stop_words is None:
        # Fall back to the stop-word set defined at notebook level.
        stop_words = sw

    def _aceitavel(candidata):
        # Keep only names longer than two characters that are not stop words.
        return len(candidata) > 2 and candidata not in stop_words

    entidades_reconstruidas = []
    entidade_atual = ''

    for ent in ents:
        if ent['entity_group'] != tipo:
            continue

        palavra = ent['word']
        if palavra.startswith('##'):
            # Word-piece continuation: append without the '##' marker.
            entidade_atual += palavra[2:]
        else:
            if _aceitavel(entidade_atual):
                entidades_reconstruidas.append(entidade_atual)
            entidade_atual = palavra

    # Flush the last entity through the same filter as all the others.
    if _aceitavel(entidade_atual):
        entidades_reconstruidas.append(entidade_atual)

    return entidades_reconstruidas


# Rebuild the entity names per article with the function's default type ('ORG').
df['orgs_reconstruidas'] = df['doc'].apply(reconstruir_entidades)


In [None]:
# Underlying array of the rebuilt organisation lists (every row).
df.orgs_reconstruidas.values[:]

In [None]:
# Same data shown as a Series, alongside the frame's index.
df.orgs_reconstruidas

In [None]:
from collections import Counter

# Flatten the per-article organisation lists into one list of names.
counter: list = [nome for orgs in df.orgs_reconstruidas for nome in orgs]

# Print the 20 most frequent organisations as (name, count) tuples.
org_count = Counter(counter).most_common(20)
for org in org_count:
    print(org)