In [2]:
# Feito a conexão ao contêiner local

from elasticsearch import Elasticsearch
es = Elasticsearch("http://localhost:9200")
print(es.info())

{'name': '3a728dd319ba', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'WO2zj9DLSoaLWbLwQ2v37A', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [8]:
"""
É criado o dataframe do dataset.

Explicação:

Acessamos o caminho "./dataset/bbc"
Para cada item na pasta (categoria) adicionamos ao 'data':
    id,
    categoria (nome da pasta),
    title (primeira linha do arquivo),
    text (texto do artigo)
"""

import os
import pandas as pd

path = "./dataset/bbc"
data = []
id_counter = 0

for category in os.listdir(path):
    category_path = os.path.join(path, category)

    if os.path.isdir(category_path):

        for filename in os.listdir(category_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(category_path, filename)

                with open(file_path, "r", encoding="latin1") as f:
                    text = f.read().strip()

                lines = text.split("\n")
                title = lines[0].strip() if len(lines) > 0 else ""
                full_text = " ".join(lines[1:]).strip() if len(lines) > 1 else title

                data.append({
                    "id": id_counter,
                    "category": category,
                    "title": title,
                    "text": full_text
                })

                id_counter += 1

df = pd.DataFrame(data, columns=["id","category","title","text"])
print(df.shape)
print(df.head(10))

(2225, 4)
   id category                             title  \
0   0     tech  Looks and music to drive mobiles   
1   1     tech   Hi-tech posters guide commuters   
2   2     tech      Xbox power cable 'fire fear'   
3   3     tech  Games win for Blu-ray DVD format   
4   4     tech  PC photo printers challenge pros   
5   5     tech  Software watching while you work   
6   6     tech    Online commons to spark debate   
7   7     tech  Apple sues to stop product leaks   
8   8     tech           Broadband soars in 2004   
9   9     tech  Pandas benefit from wireless net   

                                                text  
0  Mobile phones are still enjoying a boom time i...  
1  Interactive posters are helping Londoners get ...  
2  Microsoft has said it will replace more than 1...  
3  The next-generation DVD format Blu-ray is winn...  
4  Home printed pictures can be cheaper and highe...  
5  Software that can not only monitor every keyst...  
6  Online communities set up by 

In [10]:
# Feito um holdout (70/30) no DataFrame
# Obs: Provavelmente não é a melhor abordagem. Poderíamos fazer isso após gerar o dataframe para o treinamento do LTR algorithm.

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['category']
)

print("Tamanho do treino:", len(train_df))
print("Tamanho do teste:", len(test_df))


Tamanho do treino: 1557
Tamanho do teste: 668


In [12]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [14]:
# Criamos um index no formato 'mapping' e adicionamos todos os documentos (no formato do índice declarado) ao nosso contêiner local.

from tqdm import tqdm

index_name = "bbc_articles"

if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

mapping = {
    "mappings":{
        "properties": {
            "title": {"type":"text"},
            "text": {"type":"text"},
            "category": {"type":"keyword"}
        }
    }
}

es.indices.create(index=index_name, body=mapping)

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    doc = {
        "title": row["title"],
        "text": row["text"],
        "category": row["category"]
    }
    es.index(index=index_name, id=int(row["id"]), document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 1557/1557 [00:13<00:00, 119.09it/s]


In [17]:
# Exemplo de uma consulta em nossos documentos indexados. Utiliza o BM-25 para realizar a busca (lexical search).

query = {
    "query": {
        "match": {
            "text": "government policy economy"
        }
    }
}

res = es.search(index=index_name, body=query)
for hit in res['hits']['hits']:
    print(hit['_score'], "->", hit['_source']['title'])

10.023342 -> Markets signal Brazilian recovery
9.773174 -> Japan economy slides to recession
9.589449 -> Japanese growth grinds to a halt
9.490807 -> Brazil jobless rate hits new low
8.596207 -> Parties warned over 'grey vote'
8.546261 -> BBC poll indicates economic gloom
7.951545 -> US trade deficit widens sharply
7.8857813 -> Newest EU members underpin growth
7.743498 -> 'Strong dollar' call halts slide
7.718348 -> Blair backs 'pre-election budget'


In [32]:
# É criado o DataFrame que será utilizado para o treinamento do nosso algoritmo LTR.
# Utilizamos 30 consultas (5 tipos, cada um com 6 consultas diferentes)
# Percorremos as categorias das consultas e assim, as suas consultas associadas realizando buscas em nossos documentos indexados.
# Em nosso DataFrame adicionamos: id da query, texto da query, id do documento, score do bm-25, relevância, categoria do doc e da query.

queries = {
    "business": [
        "latest stock market trends",
        "company profits and earnings reports",
        "impact of inflation on global economy",
        "business mergers and acquisitions news",
        "rise of startups in finance sector",
        "oil prices and economic growth"
    ],
    "entertainment": [
        "new movie releases this month",
        "celebrity awards and red carpet events",
        "top streaming series of the year",
        "music festivals and live performances",
        "film reviews from critics",
        "box office hits and records"
    ],
    "politics": [
        "government election campaign promises",
        "international relations and diplomacy",
        "new legislation passed by parliament",
        "political debates on climate change",
        "public opinion on government policies",
        "european union policy changes"
    ],
    "sport": [
        "football world cup latest scores",
        "tennis grand slam results",
        "athletes preparing for olympic games",
        "transfer news in premier league",
        "team rankings in international cricket",
        "basketball championship finals highlights"
    ],
    "tech": [
        "latest smartphone innovations",
        "artificial intelligence in healthcare",
        "cybersecurity and data breaches",
        "new software development frameworks",
        "future of quantum computing",
        "robotics and automation in industry"
    ]
}

ltr_data = []
query_id = 0

for category, query_list in queries.items():
    for query_text in query_list:
        query_id += 1

        query_body = {
            "size":50,
            "query":{
                "multi_match": {
                    "query": query_text,
                    "fields":["title","text"]
                }
            }
        }

        res = es.search(index=index_name, body=query_body)

        for hit in res["hits"]["hits"]:
            doc_id = hit["_id"]
            score = hit["_score"]
            doc_category = hit["_source"]["category"]
    
            relevance = 1 if doc_category == category else 0
    
            ltr_data.append({
                "query_id": query_id,
                "query_text": query_text,
                "doc_id": doc_id,
                "bm25_score":score,
                "relevance": relevance,
                "doc_category": doc_category,
                "query_category": category
            })

ltr_df = pd.DataFrame(ltr_data)
print(ltr_df.head(10))
ltr_df.to_csv("ltr_dataset.csv",index=False)

   query_id                  query_text doc_id  bm25_score  relevance  \
0         1  latest stock market trends    651   10.705631          1   
1         1  latest stock market trends    611   10.473111          1   
2         1  latest stock market trends    496    9.555423          1   
3         1  latest stock market trends    823    9.221897          1   
4         1  latest stock market trends    545    8.787973          1   
5         1  latest stock market trends    889    8.747120          1   
6         1  latest stock market trends     43    8.713518          0   
7         1  latest stock market trends    315    8.713518          0   
8         1  latest stock market trends    411    8.533895          1   
9         1  latest stock market trends    649    8.480694          1   

  doc_category query_category  
0     business       business  
1     business       business  
2     business       business  
3     business       business  
4     business       business  
5   

In [49]:
!pip install lightgbm
!pip install scikit_learn



In [1]:
# É treinado um modelo 'LambdaRANK' em nosso DataFrame de treinamento.
# Ele utiliza uma feature (bm-25 score) para predizer a relevância do documento
# Veja que utilizar apenas uma feature (bm-25 score) faz com que nosso modelo atue como o BM-25.
# Adicionaremos mais features para tornar o ranqueamento mais inteligente.

import lightgbm as lgb

X = ltr_df[['bm25_score']]
y = ltr_df['relevance']

num_queries = ltr_df['query_id'].nunique()
docs_per_query = 50

group_sizes = [docs_per_query] * num_queries

train_data = lgb.Dataset(X, label=y, group=group_sizes)

params = {
    "objective":"lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at":[10],
    "learning_rate": 0.05,
    "num_leaves":31,
    "min_data_in_leaf": 20,
    "verbose": -1
}

model = lgb.train(params, train_data, num_boost_round=150)
model.save_model("ltr_model.txt")

NameError: name 'ltr_df' is not defined