• tr_id: identificador único do traceroute;

• tr_src: identificador único de máquina de origem;

• tr_dst: identificador único de máquina de destino (máquina do m-lab);

• all_rtts: lista de valores em ponto flutuante, em milissegundos (ms), representando o tempo de ida e volta (RTT – Round Trip Time) de cada pacote no último hop;

• tr_attempts: número de tentativas feitas por probe para alcançar o último hop
durante o teste de traceroute;

• total_probes_sent: total de pacotes enviados durante o teste de traceroute para o último hop;

• total_replies_last_hop: número de respostas recebidas do último hop (salto final) durante o teste;

• date_index: indice da data (1 representa que é o primeiro dia registrado, 2 o segundo, etc.);

• seconds_since_start: tempo desde a primeira medição do dataset, contado em segundos.

• route_changed: indicador booleano (True ou False - 1 ou 0, respectivamente) que
informa se houve alteração na rota durante o teste

In [46]:
import pandas as pd
import numpy as np
import ast
from typing import List, Tuple

class DataChallengeProcessor:
    """
    Classe para processar os dados do Data Challenge de detecção de mudanças de rota.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.df = None

    def load_data(self, nrows) -> pd.DataFrame:
        print(f"Carregando dados de {self.file_path}...")
        self.df = pd.read_csv(self.file_path, nrows=nrows)
        print(f"Dados carregados: {self.df.shape[0]:,} linhas, {self.df.shape[1]} colunas")
        print(f"Colunas: {list(self.df.columns)}")

        return self.df

    def sanity_deduplication(self):
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=self.df.columns, keep='first')
        print(f"Removed {before - len(self.df)} exact duplicate rows.")
        return self.df

    def parse_rtts(self) -> pd.DataFrame:
        print("Processando coluna all_rtts...")

        self.df['all_rtts_list'] = self.df['all_rtts'].apply(
            lambda x: ast.literal_eval(x) if pd.notna(x) else []
        )
        self.df['rtt_mean'] = self.df['all_rtts_list'].apply(
            lambda x: np.mean(x) if x else np.nan
        )
        self.df['rtt_std'] = self.df['all_rtts_list'].apply(
            lambda x: np.std(x) if x else np.nan
        )
        self.df['rtt_min'] = self.df['all_rtts_list'].apply(
            lambda x: np.min(x) if x else np.nan
        )
        self.df['rtt_max'] = self.df['all_rtts_list'].apply(
            lambda x: np.max(x) if x else np.nan
        )
        self.df["rtt_range"] = self.df["rtt_max"] - self.df["rtt_min"]
        self.df['rtt_count'] = self.df['all_rtts_list'].apply(len)

        self.df = self.df.drop(['all_rtts', 'all_rtts_list', 'rtt_min', 'rtt_max'], axis=1).reset_index(drop=True)
        print("RTTs processados com sucesso!")
        return self.df

    def create_other_features(self) -> pd.DataFrame:
        print("Criando features...")

        self.df = self.df.sort_values(['tr_src', 'tr_dst', 'seconds_since_start']).reset_index(drop=True)

        self.df['time_since_last'] = self.df.groupby(['tr_src', 'tr_dst'])['seconds_since_start'].diff()
        self.df['reply_ratio'] = self.df['total_replies_last_hop'] / self.df['total_probes_sent'].replace(0, np.nan)
        self.df['probe_efficiency'] = self.df['total_replies_last_hop'] / self.df['tr_attempts'].replace(0, np.nan)
        self.df['pair_id'] = self.df['tr_src'] + self.df['tr_dst']

        self.df = self.df.drop(['total_probes_sent', 'total_replies_last_hop',
                                'seconds_since_start', 'tr_attempts',
                                'tr_src', 'tr_dst'], axis=1)

        print("Features criadas!")
        return self.df

    def handle_nan(self) -> pd.DataFrame:
        print("Tratando valores NaN...")
        initial_nan_count = self.df.isnull().sum().sum()
        self.df = self.df.fillna(0)
        print(f"Substituídos {initial_nan_count:,} valores NaN por zero.")
        return self.df

    def analyze_dataset(self) -> dict:
        print("\n" + "="*60)
        print("ANÁLISE EXPLORATÓRIA DOS DADOS")
        print("="*60)

        stats = {}
        stats['total_records'] = len(self.df)
        stats['total_pairs'] = self.df[['pair_id']].drop_duplicates().shape[0]

        if 'route_changed' in self.df.columns:
            target_dist = self.df['route_changed'].value_counts()
            stats['route_changes'] = target_dist.get(1, 0)
            stats['no_changes'] = target_dist.get(0, 0)
            stats['change_rate'] = stats['route_changes'] / len(self.df)

        if 'rtt_mean' in self.df.columns:
            stats['avg_rtt'] = self.df['rtt_mean'].mean()
            stats['std_rtt'] = self.df['rtt_mean'].std()

        print(f"Total de registros: {stats['total_records']:,}")
        print(f"Pares origem-destino únicos: {stats['total_pairs']}")

        if 'route_changes' in stats:
            print(f"\nMudanças de rota: {stats['route_changes']:,} ({stats['change_rate']*100:.2f}%)")
            print(f"Sem mudanças: {stats['no_changes']:,} ({(1-stats['change_rate'])*100:.2f}%)")
            print(f"scale_pos_weight = {stats['no_changes']/(stats['route_changes'] +  + 1e-9):.2f}")

        if 'avg_rtt' in stats:
            print(f"\nRTT médio: {stats['avg_rtt']:.2f} \u00b1 {stats['std_rtt']:.2f} ms")

        return stats

    def run_eda(self, train_mode: bool=False, nrows: int=1000) -> pd.DataFrame:
        """
        Executa o pipeline completo de EDA.
        """
        self.load_data(nrows)
        original_cols = self.df.columns.tolist()

        if train_mode:
            self.sanity_deduplication()

        self.parse_rtts()
        self.create_other_features()
        self.handle_nan()

        float_cols = self.df.select_dtypes(include=['float64']).columns
        self.df[float_cols] = self.df[float_cols].round(2)

        self.analyze_dataset()
        return self.df, original_cols

In [47]:
from pathlib import Path

DRIVE_PATH = "/content/drive/MyDrive/UFF/Disciplinas/tratamento-incertezas"
train_file = Path(DRIVE_PATH, "train.csv")
test_file = Path(DRIVE_PATH, "test.csv")
train_new_features = Path(DRIVE_PATH, "train_new_features.csv")
test_new_features = Path(DRIVE_PATH, "test_new_features.csv")
nsample = 10000000

In [48]:
print("Data Challenge 2025 - RNP")
print("Detecção de Mudanças de Rota baseada em RTT")
print("="*50)

# Inicializa processador
processor = DataChallengeProcessor(train_file)

# Processa uma amostra primeiro (para testes)
print(f"Processando arquivo csv...")
df_sample, original_cols = processor.run_eda(train_mode=True, nrows=nsample)

print("\nProcessamento concluído!")
print(f"Dataset final: {df_sample.shape}")
print(f"Features disponíveis: {df_sample.shape[1]}")

# Lista features criadas
print(f"\nFeatures criadas:")

new_features = [col for col in df_sample.columns if col not in original_cols]
for feature in new_features:
    print(f"  - {feature}")

df_sample.to_csv(train_new_features, index=False)

Data Challenge 2025 - RNP
Detecção de Mudanças de Rota baseada em RTT
Processando arquivo csv...
Carregando dados de /content/drive/MyDrive/UFF/Disciplinas/tratamento-incertezas/train.csv...
Dados carregados: 10,000,000 linhas, 10 colunas
Colunas: ['tr_id', 'tr_src', 'tr_dst', 'all_rtts', 'tr_attempts', 'total_probes_sent', 'total_replies_last_hop', 'route_changed', 'date_index', 'seconds_since_start']
Removed 0 exact duplicate rows.
Processando coluna all_rtts...
RTTs processados com sucesso!
Criando features...
Features criadas!
Tratando valores NaN...
Substituídos 173 valores NaN por zero.

ANÁLISE EXPLORATÓRIA DOS DADOS
Total de registros: 10,000,000
Pares origem-destino únicos: 109

Mudanças de rota: 205,935 (2.06%)
Sem mudanças: 9,794,065 (97.94%)
scale_pos_weight = 47.56

RTT médio: 124.56 ± 78.10 ms

Processamento concluído!
Dataset final: (10000000, 11)
Features disponíveis: 11

Features criadas:
  - rtt_mean
  - rtt_std
  - rtt_range
  - rtt_count
  - time_since_last
  - repl

In [49]:
df_sample.head(20)

Unnamed: 0,tr_id,route_changed,date_index,rtt_mean,rtt_std,rtt_range,rtt_count,time_since_last,reply_ratio,probe_efficiency,pair_id
0,0,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
1,1,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
2,2,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
3,3,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
4,4,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
5,5,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
6,6,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
7,7,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
8,8,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0
9,9,0,1,210.09,0.0,0.0,1,0.0,1.0,0.33,0


In [50]:
df_sample.tail(20)

Unnamed: 0,tr_id,route_changed,date_index,rtt_mean,rtt_std,rtt_range,rtt_count,time_since_last,reply_ratio,probe_efficiency,pair_id
9999980,3262753,0,24,106.34,2.3,6.25,6,0.0,1.0,2.0,124
9999981,3262754,0,24,106.34,2.3,6.25,6,0.0,1.0,2.0,124
9999982,3262755,0,24,106.34,2.3,6.25,6,0.0,1.0,2.0,124
9999983,3262756,0,24,106.34,2.3,6.25,6,0.0,1.0,2.0,124
9999984,3469868,0,25,105.94,1.51,4.19,6,53640.0,1.0,2.0,124
9999985,3469869,0,25,105.94,1.51,4.19,6,0.0,1.0,2.0,124
9999986,3469870,0,25,105.94,1.51,4.19,6,0.0,1.0,2.0,124
9999987,3469871,0,25,105.94,1.51,4.19,6,0.0,1.0,2.0,124
9999988,3469872,0,25,105.94,1.51,4.19,6,0.0,1.0,2.0,124
9999989,3469873,0,25,105.94,1.51,4.19,6,0.0,1.0,2.0,124


In [51]:
print("Data Challenge 2025 - RNP")
print("Detecção de Mudanças de Rota baseada em RTT")
print("="*50)

processor = DataChallengeProcessor(test_file)

print(f"Processando arquivo csv...")
df_sample_test, original_cols = processor.run_eda(nrows=nsample)

print("\nProcessamento concluído!")
print(f"Dataset final: {df_sample_test.shape}")
print(f"Features disponíveis: {df_sample_test.shape[1]}")

# Lista features criadas
print(f"\nFeatures criadas:")

new_features = [col for col in df_sample_test.columns if col not in original_cols]
for feature in new_features:
    print(f"  - {feature}")

df_sample_test.to_csv(test_new_features, index=False)

Data Challenge 2025 - RNP
Detecção de Mudanças de Rota baseada em RTT
Processando arquivo csv...
Carregando dados de /content/drive/MyDrive/UFF/Disciplinas/tratamento-incertezas/test.csv...
Dados carregados: 8,556,497 linhas, 9 colunas
Colunas: ['tr_id', 'tr_src', 'tr_dst', 'all_rtts', 'tr_attempts', 'total_probes_sent', 'total_replies_last_hop', 'date_index', 'seconds_since_start']
Processando coluna all_rtts...
RTTs processados com sucesso!
Criando features...
Features criadas!
Tratando valores NaN...
Substituídos 195 valores NaN por zero.

ANÁLISE EXPLORATÓRIA DOS DADOS
Total de registros: 8,556,497
Pares origem-destino únicos: 114

RTT médio: 123.11 ± 77.08 ms

Processamento concluído!
Dataset final: (8556497, 10)
Features disponíveis: 10

Features criadas:
  - rtt_mean
  - rtt_std
  - rtt_range
  - rtt_count
  - time_since_last
  - reply_ratio
  - probe_efficiency
  - pair_id


In [52]:
df_sample_test.head(20)

Unnamed: 0,tr_id,date_index,rtt_mean,rtt_std,rtt_range,rtt_count,time_since_last,reply_ratio,probe_efficiency,pair_id
0,25503974,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
1,25503975,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
2,25503976,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
3,25503977,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
4,25503978,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
5,25503979,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
6,25503980,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
7,25503981,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
8,25503982,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
9,25503983,8,199.9,0.0,0.0,1,0.0,1.0,0.33,0
