In [82]:
import pandas as pd

# Path of the OCR benchmark log, relative to the notebook's working directory.
file_path = "./dev_test2.txt"

# Read the whole log. The encoding is stated explicitly so the result does not
# depend on the platform's default locale.
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Tags that identify a metrics line, one per OCR engine.
engine_tags = ("[Paddle]", "[Doctr]", "[Surya]")

# Parse the log into [file, engine, CER, WER] records.
# A line starting with "./dev/pruebas" names the document that the following
# metrics lines belong to; a metrics line is expected to start with the engine
# tag and to carry "CER:<value> - WER:<value>" after it.
data = []
current_file = None

for line in lines:
    line = line.strip()
    # "./dev/pruebas" is also a prefix of "./dev/pruebas2", so one check
    # covers both folders.
    if line.startswith("./dev/pruebas"):
        current_file = line
    elif any(tag in line for tag in engine_tags):
        # Split at the first "]" only: a later bracket in the line must not
        # break the two-way unpacking.
        engine, values = line.split("]", 1)
        engine = engine[1:]  # drop the opening bracket
        cer, wer = values.replace("CER:", "").replace("WER:", "").split("-")
        data.append([current_file, engine.strip(), float(cer.strip()), float(wer.strip())])

# Long-format table: one row per (file, engine) pair.
df = pd.DataFrame(data, columns=["File", "Engine", "CER", "WER"])

# Preview the first rows.
df.head()


Unnamed: 0,File,Engine,CER,WER
0,./dev/pruebas/9175.pdf,Paddle,8.28,33.16
1,./dev/pruebas/9175.pdf,Doctr,6.33,22.78
2,./dev/pruebas/9175.pdf,Surya,1.27,5.06
3,./dev/pruebas/9080.pdf,Paddle,10.63,23.23
4,./dev/pruebas/9080.pdf,Doctr,9.23,17.17


In [84]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

# Step 1: reshape to wide format -- one row per file and one column per
# (metric, engine) pair. aggfunc='first' keeps the first value found when a
# (File, Engine) combination appears more than once.
df_pivot = pd.pivot_table(
    df,
    values=['CER', 'WER'],
    index='File',
    columns='Engine',
    aggfunc='first',
)

# Flatten the two-level (metric, engine) header into names like "CER_Paddle".
df_pivot.columns = [
    f"{metric}_{engine}".strip() for metric, engine in df_pivot.columns
]
df_pivot = df_pivot.reset_index()
df_pivot.head()

Unnamed: 0,File,CER_Doctr,CER_Paddle,CER_Surya,WER_Doctr,WER_Paddle,WER_Surya
0,./dev/pruebas/8998.pdf,8.14,11.3,2.35,27.86,33.59,7.63
1,./dev/pruebas/9000.pdf,6.9,5.9,0.78,21.52,22.85,3.64
2,./dev/pruebas/9005.pdf,3.93,4.0,0.15,17.21,15.81,0.93
3,./dev/pruebas/9006.pdf,7.1,7.22,1.1,23.45,22.76,4.14
4,./dev/pruebas/9014.pdf,6.94,11.25,8.88,24.88,34.98,18.08


### Método 1

In [80]:
# Tunable parameters of this method.
n_clusters = 6            # number of groups requested from K-means
anomaly_percentile = 95   # files beyond this distance percentile are anomalies

# Step 2: standardise the metrics engine by engine (values are only comparable
# within the same engine).
# The scaling is done on a copy: df_pivot holds the raw CER/WER values and is
# read again by other cells, so overwriting it here (or adding columns to it)
# would make those cells depend on whether this one has been run.
df_scaled = df_pivot.copy()
scaler = StandardScaler()
engines = df['Engine'].unique()
features = []

for engine in engines:
    cols = [f'CER_{engine}', f'WER_{engine}']
    if all(col in df_scaled.columns for col in cols):
        df_scaled[cols] = scaler.fit_transform(df_scaled[cols])
        features.extend(cols)

# Step 3: K-means on the standardised features. Missing values become 0, which
# is the column mean after standardisation.
X = df_scaled[features].fillna(0)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df_scaled['cluster'] = kmeans.fit_predict(X)

# Step 4: per-cluster mean/std of every (standardised) feature plus the size.
cluster_stats = df_scaled.groupby('cluster').agg({
    **{feat: ['mean', 'std'] for feat in features},
    'File': 'count'
})

# Euclidean distance of every file to the centroid of its own cluster. It is
# computed once, from the same imputed matrix K-means was fitted on, and then
# reused for both the representative files and the anomaly threshold.
df_scaled['distance_to_centroid'] = np.linalg.norm(
    X.values - kmeans.cluster_centers_[df_scaled['cluster'].values],
    axis=1,
)

# Step 5: the representative file of a cluster is the member closest to its
# centroid. groupby yields the non-empty clusters in ascending id order.
representative_files = []
for cluster_id, members in df_scaled.groupby('cluster'):
    closest = members['distance_to_centroid'].idxmin()
    representative_files.append(
        (int(cluster_id), df_scaled.loc[closest, 'File'], len(members))
    )

# Step 6: flag as anomalies the files whose distance exceeds the chosen
# percentile of all distances (the farthest 5% with the default of 95).
threshold = np.percentile(df_scaled['distance_to_centroid'], anomaly_percentile)
anomalies = df_scaled[df_scaled['distance_to_centroid'] > threshold]

# Report
print("Estadísticas de los clusters:")
print(cluster_stats)
print("\nFicheros representativos por cluster:")
for cluster_id, file, count in representative_files:
    print(f"Cluster {cluster_id}: {file} (contiene {count} ficheros)")
print("\nFicheros anómalos:")
print(anomalies[['File', 'cluster', 'distance_to_centroid']])

Estadísticas de los clusters:
        CER_Paddle           WER_Paddle           CER_Doctr            \
              mean       std       mean       std      mean       std   
cluster                                                                 
0        -0.595953  0.199281  -0.791781  0.293385 -0.485205  0.125902   
1         1.527307  0.823849   1.723090  0.602622  1.645675  0.847896   
2         3.550292  0.285367   2.644795  0.152336  4.452637  0.563874   
3        -0.123794  0.317387   0.023672  0.346552 -0.196823  0.200549   
4         2.336351  1.047582   2.241901  0.368121  1.173390  0.699432   
5         0.396589  0.405465   0.893104  0.440851  0.284053  0.393807   

        WER_Doctr           CER_Surya           WER_Surya            File  
             mean       std      mean       std      mean       std count  
cluster                                                                    
0       -0.678214  0.235603 -0.397155  0.166178 -0.516468  0.208991   213  
1       

### Método 2

In [86]:
# Imports used by this cell, gathered in one place. MinMaxScaler is only
# imported here; silhouette_score and zscore are imported but not used below.
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy.stats import zscore

# Number of groups requested from K-means.
n_clusters = 6

# Features to cluster on: only the CER_*/WER_* metric columns. Selecting them
# by name (instead of "every column except File") keeps this cell idempotent:
# the 'Cluster' / 'Distance_to_Centroid' columns it adds to df_pivot, or any
# column added by another cell, can never leak into the feature matrix.
columns_to_normalize = [
    col for col in df_pivot.columns if col.startswith(('CER_', 'WER_'))
]

# Min-max scale every metric column independently, on a copy of the frame.
df_normalized = df_pivot.copy()
scaler = MinMaxScaler()
df_normalized[columns_to_normalize] = scaler.fit_transform(df_pivot[columns_to_normalize])

# Feature matrix for the clustering (scaled metrics only).
X = df_normalized[columns_to_normalize].values

# K-means with a fixed seed so the labels are reproducible.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)

# Attach the cluster label to the frame that holds the unscaled metrics.
df_pivot['Cluster'] = clusters

# Per-cluster statistics of the metrics as stored in df_pivot.
cluster_stats = df_pivot.groupby('Cluster').agg({
    'CER_Paddle': ['mean', 'min', 'max', 'count'],
    'CER_Doctr': ['mean', 'min', 'max'],
    'CER_Surya': ['mean', 'min', 'max'],
    'WER_Paddle': ['mean', 'min', 'max'],
    'WER_Doctr': ['mean', 'min', 'max'],
    'WER_Surya': ['mean', 'min', 'max']
})

# Euclidean distance of every file to the centroid of its own cluster,
# computed once and reused for the representatives and the anomaly threshold.
distances_to_centroid = np.linalg.norm(X - kmeans.cluster_centers_[clusters], axis=1)
df_pivot['Distance_to_Centroid'] = distances_to_centroid

# Representative file of each cluster: the member closest to the centroid.
representative_files = []
for i in range(n_clusters):
    member_idx = np.flatnonzero(clusters == i)  # positional rows of cluster i
    closest = member_idx[np.argmin(distances_to_centroid[member_idx])]
    representative_files.append(df_pivot['File'].iloc[closest])

# Anomalies: files farther from their centroid than mean + 2 standard
# deviations of all distances.
threshold = np.mean(distances_to_centroid) + 2 * np.std(distances_to_centroid)
anomalies = df_pivot[df_pivot['Distance_to_Centroid'] > threshold]


def format_range(column):
    """Return one "min-max" label (two decimals) per cluster for `column`."""
    stats = cluster_stats[column]
    return [
        f"{stats.loc[i, 'min']:.2f}-{stats.loc[i, 'max']:.2f}"
        for i in range(n_clusters)
    ]


# Final summary: size, representative file and metric ranges per cluster.
# The range columns are generated in the order CER (Paddle, Doctr, Surya)
# followed by WER (Paddle, Doctr, Surya).
cluster_summary = pd.DataFrame({
    'Cluster': range(n_clusters),
    'Size': np.bincount(clusters, minlength=n_clusters),
    'Representative_File': representative_files,
    **{
        f'{engine}_{metric}_Range': format_range(f'{metric}_{engine}')
        for metric in ('CER', 'WER')
        for engine in ('Paddle', 'Doctr', 'Surya')
    },
})

In [88]:
# Display the per-cluster summary table (size, representative file and CER/WER ranges).
cluster_summary

Unnamed: 0,Cluster,Size,Representative_File,Paddle_CER_Range,Doctr_CER_Range,Surya_CER_Range,Paddle_WER_Range,Doctr_WER_Range,Surya_WER_Range
0,0,196,./dev/pruebas2/9267.pdf,3.08-16.60,1.14-12.59,0.15-18.00,9.88-37.43,5.32-30.18,0.93-24.16
1,1,18,./dev/pruebas2/9361.pdf,13.66-51.67,8.14-38.28,15.43-57.59,36.68-75.95,25.76-73.90,34.43-76.70
2,2,160,./dev/pruebas2/9387.pdf,7.71-28.21,3.94-22.07,0.86-27.85,28.89-52.27,17.54-51.85,3.82-44.64
3,3,22,./dev/pruebas2/9353.pdf,39.53-75.85,34.50-75.88,32.61-76.25,76.10-97.20,73.17-97.57,64.99-97.01
4,4,19,./dev/pruebas2/9060.pdf,32.70-89.04,8.93-41.65,5.32-27.17,71.15-96.84,29.39-87.72,14.07-59.26
5,5,75,./dev/pruebas2/9308.pdf,13.23-40.71,5.54-28.29,1.32-25.06,41.73-76.27,19.04-66.29,6.67-39.44


In [90]:
# Display the files flagged as anomalies (distance to their centroid above the threshold).
anomalies

Unnamed: 0,File,CER_Doctr,CER_Paddle,CER_Surya,WER_Doctr,WER_Paddle,WER_Surya,Cluster,Distance_to_Centroid
61,./dev/pruebas/9456.pdf,8.14,13.66,50.69,25.76,36.68,53.71,1,0.539293
89,./dev/pruebas2/8986.pdf,34.58,40.0,32.61,91.33,92.88,91.33,3,0.611515
102,./dev/pruebas2/9001.pdf,34.77,65.0,27.17,72.39,93.6,59.26,4,0.377618
113,./dev/pruebas2/9015.pdf,13.92,51.67,50.13,39.78,74.45,61.86,1,0.406834
144,./dev/pruebas2/9051.pdf,41.06,40.73,53.37,83.33,88.24,81.86,3,0.420296
148,./dev/pruebas2/9057.pdf,46.24,39.53,44.7,87.44,83.57,85.51,3,0.439113
152,./dev/pruebas2/9062.pdf,34.5,72.24,33.42,86.67,93.33,73.33,3,0.542685
180,./dev/pruebas2/9094.pdf,15.43,18.52,20.27,41.88,53.75,65.0,1,0.321842
189,./dev/pruebas2/9106.pdf,50.2,55.15,47.34,73.17,76.1,64.99,3,0.417459
224,./dev/pruebas2/9147.pdf,8.93,76.6,5.32,29.39,81.68,18.32,4,0.587135
