In [None]:
import pandas as pd
import os
from google.colab import files

def analyze_smell_file(file_path):
    """Summarize one data-smell detection results CSV.

    The CSV is read with pandas and must provide the columns
    ``Smell Type``, ``Column Name``, ``Faulty Element Count`` and
    ``Total Element Count``. Rows whose smell type is
    ``Duplicated Value Smell`` are excluded from every statistic.

    Args:
        file_path: Path of the CSV file to analyze.

    Returns:
        A dict with the fixed keys ``Dataset``, ``Total_Smells``,
        ``Unique_Columns_Affected``, ``Sum_Faulty_Elements``,
        ``Sum_Total_Elements``, ``Avg%Faulty_Elements`` and ``Smell_Types``,
        plus one ``Smell_Count:<type>`` and one ``Faulty_Elements:<type>``
        entry per smell type found (spaces in the type replaced by ``_``).
    """
    df = pd.read_csv(file_path)

    # Exclude the Duplicated Value Smell rows.
    df = df[df['Smell Type'] != 'Duplicated Value Smell']

    # Derive the dataset name from the file name. Only a leading
    # "detection_results_" and a trailing ".csv" are stripped, so the same
    # substrings occurring elsewhere in the name are left untouched.
    dataset_name = os.path.basename(file_path)
    prefix = 'detection_results_'
    suffix = '.csv'
    if dataset_name.startswith(prefix):
        dataset_name = dataset_name[len(prefix):]
    if dataset_name.endswith(suffix):
        dataset_name = dataset_name[:-len(suffix)]

    # General statistics.
    smell_count = int(len(df))
    unique_columns = int(df['Column Name'].nunique())
    total_faulty = int(df['Faulty Element Count'].sum())
    total_elements = int(df['Total Element Count'].sum())

    # Mean faulty ratio, expressed as a percentage with two decimals.
    avg_faulty_percentage = 0.0
    if len(df) > 0:
        totals = df['Total Element Count']
        # Turn zero totals into NaN so the division yields NaN instead of
        # inf (or NaN for 0/0); mean() skips NaN, so such rows simply do
        # not take part in the average.
        ratios = df['Faulty Element Count'] / totals.where(totals != 0)
        mean_ratio = ratios.mean()
        # No usable row at all -> keep the 0.0 default rather than NaN.
        if not pd.isna(mean_ratio):
            avg_faulty_percentage = round(mean_ratio * 100, 2)

    # Smell types present (duplicates already excluded above).
    smell_types = df['Smell Type'].dropna().unique()
    smell_types_str = "; ".join(sorted(smell_types))

    summary = {
        'Dataset': dataset_name,
        'Total_Smells': smell_count,
        'Unique_Columns_Affected': unique_columns,
        'Sum_Faulty_Elements': total_faulty,
        'Sum_Total_Elements': total_elements,
        'Avg%Faulty_Elements': avg_faulty_percentage,
        'Smell_Types': smell_types_str
    }

    # Per-smell-type breakdown.
    for smell_type in smell_types:
        smell_df = df[df['Smell Type'] == smell_type]
        clean_name = smell_type.replace(" ", "_")
        summary[f'Smell_Count:{clean_name}'] = int(len(smell_df))
        summary[f'Faulty_Elements:{clean_name}'] = int(smell_df['Faulty Element Count'].sum())

    return summary

# Ask the user to upload the CSV result files through the Colab widget.
uploaded = files.upload()

# Build one summary record per uploaded file.
summaries = [analyze_smell_file(file_name) for file_name in uploaded.keys()]

# Assemble the records into a table; keys missing from a record become NaN.
summary_df = pd.DataFrame(summaries)

# Replace NaN and enforce a dtype per column:
#   - 'Dataset' is left as is,
#   - 'Smell_Types' is text (NaN -> ''),
#   - 'Avg%Faulty_Elements' is float (NaN -> 0.0),
#   - every other column is an integer counter (NaN -> 0).
for column in summary_df.columns:
    if column == 'Dataset':
        continue
    values = summary_df[column]
    if column == 'Smell_Types':
        summary_df[column] = values.fillna('')
    elif column == 'Avg%Faulty_Elements':
        summary_df[column] = values.fillna(0.0).astype(float)
    else:
        summary_df[column] = values.fillna(0).astype(int)

# Normalize column names: spaces become underscores.
summary_df = summary_df.rename(columns=lambda name: name.replace(" ", "_"))

# Write the table to disk and hand it to the browser for download.
output_path = 'summary_cleaned.csv'
summary_df.to_csv(output_path, index=False)
files.download(output_path)



Saving adult.csv to adult (1).csv
Saving bank-full.csv to bank-full (1).csv
Saving diabetic.csv to diabetic (1).csv
Saving fpes.csv to fpes (1).csv
Saving german.csv to german (1).csv
Saving speed_dating.csv to speed_dating (1).csv
Saving heart_disease_sp.csv to heart_disease_sp (1).csv
Saving student-por_sp.csv to student-por_sp (1).csv
Saving detection_results_compas.csv to detection_results_compas (1).csv
Saving detection_results_mep.csv to detection_results_mep (1).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>