In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from google.colab import files
import io

# Upload file: open the Colab upload widget and load the uploaded CSV.
uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. the dialog was cancelled) would otherwise surface
    # as a bare StopIteration from next(iter(...)) below.
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))
# The uploaded content is wrapped in an in-memory binary buffer for read_csv.
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Identify numeric and categorical columns.
# Numeric: every column stored as int64 or float64.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Categorical: object-dtype columns whose name does not start with "num_"
# (case-insensitive).
# NOTE(review): object columns named "num_*" end up in neither list — confirm
# that excluding them from the output is intended.
categorical_cols = [
    name
    for name, kind in df.dtypes.items()
    if kind == "object" and not name.lower().startswith("num_")
]

# Numeric columns: replace missing values with the column mean.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode into a dense array. Categories unseen during fit are
# ignored at transform time instead of raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own pipeline. The names 'num' and 'cat'
# are the keys under which the fitted pipelines can be looked up afterwards.
# NOTE(review): no `remainder` is given, so columns in neither list are
# presumably dropped from the output — confirm this is intended.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Apply the preprocessing: fit on the uploaded data and transform it.
df_processed = preprocessor.fit_transform(df)

# Get the transformed column names from the fitted pipelines themselves rather
# than assuming the input names survive unchanged:
#   * a pipeline whose column list is empty is never fitted, so asking it for
#     feature names would raise NotFittedError — skip it instead;
#   * an imputer may drop columns that have no observed values at all, so the
#     original column list can be longer than the transformed output.
if numerical_cols:
    num_features = list(
        preprocessor.named_transformers_['num'].get_feature_names_out(numerical_cols)
    )
else:
    num_features = []

if categorical_cols:
    cat_features = list(
        preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    )
else:
    cat_features = []

# Output order matches the ColumnTransformer: numeric block first, then the
# one-hot encoded categorical block.
all_features = num_features + cat_features

# Build the final DataFrame.
df_encoded = pd.DataFrame(df_processed, columns=all_features)

# Save the cleaned dataset as CSV (without the row index) and hand the file
# to the Colab download helper.
output_file = "clean_dataset.csv"
df_encoded.to_csv(path_or_buf=output_file, index=False)
files.download(output_file)
