In [1]:
import os, sys
import argparse

import numpy as np
import pandas as pd
from azureml.core import Run

import logging

In [None]:
# Columns that `create_one_urology` reads before it renames anything.
_REQUIRED_UROLOGY_COLUMNS = (
    "episode",
    "vhl",
    "ANM.1.vonHippelLindau",
    "ANM.1.kidneyType",
    "ANM.1.therapyNeoadjuvant",
    "ANM.1.examEmCreatininemiaUm",
    "ANM.1.examEmPiastrine",
    "ANM.1.examEmPiastrineUm",
    "INT.1.kidney1IschemiaType",
    "IST.1.kidney1TumorDimension",
)

# Ordered (value, replacement) pairs applied one after another to the platelet
# unit column. Each pair is a whole-value replacement (no regex). Order
# matters: "x10^9" first becomes "10^9" and is then caught by the later
# "10^9" -> "10^9/L" rule, so every listed spelling ends up as "10^9/L".
# NOTE(review): "U/L" and "mg/dL" are relabelled without touching the numeric
# value - presumably data-entry mistakes; confirm with the data owners.
_PLATELET_UNIT_FIXES = (
    ("x10^9", "10^9"),
    ("x10^9/L", "10^9/L"),
    ("10^9", "10^9/L"),
    ("10^)/L", "10^9/L"),
    ("10ì9/L", "10^9/L"),
    ("U/L", "10^9/L"),
    ("1000/ul", "10^9/L"),
    ("1000/mm3", "10^9/L"),
    ("mg/dL", "10^9/L"),
)


def create_one_urology(df):
    """Clean the raw urology table and narrow it down to the study cohort.

    The steps run in this order:

    1. Remove every run of whitespace inside string cells, then turn empty
       strings, ``"NA"`` and ``"NaN"`` into ``np.nan``.
    2. Drop rows where ``episode`` is ``"metachronous"``, ``vhl`` equals 1,
       ``ANM.1.vonHippelLindau`` is True, ``ANM.1.kidneyType`` is
       ``"BilateraleSincrono"`` or ``ANM.1.therapyNeoadjuvant`` is True.
    3. Normalise the creatinine unit spelling (``"mg/dl"`` -> ``"mg/dL"``),
       cast ``ANM.1.examEmPiastrine`` to float32 and rewrite the known
       spellings of its unit column to ``"10^9/L"``.
    4. Drop rows whose ``INT.1.kidney1IschemiaType`` is ``"TotaleFredda"`` or
       whose ``IST.1.kidney1TumorDimension`` is missing.
    5. Replace ``.`` with ``_`` in every column name.
    6. Keep only rows whose ``ANM_1_metastasis`` equals ``"No"``; rows with a
       missing value in that column are dropped as well.

    The frame's shape is printed after each filtering stage. The row index is
    not reset, so surviving rows keep their original labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It must contain the columns listed in
        ``_REQUIRED_UROLOGY_COLUMNS`` plus the metastasis column, either as
        ``ANM.1.metastasis`` or already renamed to ``ANM_1_metastasis``.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with underscore-separated column names. The
        frame passed in is not modified.

    Raises
    ------
    KeyError
        If any required column is absent. All missing names are reported
        together, before any cleaning work or printing happens.
    ValueError
        Propagated from the float32 cast when a platelet value that is still
        present at that point cannot be converted.
    """
    # Fail fast: the whole-frame regex pass below is expensive on a wide
    # table, and a late KeyError would leave partial progress output behind.
    missing = [name for name in _REQUIRED_UROLOGY_COLUMNS if name not in df.columns]
    if "ANM.1.metastasis" not in df.columns and "ANM_1_metastasis" not in df.columns:
        missing.append("ANM.1.metastasis")
    if missing:
        raise KeyError(f"create_one_urology: missing required column(s): {missing}")

    # Normalise missing-value markers. Whitespace is stripped first so that
    # blank-only cells collapse to "" and are caught by the next replacement.
    df = df.replace(r"\s+", "", regex=True)
    # NOTE(review): regex=True is kept exactly as before. The recorded run
    # shows later string filters still matching, so string cells are evidently
    # not all blanked by the empty pattern - confirm the flag is needed.
    df = df.replace("", np.nan, regex=True)
    for marker in ("NA", "NaN"):
        df = df.replace(marker, np.nan)

    # Exclude metachronous episodes and von Hippel-Lindau patients.
    print(f"Original shape: {df.shape}")
    df = df[df["episode"] != "metachronous"]
    print(f"Shape without metachronous: {df.shape}")

    df = df[df["vhl"] != 1]
    # Element-wise comparison on a Series; rows with a missing flag are kept.
    df = df[df["ANM.1.vonHippelLindau"] != True]
    print(f"Shape without von Hippel Lindau {df.shape}")

    df = df[df["ANM.1.kidneyType"] != "BilateraleSincrono"]
    print(f"Shape without BilateraleSincrono {df.shape}")

    df = df[df["ANM.1.therapyNeoadjuvant"] != True]
    print(f"Shape without Therapy Neoadjuvant {df.shape}")

    # Harmonise the spelling of the creatinine unit.
    # NOTE(review): the previous comment here mentioned a patient whose
    # haemoglobin unit is g/L; nothing in this function handles that case -
    # confirm whether it still needs attention.
    df["ANM.1.examEmCreatininemiaUm"] = df["ANM.1.examEmCreatininemiaUm"].replace(
        "mg/dl", "mg/dL"
    )

    # Platelet count to float32, then collapse its unit spellings.
    df["ANM.1.examEmPiastrine"] = df["ANM.1.examEmPiastrine"].values.astype("float32")
    for found, wanted in _PLATELET_UNIT_FIXES:
        df["ANM.1.examEmPiastrineUm"] = df["ANM.1.examEmPiastrineUm"].replace(
            found, wanted
        )

    df = df[df["INT.1.kidney1IschemiaType"] != "TotaleFredda"]
    print(f"Shape without ischemiaType TotaleFredda {df.shape}")

    df = df[~df["IST.1.kidney1TumorDimension"].isna()]
    print(f"Shape without missing tumor dimension {df.shape}")

    # Dots in column names become underscores.
    df = df.rename(columns={name: name.replace(".", "_") for name in df.columns})

    # Equality with "No" also removes rows where the value is missing.
    df = df[df["ANM_1_metastasis"] == "No"]
    print(f"Shape without metastatic {df.shape}")
    return df

In [9]:
import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build a workspace client from the local Azure ML config file, authenticating
# with whatever DefaultAzureCredential resolves to in this environment.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())
# Fetch the data asset named "urologia", pinned to version "1".
data_asset = ml_client.data.get("urologia", version="1")

# Load the asset as an MLTable, addressed through its asset id.
tbl = mltable.load(f"azureml:/{data_asset.id}")

# Materialise the table as a pandas DataFrame; `df` is what the next cell
# passes to create_one_urology.
df = tbl.to_pandas_dataframe()
# Bare expression: renders the (truncated) frame as the cell output.
# NOTE(review): the rendered rows are patient-level records - consider
# `df.head()` and clearing the output before sharing the notebook.
df

Found the config file in: /config.json


Unnamed: 0,Column1,P.1.id,P.1.sex,P.1.exitusState,P.1.exitusDate,P.1.exitusCause,P.1.exitusCauseSpecific,P.1.exitusCauseConcurrent,P.1.exitusSurvivalTimeProstate,P.1.exitusSurvivalTimeKidney,...,gfr.96.EPI,gfr.96.BIS,gfr.96,gfr.108.EPI,gfr.108.BIS,gfr.108,gfr.120.EPI,gfr.120.BIS,gfr.120,yob
0,1,1,M,Si,13512960000,,...,...,,198,...,,,,,,,,,,1918
1,2,2,F,Si,13081824000,Altra causa,Osseocitoma e Leiomioma Uterino ...,"Carcinosi Peritoneali, Cachessia ...",,13,...,,,,,,,,,,1945
2,3,4,M,Si,13340160000,,...,...,,87,...,,,,,,,,,,1925
3,4,5,F,No,,,...,...,,,...,,,,,,,67.0108377500802,66.7416388846908,67.01,1949
4,5,6,M,No,,,...,...,,,...,81.2096919585057,78.8576090569383,81.21,,,,,,,1952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,3792,2030537,F,No,,,...,...,,,...,,,,,,,,,,1949
3792,3793,2030538,F,No,,,...,...,,,...,,,,,,,,,,1954
3793,3794,2030554,F,,,,...,...,,,...,,,,,,,,,,1970
3794,3795,2030579,M,No,,,...,...,,,...,,,,,,,,,,1980


In [10]:
# Run the cleaning / cohort-selection function on the loaded frame. It prints
# the frame's shape after each filtering step and returns the filtered frame.
# NOTE(review): the saved output of this cell contains a line ("Converting
# piastrine to float32") that the current function never prints, so the output
# predates the current definition - re-run with Restart Kernel -> Run All.
new_df = create_one_urology(df)

  df = df.replace("", np.nan, regex=True)
  df = df.replace("NA", np.nan)


Original shape: (3796, 7049)
Shape without metachronous: (3714, 7049)
Shape without von Hippel Lindau (3703, 7049)
Shape without BilateraleSincrono (3562, 7049)
Shape without Therapy Neoadjuvant (3562, 7049)
Converting piastrine to float32
Shape without ischemiaType TotaleFredda (3549, 7049)
Shape without missing tumor dimension (3402, 7049)
Shape without metastatic (3081, 7049)
