# Notebook 3: Funções de Transformação e Limpeza de Dados

## Introdução
Neste notebook, iremos implementar as transformações e limpar os dados da tabela 'Drug' do conjunto de dados FAERS, com base na análise exploratória realizada no notebook anterior (Exploratory_Data_Analysis_Drug). 

## Estrutura do notebook
- Importação de Bibliotecas
- Carregamento dos Dados (Parquet Raw) usando o Schema definido 
- Aplicar apenas as Transformações necessárias
- Salvar **Parquet Final** com os Dados Limpos


In [0]:
# Importar bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, DataType
from pyspark.sql.functions import col, count, when, regexp_replace, concat, substring, lit, coalesce, initcap, lower, to_date, date_format


In [0]:
%run "../Funcoes_auxiliares" 

In [0]:
from pyspark.sql.types import *

# Declare the schema of the raw FAERS 'Drug' table and load the raw Parquet data.
# Each entry is (column name, Spark type, nullable, description); the description
# is attached to the corresponding field as metadata.
drug_columns = [
    ('primaryid', LongType(), False, 'Unique number for identifying a FAERS report'),
    ('caseid', IntegerType(), False, 'Number for identifying a FAERS case.'),
    ('drug_seq', IntegerType(), True, 'Unique number for identifying a drug for a Case.'),
    ('role_cod', StringType(), True, 'Code for drugs reported role in event.'),
    ('drugname', StringType(), True, 'Name of medicinal product.'),
    ('prod_ai', StringType(), True, 'Product Active Ingredient.'),
    ('val_vbm', IntegerType(), True, 'Code for source of drugname. 1- validated, 2- verbatim text'),
    ('route', StringType(), True, 'The route of drug administration.'),
    ('dose_vbm', StringType(), True, 'Verbatim text for dose, frequency, and route, exactly as entered on report.'),
    ('cum_dose_chr', StringType(), True, 'Cumulative dose to first reaction.'),
    ('cum_dose_unit', StringType(), True, 'Cumulative dose to first reaction unit.'),
    ('dechal', StringType(), True, 'Dechallenge code.'),
    ('rechal', StringType(), True, 'Rechallenge code.'),
    ('lot_num', StringType(), True, 'Lot number of the drug.'),
    # DateType (a concrete type) instead of DataType, which is only the
    # abstract base class of all Spark SQL types.
    ('exp_dt', DateType(), True, 'Expiration date of the drug.'),
    ('nda_num', LongType(), True, 'NDA number.'),
    ('dose_amt', DoubleType(), True, 'Amount of drug reported.'),
    ('dose_unit', StringType(), True, 'Unit of drug dose.'),
    ('dose_form', StringType(), True, 'Form of dose reported.'),
    ('dose_freq', StringType(), True, 'Frequency.'),
]

schema_drug = StructType([
    StructField(name, data_type, nullable, {'description': description})
    for name, data_type, nullable, description in drug_columns
])

# NOTE(review): `DataFrameReader.parquet` appears to honour only a fixed set of
# keyword options, so the `schema=` keyword below is probably not applied to the
# read - confirm, and use `spark.read.schema(...)` if enforcement is intended.
df_drug = spark.read.parquet('dbfs:/FileStore/FAERS-grupo-4/drug_raw', schema=schema_drug)


In [0]:
# Fill missing Role_Cod values with "SS" (the mode of the variable).
role_cod_with_default = coalesce(col("Role_Cod"), lit("SS"))
df_drug = df_drug.withColumn("Role_Cod", role_cod_with_default)

In [0]:

# Replace the ROLE_COD codes with descriptive labels; any value that is not one
# of the listed codes becomes "Unknown".
role_labels = {
    "PS": "Primary Suspect Drug",
    "SS": "Secondary Suspect Drug",
    "C": "Concomitant",
    "I": "Interaction",
}

role_expr = None
for role_code, role_label in role_labels.items():
    matches_code = col("ROLE_COD") == role_code
    if role_expr is None:
        role_expr = when(matches_code, role_label)
    else:
        role_expr = role_expr.when(matches_code, role_label)

df_drug = df_drug.withColumn("ROLE_COD", role_expr.otherwise("Unknown"))


In [0]:
# Discard rows without an active ingredient (prod_ai is null) and rows whose
# val_vbm equals 2. The filter is written as the complementary "keep" condition.
# NOTE(review): rows with a null val_vbm are presumably dropped as well, since
# the comparison evaluates to null for them - confirm that this is intended.
has_active_ingredient = col('prod_ai').isNotNull()
val_vbm_is_not_two = col('val_vbm') != 2
df_drug = df_drug.filter(has_active_ingredient & val_vbm_is_not_two)

In [0]:
# Remove the 'val_vbm' column from df_drug.
column_no_longer_needed = 'val_vbm'
df_drug = df_drug.drop(column_no_longer_needed)

In [0]:
# Set routes that do not identify an administration route ("Unknown",
# "INJECTION", "INJECTABLE") to null, and collapse the intravenous variants
# into the single value "Intravenous". Other values are kept unchanged.
uninformative_routes = ["Unknown", "INJECTION", "INJECTABLE"]
intravenous_variants = [
    "Intravenous drip",
    "Intravenous (not otherwise specified)",
    "Intravenous bolus",
]

route_value = col("ROUTE")
normalised_route = (
    when(route_value.isin(uninformative_routes), None)
    .when(route_value.isin(intravenous_variants), "Intravenous")
    .otherwise(route_value)
)
df_drug = df_drug.withColumn("ROUTE", normalised_route)


In [0]:
# Route names that can be recovered from the free-text dose_vbm column.

# Words to search for; earlier entries take precedence when several match.
words = [
    "Oral", "Intravenous", "Subcutaneous", "Intravenous drip", "Intramuscular", "Topical", "Transplacental",
    "Respiratory (inhalation)", "Ophthalmic", "Intra-uterine", "Transdermal", "Intracardiac", "Intravenous bolus",
    "Nasal", "Intrathecal", "Rectal", "Intraperitoneal", "Cutaneous", "Endocervical", "Vaginal", "Intraocular",
    "Sublingual", "Intra-arterial", "Buccal", "Parenteral", "Intra-articular", "Intracavernous", "Urethral",
    "Epidural", "Transmammary", "Intravesical", "Subconjunctival", "Intradermal", "Hemodialysis", "Dental",
    "Intralesional", "Periarticular", "Intrameningeal", "Oropharyngeal", "Auricular (otic)", "Endosinusial",
    "Intra-amniotic", "Endotracheal", "Intradiscal (intraspinal)", "Intrapleural", "Occlusive dressing technique",
    "Intratumor", "Intracervical", "Intracisternal", "Intramedullar (bone marrow)", "Intra corpus cavernosum",
    "Intrapericardial", "Intrathoracic", "Intratracheal", "Iontophoresis", "Intracoronary", "Intrasynovial",
    "Perineural", "Retrobulbar", "Intracerebral", "Intrahepatic", "Intralymphatic", "Intracorneal", "Extra-amniotic"
]

# Each word is quoted with \Q...\E so that characters such as "(" and ")" are
# matched literally instead of being read as regex groups. The lookarounds
# require that the match is not glued to another word character; unlike \b
# they also work when the word ends with ")".
route_candidates = [
    when(col('dose_vbm').rlike(rf"(?i)(?<!\w)\Q{word}\E(?!\w)"), initcap(lit(word)))
    for word in words
]

# First matching word (case-insensitive) found in dose_vbm, or null when none matches.
df_drug = df_drug.withColumn('routenew', coalesce(*route_candidates))

# Prefer the reported route; fall back to the one extracted from dose_vbm.
df_drug = df_drug.withColumn(
    'routenew2',
    when(col('route').isNotNull(), col('route')).otherwise(col('routenew'))
)


In [0]:

# Remove the placeholders "UNK" and "Unknown" (any letter case) from dose_vbm.
# The pattern matches them only as whole words: a bare "unk" substring match
# would turn "Unknown" into "nown" and damage unrelated words containing "unk".
df_drug = df_drug.withColumn(
    "dose_vbm",
    regexp_replace(col("dose_vbm"), r"(?i)\bunk(nown)?\b", "")
)

# Remove commas or periods at the very start or end of each value.
df_drug = df_drug.withColumn(
    "dose_vbm",
    regexp_replace(col("dose_vbm"), r"^[,\.]+|[,\.]+$", "")
)



In [0]:
# Replace the dechal codes with descriptive labels; "U" becomes null and any
# other value is kept unchanged.
dechal_labels = [
    ("Y", "Positive"),
    ("N", "Negative"),
    ("U", None),
    ("D", "Does not apply"),
]

dechal_expr = None
for dechal_code, dechal_label in dechal_labels:
    is_code = col("dechal") == dechal_code
    if dechal_expr is None:
        dechal_expr = when(is_code, dechal_label)
    else:
        dechal_expr = dechal_expr.when(is_code, dechal_label)

df_drug = df_drug.withColumn("dechal", dechal_expr.otherwise(col("dechal")))


In [0]:
# Replace the rechal codes with descriptive labels; "U" becomes null and any
# other value is kept unchanged.
rechal_value = col("rechal")
(first_rechal_code, first_rechal_label), *other_rechal_labels = [
    ("Y", "Positive"),
    ("N", "Negative"),
    ("U", None),
    ("D", "Does not apply"),
]

rechal_expr = when(rechal_value == first_rechal_code, first_rechal_label)
for rechal_code, rechal_label in other_rechal_labels:
    rechal_expr = rechal_expr.when(rechal_value == rechal_code, rechal_label)

df_drug = df_drug.withColumn("rechal", rechal_expr.otherwise(rechal_value))

In [0]:
# Clean the LOT_NUM column.

# Placeholder words (matched as whole words, any letter case) that carry no lot
# information.
expressoes_indesejadas = [
    r"(?i)\b(UNK|unknown|NA|Not|Available|ASKED|BUT|reported|time|at|this|asku|N/A)\b"
]

lot_num_expr = col("LOT_NUM")

# Remove the placeholders first: "N/A" contains a separator character, so it
# must be matched before the text after the first separator is cut off.
for expressao in expressoes_indesejadas:
    lot_num_expr = regexp_replace(lot_num_expr, expressao, "")

# Keep only the text that precedes the first ',', ';', '|' or '/'.
lot_num_expr = regexp_replace(lot_num_expr, r"[,;|/].*$", "")

# Strip surrounding whitespace so values emptied by the steps above (for
# example "Not Available") compare equal to "" below.
lot_num_expr = regexp_replace(lot_num_expr, r"^\s+|\s+$", "")

# Values with nothing left become null.
df_drug = df_drug.withColumn(
    "LOT_NUM",
    when(lot_num_expr == "", None).otherwise(lot_num_expr)
)

In [0]:
# Rewrite EXP_DT values made of exactly eight digits (yyyyMMdd) as yyyy-MM-dd
# text; values in any other shape are left untouched.
exp_dt_value = col("EXP_DT")
is_eight_digits = exp_dt_value.rlike(r"^\d{8}$")

year_part = substring(exp_dt_value, 1, 4)
month_part = substring(exp_dt_value, 5, 2)
day_part = substring(exp_dt_value, 7, 2)
dashed_date = concat(year_part, lit("-"), month_part, lit("-"), day_part)

df_drug = df_drug.withColumn(
    "EXP_DT",
    when(is_eight_digits, dashed_date).otherwise(exp_dt_value)
)

In [0]:
# Convert EXP_DT to a date column, parsing the text with the yyyy-MM-dd pattern.
exp_dt_pattern = "yyyy-MM-dd"
exp_dt_as_date = to_date("EXP_DT", exp_dt_pattern)
df_drug = df_drug.withColumn("EXP_DT", exp_dt_as_date)


In [0]:
# Clean the DOSE_FORM column: placeholder values become null; for every other
# value the digits are removed and the text is capitalised with initcap.
dose_form_placeholders = [
    "Unknown", "UNK", "NOT SPECIFIED", "Formulation Unknown",
    "Unknown Formulation", "Not Provided", "Not Specified",
    "Unknown (other/unspecified)", "***", "No Drug", "Xxx", "Unknown,unknown", " ()",
]

dose_form_value = col("DOSE_FORM")
dose_form_without_digits = regexp_replace(dose_form_value, r"\d+", "")

df_drug = df_drug.withColumn(
    "DOSE_FORM",
    when(dose_form_value.isin(*dose_form_placeholders), None)
    .otherwise(initcap(dose_form_without_digits))
)


In [0]:
# Standardise the values of the DOSE_FORM column.

# Keep a copy of the current value for the comparison at the end of this cell.
df_drug = df_drug.withColumn("DOSE_FORM_ORIGINAL", col("DOSE_FORM"))

# The patterns below are lowercase, so match against lowercased text.
df_drug = df_drug.withColumn("DOSE_FORM", lower(col("DOSE_FORM")))

# (pattern, label) rules evaluated in order; the first pattern found in the
# value decides the label, so more specific patterns must come before the
# generic ones that they contain.
dose_form_rules = [
    (".*tablets?", "Tablet"),
    (".*solutions?", "Solution"),
    (".*injections?", "Injection"),
    (".*capsules?", "Capsule"),
    # Must precede "powder", otherwise this rule can never be reached.
    (".*dry powders?", "Dry Powder"),
    (".*powders?", "Powder"),
    (".*infusions?", "Infusion"),
    (".*orals?", "Oral"),
    (".*creams?", "Cream"),
    (".*gels?", "Gel"),
    (".*ointments?", "Ointment"),
    (".*suspensions?", "Suspension"),
    (".*drops?", "Drops"),
    (".*nasal sprays?", "Nasal Spray"),
    # "patch(es)?" so that the singular "patch" matches as well as "patches".
    (".*patch(es)?", "Transdermal Patch"),
    (".*pens?", "Pen"),
    (".*aerosols?", "Aerosol"),
    (".*drinks?", "Drink"),
    (".*pumps?", "Pump"),
    (".*injects?", "Injection"),
    (".*liquid topicals?", "Liquid Topical"),
    (".*rectals?", "Rectal"),
    (".*emulsions?", "Emulsion"),
    (".*solvents?", "Solvent"),
    (".*liquid", "Liquid"),
    (".*topicals?", "Topical"),
    (".*capsulas?", "Capsulas"),
    (".*implantations?", "Implantation"),
    # A stray "«" character in this pattern previously prevented any match.
    (".*syringes?", "Injection"),
    (".*iv", "Injection"),
    (".*lotions?", "Lotion"),
    (".*vapours?", "Vapour"),
    (".*dispersions?", "Dispersion"),
    (".*granules?", "Granule"),
    (".*sprays?", "Spray"),
]

(first_pattern, first_label), *remaining_rules = dose_form_rules
dose_form_expr = when(col("DOSE_FORM").rlike(first_pattern), first_label)
for form_pattern, form_label in remaining_rules:
    dose_form_expr = dose_form_expr.when(col("DOSE_FORM").rlike(form_pattern), form_label)

# Values matching no rule keep their text, capitalised with initcap.
df_drug = df_drug.withColumn(
    "DOSE_FORM",
    dose_form_expr.otherwise(initcap(col("DOSE_FORM")))
)

# Set DOSE_FORM to null when it equals the lowercased original value.
# NOTE(review): after the step above DOSE_FORM is either a capitalised label or
# initcap text, so this comparison looks like it can only be true for values
# without letters - confirm what this step is meant to null out.
df_drug = df_drug.withColumn(
    "DOSE_FORM",
    when(col("DOSE_FORM") == lower(col("DOSE_FORM_ORIGINAL")), None).otherwise(col("DOSE_FORM"))
)



In [0]:
# Standardise the DOSE_FREQ column by replacing frequency codes with readable
# labels. The three groups below are mutually exclusive, so the order in which
# they are evaluated does not affect the result.

# Codes matched exactly.
dose_freq_labels = {
    "1X": "Once or one time",
    "BID": "Twice a day",
    "BIW": "Twice a week",
    "HS": "At bedtime",
    "PRN": "As needed",
    "Q12H": "Every 12 hours",
    "Q2H": "Every 2 hours",
    "Q3H": "Every 3 hours",
    "Q3W": "Every 3 weeks",
    "Q4H": "Every 4 hours",
    "Q5H": "Every 5 hours",
    "Q6H": "Every 6 hours",
    "Q8H": "Every 8 hours",
    "QD": "Daily",
    "QH": "Every hour",
    "QID": "4 times a day",
    "QM": "Monthly",
    "QOD": "Every other day",
    "QOW": "Every other week",
    "QW": "Every week",
    "TID": "3 times a day",
    "TIW": "3 times a week",
    "/CYCLE": "Every cycle",  # label previously misspelled as "Every cicle"
    "/YR": "Every year",
    "TRIMESTER": "Every 3 months",
    "/SEC": "Every second",  # label previously truncated to "Ever"
}

# Values matched by their ending (SQL LIKE patterns).
dose_freq_suffix_labels = [
    ("%/WK", "Every week"),      # ends with /WK
    ("%/MONTH", "Monthly"),      # ends with /MONTH
    ("%/HR", "Every hour"),      # ends with /HR
    ("%/MIN", "Every minute"),   # ends with /MIN
]

# Codes replaced by null.
dose_freq_null_codes = ["UNK", "999", "PC", "ONCE", "Q4-6H", "QAM", "TOTAL", "AC", "UD"]

dose_freq_value = col("DOSE_FREQ")

exact_rules = iter(dose_freq_labels.items())
first_freq_code, first_freq_label = next(exact_rules)
dose_freq_expr = when(dose_freq_value == first_freq_code, first_freq_label)
for freq_code, freq_label in exact_rules:
    dose_freq_expr = dose_freq_expr.when(dose_freq_value == freq_code, freq_label)

for freq_pattern, freq_label in dose_freq_suffix_labels:
    dose_freq_expr = dose_freq_expr.when(dose_freq_value.like(freq_pattern), freq_label)

dose_freq_expr = dose_freq_expr.when(dose_freq_value.isin(dose_freq_null_codes), None)

# Any other value is kept unchanged.
df_drug = df_drug.withColumn("DOSE_FREQ", dose_freq_expr.otherwise(dose_freq_value))

In [0]:
# Drop the helper columns that the rest of the pipeline does not need.
columns_to_drop = ['route', 'routenew', 'dose_form_original']

# Remove them one at a time.
for helper_column in columns_to_drop:
    df_drug = df_drug.drop(helper_column)

In [0]:
# Rename columns to their final names. Each column is listed once (the
# repeated LOT_NUM rename was a no-op and has been removed).
# NOTE(review): "active ingredient" contains a space; some Spark versions are
# reported to reject such names when writing Parquet - confirm that the target
# runtime accepts it.
column_renames = {
    "ROLE_COD": "role_cod",
    "routenew2": "route",
    "LOT_NUM": "lot_num",
    "EXP_DT": "exp_dt",
    "DOSE_FORM": "dose_form",
    "DOSE_FREQ": "dose_freq",
    "prod_ai": "active ingredient",
}

for current_name, final_name in column_renames.items():
    df_drug = df_drug.withColumnRenamed(current_name, final_name)

In [0]:
# Put the columns in their final order; columns not listed here are discarded.
final_column_order = [
    "primaryid", "caseid", "drug_seq", "role_cod", "active ingredient", "dose_vbm", "route",
    "cum_dose_chr", "cum_dose_unit", "dechal", "rechal",
    "lot_num", "exp_dt", "nda_num", "dose_amt", "dose_unit", "dose_form", "dose_freq",
]
df_drug = df_drug.select(*final_column_order)

In [0]:
# Restore the column metadata declared in schema_drug.
# restore_metadata is not defined in this notebook; presumably it is provided
# by the helper notebook loaded with %run - TODO confirm.
df_with_metadata = restore_metadata(schema_drug, df_drug)
df_drug = df_with_metadata

In [0]:
# Save the cleaned data as the final Parquet dataset, overwriting any previous output.
final_parquet_path = "dbfs:/FileStore/FAERS-grupo-4/drug_final"
overwrite_writer = df_drug.write.mode('overwrite')
overwrite_writer.parquet(final_parquet_path)