# Notebook 3: Funções de Transformação e Limpeza de Dados

## Introdução
Neste notebook, iremos implementar as transformações e limpar os dados da tabela 'Demographic' do conjunto de dados FAERS, com base na análise exploratória realizada no notebook anterior (Exploratory_Data_Analysis_Demographic). 

## Estrutura do notebook
- Importação de Bibliotecas
- Carregamento dos Dados (Parquet Raw) usando o Schema definido 
- Aplicar apenas as Transformações necessárias
- Salvar **Parquet Final** com os Dados Limpos


In [0]:
# Importar bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, LongType, DoubleType
from pyspark.sql.functions import col, when, length, to_date, expr, round

In [0]:
%run "../Funcoes_auxiliares" 

In [0]:
# Load the raw 'Demographic' Parquet using an explicit schema.
# Each entry is (column name, Spark type, nullable, description); the description
# is stored as field metadata under the 'description' key.
# NOTE(review): Spark documents that columns read from Parquet are converted to
# nullable, so the nullable=False flags below are probably not enforced on read
# - confirm before relying on them.
_demo_fields = [
    ('primaryid', LongType(), False, 'Unique number for identifying a FAERS report'),
    ('caseid', LongType(), False, 'Number for identifying a FAERS case.'),
    ('caseversion', IntegerType(), False, 'Safety Report Version Number'),
    ('i_f_code', StringType(), False, 'Code for initial or follow-up status of report, as reported by manufacturer.'),
    ('event_dt', IntegerType(), True, 'Date the adverse event occurred or began'),
    ('mfr_dt', IntegerType(), True, 'Date manufacturer first received initial information'),
    ('init_fda_dt', IntegerType(), True, 'Date FDA received first version (Initial) of Case'),
    ('fda_dt', IntegerType(), True, 'Date FDA received Case'),
    ('rept_cod', StringType(), False, 'Code for the type of report submitted'),
    ('auth_num', StringType(), True, 'Regulatory Authority’s case report number'),
    ('mfr_num', StringType(), True, 'Manufacturer’s unique report identifier'),
    ('mfr_sndr', StringType(), False, 'Coded name of manufacturer sending report'),
    ('lit_ref', StringType(), True, 'Literature Reference information'),
    ('age', IntegerType(), True, 'Numeric value of patient’s age at event.'),
    ('age_cod', StringType(), True, 'Unit abbreviation for patient’s age'),
    ('age_grp', StringType(), True, 'Patient Age Group code'),
    ('sex', StringType(), True, 'Code for patient’s sex'),
    ('e_sub', StringType(), False, 'Whether this report was submitted under the electronic submissions procedure for manufacturers'),
    ('wt', DoubleType(), True, 'Numeric value of patient’s weight.'),
    ('wt_cod', StringType(), True, 'Unit abbreviation for patient’s weight'),
    ('rept_dt', IntegerType(), True, 'Date report was sent'),
    ('to_mfr', StringType(), True, 'Whether voluntary reporter also notified manufacturer'),
    ('occp_cod', StringType(), True, 'Abbreviation for the reporter’s type of occupation in the latest version of a case.'),
    ('reporter_country', StringType(), True, "Reporter Country Code"),
    ('occr_country', StringType(), True, 'The country where the event occurred.'),
]
schema_demo = StructType([
    StructField(field_name, field_type, is_nullable, {'description': description})
    for field_name, field_type, is_nullable, description in _demo_fields
])

# Read the raw dataset with the schema above instead of the one stored in the files.
df_demo = spark.read.schema(schema_demo).parquet('dbfs:/FileStore/FAERS-grupo-4/demo_raw')



In [0]:
# Turn i_f_code into the boolean column 'folup_rpt': True when the code is 'F',
# False for every other value (including null), then drop the original column.
is_follow_up = when(col("i_f_code") == "F", True).otherwise(False)
df_demo = df_demo.withColumn("folup_rpt", is_follow_up).drop("i_f_code")

In [0]:
# Run convert_date over every column whose name ends in '_dt', writing the result
# back to the same column name.
# NOTE(review): convert_date is not defined in this notebook (presumably loaded by
# the %run of Funcoes_auxiliares); its exact parsing rules are not visible here.
colunas_dt = [nome for nome in df_demo.columns if nome.endswith('_dt')]  # snapshot taken before df_demo is rebound
for coluna in colunas_dt:
    df_demo = convert_date(df_demo, coluna, coluna)

In [0]:
# Map the 'rept_cod' report-type code to a readable label in 'rept_typ_txt'.
mapping_data = [
    ("EXP", "Expedited (15-Day)"),
    ("PER", "Periodic (Non-Expedited)"),
    ("DIR", "Direct"),
    ("5DAY", "5-Day"),
    ("30DAY", "30-Day"),
]
mapping_columns = ["rept_cod", "rept_typ_txt"]
df_mapping = spark.createDataFrame(mapping_data, mapping_columns)

# Left join with df_demo as the LEFT (preserved) side, so every report is kept and
# codes missing from the lookup simply get a null 'rept_typ_txt'.
# The previous form, df_mapping.join(df_demo, how="left"), preserved the lookup
# instead: reports with an unmapped 'rept_cod' were silently dropped, and lookup
# codes absent from the data produced rows whose report columns were all null.
# Side effect of the fix: 'rept_typ_txt' is now appended after the df_demo columns
# (columns are still addressed by name downstream).
df_demo = df_demo.join(df_mapping, on="rept_cod", how="left")

In [0]:
# Fix nulls in age_cod: when the unit is missing, the age group is 'A' or 'E' and
# the age lies strictly between 18 and 150, set the unit to 'YR'.
# Every other row keeps its current age_cod (which stays null when it was null).
age_cod_missing = col("age_cod").isNull()
adult_or_elderly = col("age_grp").isin("A", "E")
# A null age makes both comparisons null, so the rule never fires for it.
age_in_adult_range = (col("age") > 18) & (col("age") < 150)

df_demo = df_demo.withColumn(
    "age_cod",
    when(adult_or_elderly & age_cod_missing & age_in_adult_range, "YR")
    .otherwise(col("age_cod")),
)



In [0]:
# Normalise 'age' into years according to the unit in 'age_cod'.
# Converted values are rounded to 2 decimals; 'YR' values are taken as they are.
# Rows with a null or unlisted unit get a null 'age_in_years'
# (a `when` chain without `otherwise` yields null).
age_value = col("age")
age_unit = col("age_cod")
age_years_expr = (
    when(age_unit == "MON", round(age_value / 12, 2))
    .when(age_unit == "HR", round(age_value / (24 * 365), 2))
    .when(age_unit == "DY", round(age_value / 365, 2))
    .when(age_unit == "DEC", round(age_value * 10, 2))
    .when(age_unit == "WK", round(age_value / 52, 2))
    .when(age_unit == "YR", age_value)
)
df_demo = df_demo.withColumn("age_in_years", age_years_expr)

In [0]:
# Fix nulls in age_grp: keep any existing group, otherwise derive it from
# 'age_in_years'. Branches are evaluated in order, so each one only needs its
# upper bound (the lower bound is implied by the previous branch not matching):
#   < 28 days -> 'N', < 2 -> 'I', < 12 -> 'C', < 22 -> 'T', < 65 -> 'A', >= 65 -> 'E'.
# A null 'age_in_years' matches no branch and leaves age_grp null.
years = col("age_in_years")
derived_age_grp = (
    when(years < (28.0 / 365), "N")
    .when(years < 2, "I")
    .when(years < 12, "C")
    .when(years < 22, "T")
    .when(years < 65, "A")
    .when(years >= 65, "E")
)
df_demo = df_demo.withColumn(
    "age_grp",
    when(col("age_grp").isNotNull(), col("age_grp")).otherwise(derived_age_grp),
)

In [0]:
# Replace null values of 'sex' with the placeholder 'UNK'.
df_demo = df_demo.fillna('UNK', subset=['sex'])

In [0]:
# Convert 'e_sub' to boolean: True when the value is 'Y', False for anything else
# (including null).
submitted_electronically = when(col("e_sub") == "Y", True).otherwise(False)
df_demo = df_demo.withColumn("e_sub", submitted_electronically)

In [0]:
# Replace the 'UNK' placeholder in 'wt_cod' with null.
# Only values different from 'UNK' are kept; 'UNK' and null both fall through the
# `when` without matching and therefore become null.
df_demo = df_demo.withColumn(
    "wt_cod",
    when(col("wt_cod") != "UNK", col("wt_cod")),
)

In [0]:
# Create 'wt_in_kg': the weight expressed in kilograms, rounded to 3 decimals.
# 'KG' is taken as is, 'LBS' is divided by 2.2046 and 'GMS' by 1000.
# Rows with a null or unlisted unit get a null weight.
weight_value = col("wt")
weight_unit = col("wt_cod")
weight_kg_expr = (
    when(weight_unit == "KG", round(weight_value, 3))
    .when(weight_unit == "LBS", round(weight_value / 2.2046, 3))
    .when(weight_unit == "GMS", round(weight_value / 1000, 3))
)
df_demo = df_demo.withColumn("wt_in_kg", weight_kg_expr)

In [0]:
# Convert 'to_mfr' to boolean: 'Y' -> True, 'N' -> False.
# Any other value (e.g. 'UNK') and null match neither branch and become null.
notified_manufacturer = (
    when(col("to_mfr") == "Y", True)
    .when(col("to_mfr") == "N", False)
)
df_demo = df_demo.withColumn("to_mfr", notified_manufacturer)

In [0]:
# Replace the 'UNK' placeholder in 'occp_cod' with null.
# Only values different from 'UNK' are kept; 'UNK' and null both fall through the
# `when` without matching and therefore become null.
df_demo = df_demo.withColumn(
    "occp_cod",
    when(col("occp_cod") != "UNK", col("occp_cod")),
)


In [0]:
# Create 'occp_txt': a readable label for the reporter occupation code.
occp_labels = {
    "MD": "Physician",
    "PH": "Pharmacist",
    "OT": "Other health-professional",
    "LW": "Lawyer",
    "CN": "Consumer",
}
mapping_data = list(occp_labels.items())  # (code, label) pairs in declaration order
mapping_columns = ["occp_cod", "occp_txt"]
df_mapping = spark.createDataFrame(mapping_data, mapping_columns)

# Left join from df_demo: every report is kept; codes without a label
# (including null) get a null 'occp_txt'.
df_demo = df_demo.join(df_mapping, "occp_cod", "left")

In [0]:
# Normalise both country columns: a value is kept only when it is a non-null
# string of exactly 2 characters; anything else (null, 'UNK', other lengths)
# becomes the placeholder 'Country not specified'.
for country_column in ("reporter_country", "occr_country"):
    is_two_letter_code = col(country_column).isNotNull() & (length(col(country_column)) == 2)
    df_demo = df_demo.withColumn(
        country_column,
        when(is_two_letter_code, col(country_column)).otherwise('Country not specified'),
    )

In [0]:
# Build df_countries: a lookup from 2-letter country code to country name, read
# from the tab-separated GENC terminology file. Rows with a null in either
# selected column are discarded, and the columns are renamed to match the
# 'reporter_country' join key.
genc_renames = {
    'GENC 2 Letter Code': 'reporter_country',
    'GENC Name (FDA Standard)': 'reporter_country_text',
}
df_countries = (
    spark.read.csv(
        "dbfs:/FileStore/NCIt_GENC_Terminology.txt",
        header=True,
        inferSchema=True,
        sep='\t',
    )
    .select('GENC 2 Letter Code', 'GENC Name (FDA Standard)')
    .dropna()
    .withColumnsRenamed(genc_renames)
)

# Add a row for the placeholder so it also resolves to a text value in the joins.
new_rows = [('Country not specified', 'Country not specified')]
new_df = spark.createDataFrame(new_rows, ['reporter_country', 'reporter_country_text'])
df_countries = df_countries.union(new_df)  # positional union: both frames share the same column order

In [0]:
# Create 'reporter_country_text' by looking up 'reporter_country' in df_countries.
# Left join: reports whose code is not in the lookup are kept with a null text.
df_demo = df_demo.join(df_countries, "reporter_country", "left")

In [0]:
# Reuse df_countries for the occurrence country: rename its columns to
# 'occr_country' / 'occr_country_text' and left-join to create 'occr_country_text'.
df_occr_countries = df_countries.withColumnsRenamed({
    'reporter_country': 'occr_country',
    'reporter_country_text': 'occr_country_text',
})
df_demo = df_demo.join(df_occr_countries, "occr_country", "left")

In [0]:
# Restore metadata: pass the original schema and the transformed DataFrame to
# restore_metadata and keep its result as the new df_demo.
# NOTE(review): restore_metadata is not defined in this notebook (presumably loaded
# by the %run of Funcoes_auxiliares); it looks like it re-attaches the field
# 'description' metadata from schema_demo - confirm against the helper.
df_demo = restore_metadata(schema_demo,df_demo)

In [0]:
# Save the cleaned dataset as the final Parquet, replacing any previous output.
df_demo.write.format('parquet').mode('overwrite').save("dbfs:/FileStore/FAERS-grupo-4/demo_final")