In [0]:
from pyspark.sql.functions import col, explode, collect_list

In [0]:
# Load the raw flavor-profile JSON documents from the volume directory.
# multiLine is enabled so a single JSON record may span several lines.
df = spark.read.load(
    "/Volumes/wine_harmonization/datasets/raw_datasets/flavor_profiles_v4/",
    format="json",
    multiLine=True,
)

In [0]:
# Produce one row per (entity, molecule) by exploding the `molecules` column.
df_exploded = df.select("entity_alias_readable", explode("molecules").alias("mol"))

# Fields of the exploded `mol` struct to promote to top-level columns,
# listed in the order they appear in the output.
molecule_fields = [
    "common_name",
    "fema_flavor_profile",
    "molecular_weight",
    "odor",
    "flavor_profile",
    "functional_groups",
    "pubchem_id",
]

# Keep the entity name and flatten each selected struct field under its own name.
df_final = df_exploded.select(
    col("entity_alias_readable"),
    *[col(f"mol.{field}").alias(field) for field in molecule_fields],
)

display(df_final)

In [0]:
# Collapse the exploded rows back to one row per entity, gathering the
# molecule names and their PubChem ids into two array columns.
# NOTE(review): collect_list skips null values, so `molecules` and
# `pubchem_ids` are only positionally aligned if neither source column
# contains nulls -- confirm against the data.
molecule_names = collect_list("common_name").alias("molecules")
molecule_ids = collect_list("pubchem_id").alias("pubchem_ids")

df_ingredients = (
    df_final
    .groupBy("entity_alias_readable")
    .agg(molecule_names, molecule_ids)
)

display(df_ingredients)

In [0]:
# Persist the per-entity aggregation, overwriting any existing table contents.
df_ingredients.write.saveAsTable(
    "wine_harmonization.datasets.ingredients_molecules",
    mode="overwrite",
)

### Molecules table

In [0]:
# Molecule-level view: remove the entity column and the descriptive columns
# not carried into the molecules table, then keep a single row per pubchem_id.
columns_to_drop = ["entity_alias_readable", "odor", "functional_groups", "flavor_profile"]

df_molecules = (
    df_final
    .drop(*columns_to_drop)
    .dropDuplicates(["pubchem_id"])
)

display(df_molecules)

In [0]:
from pyspark.sql.functions import (
    split,
    regexp_replace,
    expr,
    flatten,
    array_distinct,
    initcap,
    trim,
    length
)

# Build the molecules table: convert the raw `fema_flavor_profile` string into
# `flavor_array`, a de-duplicated array of short, title-cased flavor tokens.
#
# NOTE: this rebinds `df_final`, which earlier held the exploded
# entity/molecule rows. After this cell runs, `df_final` no longer has
# `entity_alias_readable`, so the ingredients aggregation cell cannot be
# re-run without first re-running the cell that builds the exploded frame.
df_final = (
    df_molecules
    # 1. Tokenize: strip double quotes, split on "@", then on commas, and
    #    flatten the nested arrays into one array of raw tokens.
    #    The comma split uses a plain ',' on purpose. The previous pattern
    #    '\\s*,\\s*' reached Spark SQL as '\s*,\s*', and the SQL parser
    #    unescapes '\s' to 's', so the effective regex was 's*,s*' and it ate
    #    any 's' next to a comma ("citrus, spicy" -> "citru"). Whitespace
    #    around tokens is removed by trim() in step 2.
    .withColumn(
        "tokens_raw",
        flatten(
            expr("""
                transform(
                    split(regexp_replace(fema_flavor_profile, '"', ''), '@'),
                    x -> split(x, ',')
                )
            """)
        )
    )
    # 2. Drop blank tokens, trim surrounding spaces and title-case the rest.
    .withColumn(
        "tokens_cleaned",
        expr("""
            transform(
                filter(tokens_raw, x -> trim(x) <> ''),
                x -> initcap(trim(x))
            )
        """)
    )
    # 3. Keep only short tokens (<= 20 characters).
    .withColumn(
        "tokens_short",
        expr("filter(tokens_cleaned, x -> length(x) <= 20)")
    )
    # 4. Remove duplicates and store the result under the final column name.
    .withColumn("flavor_array", array_distinct("tokens_short"))
    # 5. Drop the intermediate columns and the raw source string.
    .drop("tokens_raw", "tokens_cleaned", "tokens_short", "fema_flavor_profile")
)

display(df_final)

In [0]:
# Persist the molecule/flavor table, overwriting any existing table contents.
df_final.write.saveAsTable(
    "wine_harmonization.datasets.molecules",
    mode="overwrite",
)

In [0]:
from pyspark.sql.functions import explode

# Reload the persisted molecules table and list every distinct flavor token
# that appears in any molecule's `flavor_array`.
df_final = spark.table("wine_harmonization.datasets.molecules")

flavor_tokens = df_final.select(explode("flavor_array").alias("flavor"))
distinct_flavors = flavor_tokens.distinct()

display(distinct_flavors)

In [0]:
%sql
-- Preview up to 10 rows of the molecules table.
SELECT * FROM wine_harmonization.datasets.molecules LIMIT 10

In [0]:
%sql
-- Preview up to 10 rows of the ingredients_molecules table.
SELECT * FROM wine_harmonization.datasets.ingredients_molecules LIMIT 10

In [0]:
%sql
-- Preview up to 10 rows of the recipes table (not written by the cells shown here).
SELECT * FROM wine_harmonization.datasets.recipes LIMIT 10