## Pipeline Flavor Profile

In [86]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv
import thefuzz 


In [422]:
'''
does voncert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [87]:
# DUMMY DATA
intensities = [80, 40, 5]
ms_molecules_ex = ['(E)-3-hexenol', "2-dodecenal",'δ-muurolene']
mol_int = {'(E)-3-hexenol': 80, "2-dodecenal": 40 ,'δ-muurolene': 5}

<span style="color:#FBB714">I_vector</span> : dataframe with the molecules x intensities

In [88]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector = pd.DataFrame(data= intensities,index = ms_molecules_ex, columns=["intensities"])
I_vector

Unnamed: 0,intensities
(E)-3-hexenol,80
2-dodecenal,40
δ-muurolene,5


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [301]:
mol_to_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        OD = row[3]
        molecules.append(molecule)
        descriptors.append(OD)
        if molecule in mol_to_OD:
            mol_to_OD[molecule].add(OD)
        else:
            mol_to_OD[molecule]  = set([OD])

molecules = list(set(molecules))
descriptors = list(set(descriptors))

<span style="color:#FBB714">D</span>: Dataframe with the *molecules x descriptors*

In [302]:
D = pd.DataFrame(index=molecules, columns=descriptors)

In [303]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,cooked vegetable,meat,putrid,marshmallow,apricot,thyme,nutmeg,honey,balsamic,prune,...,cognac,garlic,pepper,lily,geranium,biscuit,truffle,grapefruit,pine,clove
dimethylethyl pyrazine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
α-cubebene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
piperitol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cis-isocitral,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dihydromyrcenol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
propanoic acid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"3,6-nonadienal",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
undecanol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sabinene,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [304]:
flavors = []
mol_flavors = {}
for mol in ms_molecules_ex:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]

found_flavors = set(flavors)
found_flavors

{'fat', 'fresh', 'green', 'moss', 'oil', 'sweet'}

<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 

<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [305]:
mol_flavors

{'(E)-3-hexenol': {'fresh', 'moss'},
 '2-dodecenal': {'fat', 'green', 'sweet'},
 'δ-muurolene': {'oil'}}

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [306]:
# filter only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,oil,green,fresh,moss,sweet,fat
(E)-3-hexenol,0,0,1,1,0,0
δ-muurolene,1,0,0,0,0,0
2-dodecenal,0,1,0,0,1,1


#### 3. OD ----- Roche Data -----> OQ
Use ontology to find matching OQ

In [307]:
OD_OQ_df =  pd.read_csv('./data/DATA_OD-2-OQ.csv', header=None)
# filter out rows with only 0 
OD_OQ_df = OD_OQ_df.loc[(OD_OQ_df.iloc[:,1:] != 0).any(axis=1)]


In [308]:
OQ_descriptors = ["Almond","Woody","Camphor","Leather","Cooked","Spicy","Floral","Fresh","Fruity","Smoky","Cut-grass","Lactonic","Vegetable","Honey","Peel","Sulfurous","Toasty","Vanilla","Green","Vinous","Violet"]

<span style="color:#FBB714">fuzzy_OQs</span>: Dataframe - fuzzy_ODs x OQs

In [309]:
fuzzy_OQs = []
ms_fuzzy_ODs = {}
for key, value in mol_flavors.items():
    all_OQS = []
    for index, row in OD_OQ_df.iterrows():
        for keyword in value:
            if fuzz.ratio(keyword, str(row[0]).lower()) >= 60:
                all_OQS.append((row[0]))
                fuzzy_OQs.append(row)
    ms_fuzzy_ODs[key] = all_OQS
fuzzy_OQs = pd.DataFrame(fuzzy_OQs)


In [310]:
OQ_descriptors = ["Almond","Woody","Camphor","Leather","Cooked","Spicy","Floral","Fresh","Fruity","Smoky","Cut-grass","Lactonic","Vegetable","Honey","Peel","Sulfurous","Toasty","Vanilla","Green","Vinous","Violet"]

fuzzy_OQs.columns = ["OD/OQ"] + OQ_descriptors
# get only important columns
fuzzy_OQs = fuzzy_OQs.loc[:, (fuzzy_OQs != 0).any(axis=0)]
fuzzy_OQs

Unnamed: 0,OD/OQ,Floral,Fruity,Cut-grass,Green
132,Mimosa,1,0,0,0
100,Grape,0,1,0,0
102,Green,0,0,0,1
118,Leafy-green,0,0,0,1
218,Weedy,0,0,1,0


<span style="color:lightblue">ms_fuzzy_ODs</span>: dict with mol as keys and fuzzy_ODs as values 

In [325]:
# to know which OD comes from which molecule
ms_fuzzy_ODs

{'(E)-3-hexenol': ['Mimosa'],
 '2-dodecenal': ['Grape', 'Green', 'Leafy-green', 'Weedy'],
 'δ-muurolene': []}

<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [326]:
OQs = pd.DataFrame(columns=list(fuzzy_OQs.columns[1:]), index=ms_molecules_ex)


<span style="color:lightblue">OD_dict</span>: dictionary - key = molecule, value = OQs

In [341]:
OQ_dict = {}
for key, value in ms_fuzzy_ODs.items():
    current_OQ = []
    for fuzzy_OD in value:
        for index, row in fuzzy_OQs.iterrows():
            if row[0] == fuzzy_OD:
                current_OQ.append(','.join(row.loc[row.eq(1)].index.tolist()))
                # todo: maybe expection -> do change if two ones in a row
                OQs.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
    OQ_dict[key] = current_OQ

In [340]:
OQ_dict

{'(E)-3-hexenol': ['Floral'],
 '2-dodecenal': ['Fruity', 'Green', 'Green', 'Cut-grass'],
 'δ-muurolene': []}

In [342]:
OQs = OQs.fillna(0)
OQs

Unnamed: 0,Floral,Fruity,Cut-grass,Green
(E)-3-hexenol,1,0,0,0
2-dodecenal,0,1,1,1
δ-muurolene,0,0,0,0


In [179]:
'''OQs = OD_OQ_df[OD_OQ_df.apply(lambda row: any(keyword in str(row).lower() for keyword in found_flavors if fuzz.ratio(keyword, found_flavors)>=15), axis=1)]
OQs
# TODO: e.g. make fat belong to fatty, oil to oily etc. 
# ODs = 'fat', 'fresh', 'green', 'moss', 'oil', 'sweet'
'''

"OQs = OD_OQ_df[OD_OQ_df.apply(lambda row: any(keyword in str(row).lower() for keyword in found_flavors if fuzz.ratio(keyword, found_flavors)>=15), axis=1)]\nOQs\n# TODO: e.g. make fat belong to fatty, oil to oily etc. \n# ODs = 'fat', 'fresh', 'green', 'moss', 'oil', 'sweet'\n"

#### 4.1 OQ ------ Binary Matrix -----> OSA


In [356]:
OSA_data = pd.read_table('./data/OQ_OSA.csv', delimiter=';')
OSA_data


Unnamed: 0,OQ/OSA,floral,fruit,solventy,soapy,sweet,wood,nutty,spicy,oily,...,feinty,cereal,green/grassy,malt,primary taste,mouthfeel,nasal effects,dried fruits,aftertaste,complexity
0,fresh flowers,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,floral,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,green,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,perfumed,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,fresh fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,dried fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,citrus,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,artificial fruit flavorings,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,nail varnish remover,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [358]:
OQs

Unnamed: 0,Floral,Fruity,Cut-grass,Green
(E)-3-hexenol,1,0,0,0
2-dodecenal,0,1,1,1
δ-muurolene,0,0,0,0


In [408]:
mol_OSA ={}
for key, value in OQ_dict.items():
    current_OSA = []
    for OQ in value:
        for index, row in OSA_data.iterrows():
            if fuzz.ratio(OQ, str(row[0]).lower()) >= 60:
                current_OSA.append(str(row[0]))
    mol_OSA[key] = list(set(current_OSA))

print(mol_OSA)

{'(E)-3-hexenol': ['floral'], '2-dodecenal': ['green', 'fruit', 'grass'], 'δ-muurolene': []}


In [423]:
OSA = dict_to_matrix(mol_OSA)
OSA

Unnamed: 0,green,fruit,floral,grass
(E)-3-hexenol,0,0,1,0
2-dodecenal,1,1,0,1
δ-muurolene,0,0,0,0


#### normalize intensities ?
#### what to do if 2 molecules have same OSA add intensites?

In [262]:
intensities

[80, 40, 5]

In [424]:
mol_OSA

{'(E)-3-hexenol': ['floral'],
 '2-dodecenal': ['green', 'fruit', 'grass'],
 'δ-muurolene': []}

Flavor profile with OSAs:

dried_fruits: 80 + 40

floral: 80 + 40 

fruit: 40 

malt: 40 

sweet: 40 


#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use Fuzzy logic to weigh OQ with expert knowledge/Intensity from Masspec


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile