## Pipeline Flavor Profile

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv



In [2]:
def dict_to_matrix(dictionary):
    """Convert a dict of label collections into a binary indicator DataFrame.

    Parameters
    ----------
    dictionary : dict
        Maps each row name (key) to an iterable of hashable column labels.

    Returns
    -------
    pandas.DataFrame
        One row per key (in dict order) and one column per distinct label
        (in the order the labels are first encountered while iterating the
        values). A cell is 1 if the label belongs to that key's collection
        and 0 otherwise.
    """
    row_names = list(dictionary)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the column
    # order no longer depends on the iteration order of an intermediate set.
    column_names = list(
        dict.fromkeys(val for sublist in dictionary.values() for val in sublist)
    )
    rows = []
    for key in row_names:
        members = set(dictionary[key])  # constant-time membership tests
        rows.append([1 if val in members else 0 for val in column_names])
    # The explicit reshape keeps the array 2-D for the "no rows" and
    # "no columns" edge cases.
    data = np.array(rows, dtype="int64").reshape(len(row_names), len(column_names))
    return pd.DataFrame(data, index=row_names, columns=column_names)


In [3]:
def overlapping_elements(list1, list2, threshold=60):
    """Return the elements of ``list1`` that fuzzily match an element of ``list2``.

    Two strings count as a match when ``fuzz.ratio`` of the pair is at least
    ``threshold``.

    Parameters
    ----------
    list1 : iterable of str
        Candidates; the result is drawn from these.
    list2 : iterable of str
        Strings the candidates are compared against.
    threshold : int, default 60
        Minimum ``fuzz.ratio`` score for a pair to count as overlapping.

    Returns
    -------
    list
        The matching elements of ``list1`` in their original order. Each
        occurrence in ``list1`` is reported at most once, even if it matches
        several elements of ``list2``.
    """
    # Materialise list2 so it can be scanned once per candidate even when a
    # one-shot iterator is passed in.
    references = list(list2)
    # any() stops at the first hit, which is what prevents the same candidate
    # from being appended once per matching reference.
    return [
        elem1
        for elem1 in list1
        if any(fuzz.ratio(elem1, elem2) >= threshold for elem2 in references)
    ]

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

Record which intensities belong to which molecule and assign each a percentage (%).
Two approaches for a molecule with multiple OQs:
1.  each OQ gets the same %
2.  divide the % by the number of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where the key is the molecule name and the value is its intensity from the mass spec data

In [7]:
# Placeholder inputs standing in for real mass-spec results.
ms_molecules_ex = [
    'dimethylethyl pyrazine',
    '3-mercaptothiophene',
    'δ-muurolene',
]
# Alternative molecule list when only Flavornet data is used:
#   ['(E)-3-hexenol', "2-dodecenal", 'δ-muurolene']
intensities = [80, 40, 5]
# Molecule name -> intensity, keyed by the Flavornet-only names listed above.
mol_int = {
    '(E)-3-hexenol': 80,
    '2-dodecenal': 40,
    'δ-muurolene': 5,
}

<span style="color:#FBB714">I_vector</span> : dataframe with the molecules x intensities

In [8]:
# Alternative built from the dict instead of the two lists:
#   pd.DataFrame(data=mol_int, index=["intensities"]).T
# One row per identified molecule, with a single "intensities" column.
I_vector = pd.DataFrame({"intensities": intensities}, index=ms_molecules_ex)
I_vector

Unnamed: 0,intensities
dimethylethyl pyrazine,80
3-mercaptothiophene,40
δ-muurolene,5


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_mol_OD.csv</span> : table with the following columns - molecule, CAS, Mol Wt, OD


##### table manipulation done in R

```r
# Builds CAS_mol_OD.csv: one row per (molecule, OD) pair together with the
# molecule's CAS number and molecular weight.
# NOTE(review): rlang does not appear to be used directly in this snippet - confirm before removing.
library(rlang)
library(tidyverse)

# Both inputs are tab-separated and have no header line, so the columns are named V1, V2, ...
CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

# V2 holds several ';'-separated entries per row: give each entry its own row
# and strip surrounding whitespace.
OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
# Swap the column order so the split entries (V2) come first, then name the
# columns molecule / OD.
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

# Same splitting for the ';'-separated third column of the CAS table.
CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


# Join on the molecule name; merge() keeps only molecules present in both tables by default.
merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
# NOTE(review): write.csv emits a header line - readers that process the file
# row by row need to skip it.
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

*Flavornet only*

In [4]:
# Flavornet-only variant: builds mol_to_OD / molecules / descriptors straight
# from CAS_mol_OD.csv without filtering against the big book.
# NOTE(review): the code below is wrapped in a string literal, so it does not
# execute and defines none of these names.
'''mol_to_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        OD = row[3]
        molecules.append(molecule)
        descriptors.append(OD)
        if molecule in mol_to_OD:
            mol_to_OD[molecule].add(OD)
        else:
            mol_to_OD[molecule]  = set([OD])

molecules = list(set(molecules))
descriptors = list(set(descriptors))'''

In [5]:
# Number of distinct descriptors.
# NOTE(review): the only executable assignment to `descriptors` is in the
# "Flavornet x Big Book" cell further down (the Flavornet-only cell above is a
# string literal), so on a fresh top-to-bottom run this name is not defined yet.
len(descriptors)

197

In [47]:
# Comma-separated big-book table; its 'label' column is used for matching below.
big_book = pd.read_csv('./data/bigBook.txt', sep=',')


*Flavornet x Big Book*

In [53]:
# Keep only descriptors that are available both in Flavornet and in our big
# book: every Flavornet OD is replaced by each big-book label it fuzzily
# matches (fuzz.ratio >= 60).
mol_to_OD = {}
molecules = []
descriptors = []
big_book_labels = list(big_book['label'])  # loop-invariant, read once
matches_by_OD = {}  # cache: Flavornet OD -> big-book labels it matches
with open('./data/CAS_mol_OD.csv', 'r', encoding='utf-8', newline='') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row_number, row in enumerate(reader):
        if len(row) < 4:
            # Blank or truncated line: there is no OD column to read.
            continue
        molecule = row[0]
        OD = row[3]
        if row_number == 0 and molecule == 'molecule' and OD == 'OD':
            # Header line written by R's write.csv, not a data record.
            continue
        if OD not in matches_by_OD:
            # The same OD recurs for many molecules; score it against the big
            # book only once.
            matches_by_OD[OD] = [
                item for item in big_book_labels if fuzz.ratio(OD, item) >= 60
            ]
        for item in matches_by_OD[OD]:
            molecules.append(molecule)
            descriptors.append(item)
            mol_to_OD.setdefault(molecule, set()).add(item)

# De-duplicate while keeping first-seen order, so the row/column order of the
# tables built from these lists is the same on every run.
molecules = list(dict.fromkeys(molecules))
descriptors = list(dict.fromkeys(descriptors))

In [48]:
# Number of distinct big-book descriptors that were matched.
len(descriptors)

50

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x descriptors*

In [537]:
# Empty molecules x descriptors frame; the next cell fills in the 0/1 entries.
D = pd.DataFrame(columns=descriptors, index=molecules)

In [538]:
# Mark every (molecule, descriptor) pair listed in mol_to_OD with 1 and all
# other pairs with 0.
# The matrix is built in one pass (instead of an iterrows()/.at loop per cell)
# and as real integers, rather than relying on fillna() to turn an
# object-dtype frame into numbers.
indicator_rows = [
    [1 if col in mol_to_OD.get(index, ()) else 0 for col in D.columns]
    for index in D.index
]
D = pd.DataFrame(
    # reshape keeps the array 2-D even when D has no rows or no columns
    np.array(indicator_rows, dtype="int64").reshape(D.shape),
    index=D.index,
    columns=D.columns,
)
D

Unnamed: 0,lemon,meat,putrid,roast,tomato leaf,rose,wine,melon,biscuit,black currant,...,mandarin,tobacco,rubber,pepper,geranium,paper,maple,herb,bitter,pine
dimethylethyl pyrazine,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
α-cubebene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
piperitol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
hexanol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3-mercaptothiophene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
abhexone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
γ-octalactone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"ethyl-(E,Z)-2,4-decadienoate",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
undecanol,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [605]:
# For every molecule identified in the MS data, collect the descriptors that
# mol_to_OD lists for it.
flavors = []
mol_flavors = {}
for mol in ms_molecules_ex:
    # Copy into a fresh set so mol_flavors never aliases the sets stored in
    # mol_to_OD. Molecules without an entry get an empty *set* (a bare ``{}``
    # would be a dict), keeping every value the same type.
    mol_flavors[mol] = set(mol_to_OD.get(mol, ()))
    flavors += mol_flavors[mol]

found_flavors = set(flavors)
found_flavors

{'cooked meat', 'roast'}

<span style="color:lightblue">found_flavors</span>: set of the ODs found for the identified molecules 

<span style="color:lightblue">mol_flavors</span>: dict mapping each identified molecule to the ODs found for it 

In [606]:
# Display the molecule -> descriptors mapping built in the previous cell.
mol_flavors

{'dimethylethyl pyrazine': {'roast'},
 '3-mercaptothiophene': {'cooked meat'},
 'δ-muurolene': {}}

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [607]:
# Rows: keep only the molecules that were identified in the MS data.
# Columns: keep only the descriptors that at least one of those molecules has.
OD = (
    D.loc[D.index.isin(I_vector.index)]
    .loc[:, lambda frame: frame.ne(0).any(axis=0)]
)
OD

Unnamed: 0,roast,cooked meat
dimethylethyl pyrazine,1,0
3-mercaptothiophene,0,1


#### 3. OD ----- Roche Data -----> OQ
Use ontology to find matching OQ

In [608]:
# Load the OD -> OQ table; the file has no header row, so columns are numbered.
OD_OQ_df = pd.read_csv('./data/DATA_OD-2-OQ.csv', header=None)
# Drop every row whose columns after the first are all 0.
OD_OQ_df = OD_OQ_df[OD_OQ_df.iloc[:, 1:].ne(0).any(axis=1)]


In [583]:
# Names of the OQ columns, in the order they are later assigned to the
# columns of the OD -> OQ table.
OQ_descriptors = [
    "Almond", "Woody", "Camphor", "Leather", "Cooked", "Spicy", "Floral",
    "Fresh", "Fruity", "Smoky", "Cut-grass", "Lactonic", "Vegetable", "Honey",
    "Peel", "Sulfurous", "Toasty", "Vanilla", "Green", "Vinous", "Violet",
]

<span style="color:#FBB714">fuzzy_OQs</span>: Dataframe - fuzzy_ODs x OQs

In [609]:
# For every molecule, fuzzily match its descriptors against the OD labels in
# column 0 of OD_OQ_df (case-insensitively, fuzz.ratio >= 60).
od_labels = OD_OQ_df[0]
od_labels_lower = [str(label).lower() for label in od_labels]  # loop-invariant

ms_fuzzy_ODs = {}
matched_rows = {}  # row index -> None; a dict used as an insertion-ordered set
for key, value in mol_flavors.items():
    all_OQS = []
    for row_index, label, label_lower in zip(od_labels.index, od_labels, od_labels_lower):
        # any(): a label hit by several keywords of the same molecule is
        # recorded once, not once per keyword.
        if any(fuzz.ratio(keyword, label_lower) >= 60 for keyword in value):
            all_OQS.append(label)
            matched_rows[row_index] = None
    ms_fuzzy_ODs[key] = all_OQS

# Select each matched row exactly once (first-seen order). Using .loc also
# keeps all columns of OD_OQ_df when nothing matched, so the result can still
# be given column names afterwards.
fuzzy_OQs = OD_OQ_df.loc[list(matched_rows)]


In [610]:
OQ_descriptors = [
    "Almond", "Woody", "Camphor", "Leather", "Cooked", "Spicy", "Floral",
    "Fresh", "Fruity", "Smoky", "Cut-grass", "Lactonic", "Vegetable", "Honey",
    "Peel", "Sulfurous", "Toasty", "Vanilla", "Green", "Vinous", "Violet",
]

# First column is the OD label, the remaining ones are the OQs.
fuzzy_OQs.columns = ["OD/OQ", *OQ_descriptors]
# Keep only the columns that contain at least one non-zero entry.
fuzzy_OQs = fuzzy_OQs.loc[:, fuzzy_OQs.ne(0).any(axis=0)]
fuzzy_OQs

Unnamed: 0,OD/OQ,Floral,Vegetable,Toasty
168,Rose,1,0,0
200,Toasted,0,0,1
58,Cooked vegetables,0,1,0


<span style="color:lightblue">ms_fuzzy_ODs</span>: dict with mol as keys and fuzzy_ODs as values 

In [611]:
# Shows, per molecule, which fuzzily matched OD labels it contributed.
ms_fuzzy_ODs

{'dimethylethyl pyrazine': ['Rose', 'Toasted'],
 '3-mercaptothiophene': ['Cooked vegetables'],
 'δ-muurolene': []}

<span style="color:#FBB714">OQs</span>: Dataframe - molecules x OQs

In [612]:
# Empty frame: one row per MS molecule, one column per OQ that survived the
# column filter (everything after the leading OD-label column).
OQs = pd.DataFrame(index=ms_molecules_ex, columns=fuzzy_OQs.columns[1:].tolist())


<span style="color:lightblue">OQ_dict</span>: dictionary - key = molecule, value = OQs

In [613]:
# For every molecule, look up the OQs flagged (value 1) for each of its fuzzy
# ODs, record them in OQ_dict and set the matching cells of OQs to 1.
OQ_dict = {}
for key, value in ms_fuzzy_ODs.items():
    current_OQ = []
    for fuzzy_OD in value:
        for index, row in fuzzy_OQs.iterrows():
            # .iloc[0] reads the leading OD-label column by position; a bare
            # row[0] on a string-labelled Series is a deprecated fallback.
            if row.iloc[0] == fuzzy_OD:
                active_OQs = row.loc[row.eq(1)].index.tolist()
                # One list entry per OQ: an OD flagged for several OQs now
                # contributes each name separately instead of one
                # comma-joined string that no later lookup could match.
                current_OQ.extend(active_OQs)
                # Single .loc call with row and column labels; the chained
                # form OQs.loc[key][...] = 1 assigns into an intermediate
                # object and is not guaranteed to update OQs.
                OQs.loc[key, active_OQs] = 1
    OQ_dict[key] = current_OQ

In [614]:
# Display the molecule -> OQ names mapping built in the previous cell.
OQ_dict

{'dimethylethyl pyrazine': ['Floral', 'Toasty'],
 '3-mercaptothiophene': ['Vegetable'],
 'δ-muurolene': []}

In [615]:
# Cells that were never set to 1 are still missing: turn them into 0.
# Going through float first makes the final integer dtype explicit instead of
# depending on fillna() to downcast an object-dtype frame.
OQs = OQs.astype(float).fillna(0).astype("int64")
OQs

Unnamed: 0,Floral,Vegetable,Toasty
dimethylethyl pyrazine,1,0,1
3-mercaptothiophene,0,1,0
δ-muurolene,0,0,0


In [179]:
# Earlier attempt at selecting the OQ rows, kept as a string literal: it does
# not execute, and the cell merely evaluates to the string itself.
# NOTE(review): inside it, fuzz.ratio(keyword, found_flavors) scores a keyword
# against the whole found_flavors collection rather than a single label.
'''OQs = OD_OQ_df[OD_OQ_df.apply(lambda row: any(keyword in str(row).lower() for keyword in found_flavors if fuzz.ratio(keyword, found_flavors)>=15), axis=1)]
OQs
# TODO: e.g. make fat belong to fatty, oil to oily etc. 
# ODs = 'fat', 'fresh', 'green', 'moss', 'oil', 'sweet'
'''

"OQs = OD_OQ_df[OD_OQ_df.apply(lambda row: any(keyword in str(row).lower() for keyword in found_flavors if fuzz.ratio(keyword, found_flavors)>=15), axis=1)]\nOQs\n# TODO: e.g. make fat belong to fatty, oil to oily etc. \n# ODs = 'fat', 'fresh', 'green', 'moss', 'oil', 'sweet'\n"

#### 4.1 OQ ------ Binary Matrix -----> OSA


In [616]:
# Semicolon-separated OQ -> OSA table.
OSA_data = pd.read_csv('./data/OQ_OSA.csv', sep=';')
OSA_data


Unnamed: 0,OQ/OSA,floral,fruit,solventy,soapy,sweet,wood,nutty,spicy,oily,...,feinty,cereal,green/grassy,malt,primary taste,mouthfeel,nasal effects,dried fruits,aftertaste,complexity
0,fresh flowers,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,floral,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,green,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,perfumed,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,fresh fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,dried fruit,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,citrus,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,artificial fruit flavorings,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,nail varnish remover,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [617]:
# Re-display the molecules x OQs table that the OSA lookup below starts from.
OQs

Unnamed: 0,Floral,Vegetable,Toasty
dimethylethyl pyrazine,1,0,1
3-mercaptothiophene,0,1,0
δ-muurolene,0,0,0


<span style="color:lightblue">mol_OSA</span>: dict - keys = molecules x value = OSAs

In [618]:
# For every molecule, fuzzily match each of its OQs against the labels in the
# first column of OSA_data (fuzz.ratio >= 60) and collect the matching labels.
mol_OSA = {}
osa_labels = [str(label) for label in OSA_data.iloc[:, 0]]  # loop-invariant
for key, value in OQ_dict.items():
    current_OSA = []
    for OQ in value:
        OQ_lower = str(OQ).lower()
        for label in osa_labels:
            # Lowercase BOTH sides: the OQ names are capitalised ("Floral"),
            # so lowercasing only the table label penalised every comparison
            # for a mere difference in case.
            if fuzz.ratio(OQ_lower, label.lower()) >= 60:
                current_OSA.append(label)
    # De-duplicate while keeping first-seen order, so the result (and the
    # column order of the matrix built from it) is the same on every run.
    mol_OSA[key] = list(dict.fromkeys(current_OSA))

print(mol_OSA)

{'dimethylethyl pyrazine': ['floral', 'toasted'], '3-mercaptothiophene': ['green vegetables', 'cooked vegetables'], 'δ-muurolene': []}


<span style="color:#FBB714">OSA</span>: Dataframe - molecules x OSAs

In [619]:
# Binary matrix: one row per molecule (key of mol_OSA), one column per OSA
# label, 1 where the molecule was matched to that label.
OSA = dict_to_matrix(mol_OSA)
OSA

Unnamed: 0,green vegetables,cooked vegetables,floral,toasted
dimethylethyl pyrazine,0,0,1,1
3-mercaptothiophene,1,1,0,0
δ-muurolene,0,0,0,0


#### normalize intensities ?
#### what to do if 2 molecules have the same OSA - add their intensities?

In [620]:
# Re-display the raw intensities before they are normalised.
I_vector

Unnamed: 0,intensities
dimethylethyl pyrazine,80
3-mercaptothiophene,40
δ-muurolene,5


In [621]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales each *row* to unit L2 norm, so the intensity column is
# transposed into a single row, scaled, and transposed back into a column.
normalizer = Normalizer(norm="l2")
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized

array([[0.89303292],
       [0.44651646],
       [0.05581456]])

In [622]:
# Weight each molecule's OSA row by that molecule's normalised intensity.
# The weights are labelled with the molecule names and aligned to OSA's index,
# so the result no longer depends on OSA and I_vector listing the molecules in
# exactly the same order (a molecule missing from I_vector yields NaN instead
# of silently receiving another molecule's weight).
intensity_weights = pd.Series(
    np.asarray(I_vector_normalized).ravel(), index=I_vector.index
).reindex(OSA.index)
final_OSA = OSA.mul(intensity_weights, axis=0)
final_OSA

Unnamed: 0,green vegetables,cooked vegetables,floral,toasted
dimethylethyl pyrazine,0.0,0.0,0.893033,0.893033
3-mercaptothiophene,0.446516,0.446516,0.0,0.0
δ-muurolene,0.0,0.0,0.0,0.0


#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use fuzzy logic to weigh the OQs with expert knowledge and the intensities from the mass spec data


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile