## Pipeline Flavor Profile

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv



In [2]:
'''
does convert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


In [3]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap

In [4]:
'''
Convert DataFrame to dictionary
'''
def dataframe_to_dict(df):
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     

In [5]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 80
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = []
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    all_matches.append(index)
                    found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    return(found_match, found_match_per_entry)

In [None]:
'''
create a new matrix by finding matches and filtering right columns
'''
def create_matrix_from_dict(dictionary,df):
    matrix = pd.DataFrame(columns=list(df), index=ms_molecules_ex)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = current_val
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)
   

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [6]:
# DUMMY DATA
intensities = [80, 40, 5]
ms_molecules_ex =['(E)-3-hexenol', "coumarin",'citral']
mol_int = {'(E)-3-hexenol': 80, "coumarin": 40 ,'citral': 5}

<span style="color:#FBB714">I_vector</span> : dataframe with *molecules x intensities*

In [7]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector = pd.DataFrame(data= intensities,index = ms_molecules_ex, columns=["intensities"])
I_vector

Unnamed: 0,intensities
(E)-3-hexenol,80
coumarin,40
citral,5


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

*Flavornet only*

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [8]:
'''
read flavornet data and save it in mol_to_OD
'''
# only using flavornet data
mol_to_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        OD = row[3]
        molecules.append(molecule)
        descriptors.append(OD)
        if molecule in mol_to_OD:
            mol_to_OD[molecule].add(OD)
        else:
            mol_to_OD[molecule]  = set([OD])

#list with all lecules from flavornet
molecules = list(set(molecules))
# list of all individual descriptors/ODs
descriptors = list(set(descriptors))

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x ODs* - rigth now only flavornet database

In [9]:
D = pd.DataFrame(index=molecules, columns=descriptors)

In [10]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,burnt sugar,dust,mandarin,dill,wine,sulfur,green leaf,straw,prune,bitter almond,...,alkaline,must,rancid,cocoa,sour,yeast,camphor,pea,celery,plastic
furfural,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
methyl jasmonate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(E)-farnesol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(Z)-β-Farnesene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
methyldihydrothiophenone,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R-δ-decenolactone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
p-menthadienhydroperoxide,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ethylmethyl pyrazine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
α-p-dimethylstyrene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [66]:
#TODO: Maybe also use a fuzzy ratio here depending on the writing of the moleucles -> reduce mistakes
flavors = []
mol_flavors = {}
for mol in ms_molecules_ex:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]
    else:
        mol_flavors[mol] = {}

found_flavors = set(flavors)


<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 


In [67]:
found_flavors

{'fresh', 'green', 'lemon', 'moss', 'sweet'}


<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [13]:
mol_flavors

{'(E)-3-hexenol': {'fresh', 'moss'},
 'coumarin': {'green', 'sweet'},
 'citral': {'lemon'}}

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [14]:
# filters only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,lemon,moss,fresh,green,sweet
(E)-3-hexenol,0,1,1,0,0
citral,1,0,0,0,0
coumarin,0,0,0,1,1


#### 3. OD ----------> OQ


### Whisky Research institute

In [90]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
Research_OQ_OD =  Research_data.drop(columns=Research_data.columns[0])
Research_OSA_OQ = Research_data.drop(columns=Research_data.columns[2])

In [91]:
OD_OQ = dict_to_matrix(dataframe_to_dict(Research_OQ_OD)).T
OQ_OSA = dict_to_matrix(dataframe_to_dict(Research_OSA_OQ)).T
# filter out rows with only 0
OD_OQ = OD_OQ.loc[(OD_OQ.iloc[:,1:] != 0).any(axis=1)]
OQ_OSA = OQ_OSA.loc[(OQ_OSA.iloc[:,1:] != 0).any(axis=1)]


<span style="color:#FBB714">OD_OQ</span>: Dataframe - all ODs x OQs from research institute

In [92]:
OD_OQ

Unnamed: 0,Burnt,Smoky,Medicinal,Leathery,Tobacco,Sweaty,Dry cereals,Wet cereals,leafy,Herbal,...,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
first aid kit,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
scented,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cabbage,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Burning wood,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Linseed oil,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCP,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Corked,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Oily,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
dairy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<span style="color:#FBB714">OQ_OSA</span>: Dataframe - all OQs x OSAs from research institute

In [17]:
#OQ_OSA

In [93]:
# find matching OQs to found ODs from massspec x flavornet
matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)

In [94]:
# get only important columns
matching_OQs = matching_OQs.loc[:, (matching_OQs != 0).any(axis=0)]
matching_OQs

Unnamed: 0,Citrus,Sweet
Sweet,0,1
Lemons,1,0


In [95]:
# to know which OD comes from which molecule
matching_OQs_dict

{'(E)-3-hexenol': [], 'coumarin': ['Sweet'], 'citral': ['Lemons']}

<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [96]:
OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs)

In [97]:
OQs

Unnamed: 0,Citrus,Sweet
(E)-3-hexenol,0,0
coumarin,0,1
citral,1,0


#### 4.1 OQ ------ Binary Matrix -----> OSA


In [98]:
OQ_OSA

Unnamed: 0,peaty,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Earthy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Coconut,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Leathery,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Stale,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Green vegetables,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Floral,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
New wood,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
Vinegary,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
Nuts,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Peppery,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [99]:
# find matching OQs to found ODs from massspec x flavornet
matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)

In [100]:
matching_OSAs

Unnamed: 0,peaty,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Sweet,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Citrus,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


<span style="color:#FBB714">OSAs</span>: Dataframe - OQs x OSAs

In [101]:
OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs)

In [102]:
OSAs

Unnamed: 0,Fruity,Sweet
(E)-3-hexenol,0,0
coumarin,0,1
citral,1,0


In [103]:
I_vector

Unnamed: 0,intensities
(E)-3-hexenol,80
coumarin,40
citral,5


In [104]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized

array([[0.89303292],
       [0.44651646],
       [0.05581456]])

In [105]:
final_OSA = OSAs*I_vector_normalized
final_OSA

Unnamed: 0,Fruity,Sweet
(E)-3-hexenol,0.0,0.0
coumarin,0.0,0.446516
citral,0.055815,0.0


#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use Fuzzy logic to weigh OQ with expert knowledge/Intensity from Masspec


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile