## Pipeline Flavor Profile

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv



In [2]:
'''
does convert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


In [3]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap

In [4]:
'''
Convert DataFrame to dictionary
'''
def dataframe_to_dict(df):
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     

In [5]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 50
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = set()
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    if index not in all_matches:
                        all_matches.add(index)
                        found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    return(found_match, found_match_per_entry)

In [6]:
'''
create a new matrix by finding matches and filtering right columns
'''
def create_matrix_from_dict(dictionary,df):
    matrix = pd.DataFrame(columns=list(df), index=ms_cas)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = set(current_val)
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)
   

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [7]:
# DUMMY DATA
intensities = [80, 40, 5,15]
# only flavornet data use : ['(E)-3-hexenol', "2-dodecenal",'δ-muurolene']
ms_cas =['928-97-2', "20407-84-5",'120021-96-7', '18479-58-8']
# big book x flavorent: 
# ms_cas = ['dimethylethyl pyrazine', "3-mercaptothiophene",'δ-muurolene']
mol_int = {'928-97-2': 80, "20407-84-5": 40 ,'120021-96-7': 5, '18479-58-8': 15}

In [11]:
file_path = 'data/cas_intensities.csv'
data = pd.read_csv(file_path, delimiter=',')

In [12]:
data[data["replicate"]==1]

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,2.528854e+08,1.831223e+08,1.048295e+08,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,1.875943e+08,1.913892e+08,1.086362e+08,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,2.299686e+08,1.932470e+08,1.523761e+08,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,2.857401e+08,2.531031e+08,1.654314e+08,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,1.823817e+08,1.855528e+08,1.090820e+08,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,2.592504e+08,1.471158e+08,9.689814e+07,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,3.308782e+08,2.881055e+08,1.905095e+08,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,3.798451e+08,2.186666e+08,1.829394e+08,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,2.587659e+08,3.349072e+08,1.353320e+08,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08


In [14]:
# use sample 920 repliacte 1
used_data = data[data["sample_key"]=="000920"]
used_data = used_data[used_data["replicate"]==1]
used_data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
717,656,1,282584200.0,2133618000.0,137612200.0,112911700.0,124194200.0,124194200.0,759420800.0,282584200.0,...,309010600.0,124194200.0,137612200.0,574309600.0,574309600.0,193439700.0,124194200.0,124194200.0,309010600.0,311112400.0


In [11]:
#filter columns that are 0
used_data = used_data.loc[:, (used_data != 0).any(axis=0)]

In [12]:
used_data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,920,1,194354200.0,1240854000.0,104829500.0,90873190.0,183122300.0,183122300.0,604309100.0,194354200.0,...,252885400.0,183122300.0,104829500.0,464570600.0,464570600.0,189377700.0,183122300.0,183122300.0,252885400.0,266212400.0


In [13]:

used_data = used_data.drop(columns=['sample_key', 'replicate'])
largest_entries = used_data.iloc[0].nlargest(50)
# Create a new DataFrame with only the largest entries
data_top50 = pd.DataFrame({'Intensities': largest_entries})
data_top50

Unnamed: 0,Intensities
3033-23-6,1240854000.0
876-17-5,1240854000.0
07.11.5258,1240854000.0
21662-09-9,1240854000.0
491-04-3,1240854000.0
562-74-3,1240854000.0
470-67-7,1240854000.0
470-82-6,1240854000.0
2497-25-8,1240854000.0
3913-71-1,1240854000.0


In [14]:
ms_cas = list(data_top50.index)
ms_cas

['3033-23-6',
 '876-17-5',
 '07.11.5258',
 '21662-09-9',
 '491-04-3',
 '562-74-3',
 '470-67-7',
 '470-82-6',
 '2497-25-8',
 '3913-71-1',
 '3913-81-3',
 '4861-58-9',
 '586-82-3',
 '105683-99-6',
 '147159-48-6',
 '29887-38-5',
 '507-70-0',
 '465-31-6',
 '15537-55-0',
 '106-23-0',
 '619-01-2',
 '128386-31-2',
 '1632-73-1',
 '106-24-1',
 '124-76-5',
 '18675-35-9',
 '16750-94-0',
 '5944-20-7',
 '498-16-8',
 '78-70-6',
 '89-80-5',
 '10458-14-7',
 '106-25-2',
 '619-62-5',
 '18479-68-0',
 '16721-39-4',
 '91-10-1',
 '17699-16-0',
 '14575-74-7',
 '98-55-5',
 '138-87-4',
 '541-58-2',
 '3581-91-7',
 '20407-84-5',
 '134346-43-3',
 '19700-21-1',
 '1200-67-5',
 '115-99-1',
 '1189-09-9',
 '3943-74-6']

In [15]:
intensities = list(data_top50['Intensities'])
intensities

[1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1240854213.7282832,
 1235600535.8503914,
 1235600535.8503914,
 801165438.8323025,
 801165438.8323025,
 801165438.8323025,
 801165438.8323025,
 801165438.832302

<span style="color:#FBB714">I_vector</span> : dataframe with *molecules x intensities*

In [16]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector = pd.DataFrame(data= intensities,index = ms_cas, columns=["intensities"])
I_vector

Unnamed: 0,intensities
3033-23-6,1240854000.0
876-17-5,1240854000.0
07.11.5258,1240854000.0
21662-09-9,1240854000.0
491-04-3,1240854000.0
562-74-3,1240854000.0
470-67-7,1240854000.0
470-82-6,1240854000.0
2497-25-8,1240854000.0
3913-71-1,1240854000.0


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

*Flavornet only*

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [32]:
'''
read flavornet data and save it in mol_to_OD
'''
# only using flavornet data
mol_to_OD = {}
molecules = []
descriptors = []
CAS_numbers= []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        CAS = row[1]
        OD = row[3]
        molecules.append(molecule)
        CAS_numbers.append(CAS)
        descriptors.append(OD)
        if CAS in mol_to_OD:
            mol_to_OD[CAS].add(OD)
        else:
            mol_to_OD[CAS]  = set([OD])

CAS_numbers = list(set(CAS_numbers))
molecules = list(set(molecules))
descriptors = list(set(descriptors))

In [35]:
mol_to_OD

{'6485-40-1': {'mint'},
 '3033-23-6': {'rose', 'sweet'},
 '21284-22-0': {'green tea', 'herb', 'spice'},
 '38427-78-0': {'terpentine'},
 '495-61-4': {'balsamic'},
 '29873-99-2': {'green', 'oil', 'wood'},
 '6909-30-4': {'green'},
 '99-49-0': {'basil', 'caraway', 'fennel', 'mint'},
 '2244-16-8': {'caraway'},
 '876-17-5': {'flower', 'green'},
 '5989-27-5': {'citrus', 'mint'},
 '4674-50-4': {'grapefruit'},
 '2243-33-6': {'dill'},
 '10307-61-6': {'apple'},
 '69064-37-5': {'oil'},
 '85761-70-2': {'green', 'leaf'},
 '6728-26-3': {'apple', 'fat', 'green', 'leaf', 'rancid'},
 '13419-69-7': {'fat', 'must'},
 '928-95-0': {'green', 'leaf', 'walnut'},
 '18829-56-6': {'cucumber', 'fat', 'green'},
 '1576-96-1': {'mushroom'},
 '53448-07-0': {'fat', 'green', 'soap'},
 '928-97-2': {'fresh', 'moss'},
 '1197-07-5': {'caraway', 'solvent'},
 '14371-10-9': {'cinnamon', 'paint'},
 '09.04.5948': {'herb', 'warm'},
 '106-28-5': {'muguet'},
 '5273-85-8': {'flower', 'spice'},
 '5932-68-3': {'flower'},
 '4959-35-7':

In [201]:
big_book =  pd.read_table('./data/bigBook.txt', sep=',')

*Flavornet x Big Book*

In [18]:
'''# filter only those descriptor that are available in flavorent and in our big book
mol_to_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        OD = row[3]
        for item in big_book['label']:
            if (fuzz.ratio(OD, item) >= 60):
                molecules.append(molecule)
                descriptors.append(item)
                if molecule in mol_to_OD:
                    mol_to_OD[molecule].add(item)
                else:
                    mol_to_OD[molecule]  = set([item])

molecules = list(set(molecules))
descriptors = list(set(descriptors))'''

"# filter only those descriptor that are available in flavorent and in our big book\nmol_to_OD = {}\nmolecules = []\ndescriptors = []\nwith open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:\n    reader = csv.reader(tabfile, delimiter=',')\n    for row in reader: \n        molecule = row[0]\n        OD = row[3]\n        for item in big_book['label']:\n            if (fuzz.ratio(OD, item) >= 60):\n                molecules.append(molecule)\n                descriptors.append(item)\n                if molecule in mol_to_OD:\n                    mol_to_OD[molecule].add(item)\n                else:\n                    mol_to_OD[molecule]  = set([item])\n\nmolecules = list(set(molecules))\ndescriptors = list(set(descriptors))"

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x descriptors* - rigth now only flavornet database

In [227]:
D = pd.DataFrame(index=CAS_numbers, columns=descriptors)

In [238]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,peppermint,tobacco,grape,meat,fecal,violet,herb,citrus,fruit,caramel,...,smoke,putrid,tar,pear,nut,butterscotch,turpentine,lemon,warm,cooked potato
112-61-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13744-15-5,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5392-40-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1117-55-1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24703-35-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24415-26-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110-43-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3681-71-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2785-87-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [239]:
flavors = []
mol_flavors = {}
for mol in ms_cas:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]
    else:
        mol_flavors[mol] = {}

found_flavors = set(flavors)


<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 

In [240]:
found_flavors

{'citrus',
 'cologne',
 'fat',
 'fresh',
 'green',
 'moss',
 'oil',
 'sweet',
 'tart lime'}



<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [242]:
mol_flavors

{'928-97-2': {'fresh', 'moss'},
 '20407-84-5': {'fat', 'green', 'sweet'},
 '120021-96-7': {'oil'},
 '18479-58-8': {'citrus', 'cologne', 'tart lime'}}

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [243]:
# filters only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,citrus,sweet,tart lime,fat,moss,oil,cologne,green,fresh
18479-58-8,1,0,1,0,0,0,1,0,0
120021-96-7,0,0,0,0,0,1,0,0,0
20407-84-5,0,1,0,1,0,0,0,1,0
928-97-2,0,0,0,0,1,0,0,0,1


#### 3. OD ----------> OQ


##### Roche data
Use ontology to find matching OQ

In [244]:
# filter out 
OD_OQ =  pd.read_csv('./data/DATA_OD-2-OQ.csv', index_col=0)
# filter out rows with only 0 
OD_OQ = OD_OQ.loc[(OD_OQ.iloc[:,1:] != 0).any(axis=1)]


<span style="color:#FBB714">OD_OQ</span>: Dataframe - all ODs x OQs

In [245]:
OD_OQ

Unnamed: 0,Almond,Woody,Camphor,Leather,Cooked,Spicy,Floral,Fresh,Fruity,Smoky,...,Lactonic,Vegetable,Honey,Peel,Sulfurous,Toasty,Vanilla,Green,Vinous,Violet
AGRUMES,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Apple,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Apple peels,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
Apricot,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Banana,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Wine-yeast,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Winey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Woody,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [292]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 50
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = []
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    if index not in all_matches:
                        all_matches.append(index)
                        found_match.append(row)
        found_match_per_entry[key] = set(all_matches)
    found_match = pd.DataFrame(found_match)
    return(found_match, found_match_per_entry)

In [284]:
mol_flavors

{'928-97-2': {'fresh', 'moss'},
 '20407-84-5': {'fat', 'green', 'sweet'},
 '120021-96-7': {'oil'},
 '18479-58-8': {'citrus', 'cologne', 'tart lime'}}

In [338]:
# find matching OQs to found ODs from massspec x flavornet
matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)

In [339]:
# get only important columns
matching_OQs = matching_OQs.loc[:, (matching_OQs != 0).any(axis=0)]
matching_OQs

Unnamed: 0,Woody,Leather,Cooked,Spicy,Floral,Fresh,Fruity,Cut-grass,Lactonic,Vegetable,Honey,Sulfurous,Toasty,Green,Vinous
AGRUMES,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
FRUITE,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
FRUITS SECS,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Fruity-fresh,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
Mimosa,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Radish,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Rose,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
SOUFRE,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
AGRUMES,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Eugenol,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [340]:
# to know which OD comes from which molecule
matching_OQs_dict

{'928-97-2': {'AGRUMES',
  'FRUITE',
  'FRUITS SECS',
  'Fruity-fresh',
  'Mimosa',
  'Radish',
  'Rose',
  'SOUFRE'},
 '20407-84-5': {'AGRUMES',
  'Eugenol',
  'Grape',
  'Green',
  'Green fruity',
  'Green-fruity',
  'Greener gassy',
  'Leafy-green',
  'Sweet-ethereal',
  'Weedy'},
 '120021-96-7': {'BOISE', 'Tropical'},
 '18479-58-8': {'AGRUMES',
  'CARAMEL',
  'CUIR',
  'Castoreum',
  'Citrus',
  'Clove',
  'Corn',
  'FRUITS ROUGES',
  'Fruity-rum',
  'Honey',
  'LACTONE',
  'Narcissus',
  'Toasted'}}

<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [341]:
OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs)

In [342]:
OQs

Unnamed: 0,Woody,Leather,Cooked,Spicy,Floral,Fresh,Fruity,Cut-grass,Lactonic,Vegetable,Honey,Sulfurous,Toasty,Green,Vinous
928-97-2,0,0,0,0,1,1,1,0,0,1,0,1,0,0,0
20407-84-5,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0
120021-96-7,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
18479-58-8,0,1,1,1,1,0,1,0,1,1,1,0,1,0,1


<span style="color:lightblue">OQ_dict</span>: dictionary with key = molecule, value = OQs

In [343]:
OQ_dict

{'928-97-2': {'Floral', 'Fresh,Fruity', 'Fruity', 'Sulfurous', 'Vegetable'},
 '20407-84-5': {'Cut-grass', 'Fruity', 'Fruity,Green', 'Green', 'Spicy'},
 '120021-96-7': {'Fruity', 'Woody'},
 '18479-58-8': {'Cooked',
  'Floral',
  'Fruity',
  'Fruity,Vinous',
  'Honey',
  'Lactonic',
  'Leather',
  'Spicy',
  'Toasty',
  'Vegetable'}}

#### 4.1 OQ ------ Binary Matrix -----> OSA


In [344]:
OQ_OSA = pd.read_csv('./data/OQ_OSA.csv', delimiter=';', index_col=0)
OQ_OSA

Unnamed: 0,floral,fruit,solventy,soapy,sweet,wood,nutty,spicy,oily,sour,...,feinty,cereal,green/grassy,malt,primary taste,mouthfeel,nasal effects,dried fruits,aftertaste,complexity
fresh flowers,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
perfumed,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fresh fruit,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dried fruit,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
citrus,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
artificial fruit flavorings,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nail varnish remover,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
unperfumed soap,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vanilla,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
honey,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [345]:
# find matching OSAs to found OQs from massspec x flavornet
matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)

In [346]:
matching_OSAs

Unnamed: 0,floral,fruit,solventy,soapy,sweet,wood,nutty,spicy,oily,sour,...,feinty,cereal,green/grassy,malt,primary taste,mouthfeel,nasal effects,dried fruits,aftertaste,complexity
fresh fruit,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dried fruit,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
cooked vegetables,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
earthy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
musty,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
herbal,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
green vegetables,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
earthy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
musty,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
grass,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


<span style="color:lightblue">OSA_dict</span>: dict - keys = molecules x value = OSAs

In [347]:
OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs)

In [37]:
# todo: count appeareance of OSA in each odorant 

In [348]:
OSA_dict

{'928-97-2': {'fruit',
  'fruit,dried fruits',
  'green/grassy',
  'stale',
  'sulphury'},
 '20407-84-5': {'green/grassy', 'stale'},
 '120021-96-7': {'stale'},
 '18479-58-8': {'feinty',
  'green/grassy',
  'nutty',
  'peat',
  'stale',
  'sulphury',
  'sweet',
  'wood'}}

<span style="color:#FBB714">OSAs</span>: Dataframe - OQs x OSAs

In [349]:
OSAs

Unnamed: 0,fruit,sweet,wood,nutty,sulphury,stale,peat,feinty,green/grassy,dried fruits
928-97-2,1,0,0,0,1,1,0,0,1,1
20407-84-5,0,0,0,0,0,1,0,0,1,0
120021-96-7,0,0,0,0,0,1,0,0,0,0
18479-58-8,0,1,1,1,1,1,1,1,1,0


In [350]:
OSA_dict

{'928-97-2': {'fruit',
  'fruit,dried fruits',
  'green/grassy',
  'stale',
  'sulphury'},
 '20407-84-5': {'green/grassy', 'stale'},
 '120021-96-7': {'stale'},
 '18479-58-8': {'feinty',
  'green/grassy',
  'nutty',
  'peat',
  'stale',
  'sulphury',
  'sweet',
  'wood'}}

In [351]:
I_vector

Unnamed: 0,intensities
928-97-2,80
20407-84-5,40
120021-96-7,5
18479-58-8,15


In [352]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized

array([[0.88077101],
       [0.44038551],
       [0.05504819],
       [0.16514456]])

In [353]:
final_OSA = OSAs*I_vector_normalized
final_OSA

Unnamed: 0,fruit,sweet,wood,nutty,sulphury,stale,peat,feinty,green/grassy,dried fruits
928-97-2,0.880771,0.0,0.0,0.0,0.880771,0.880771,0.0,0.0,0.880771,0.880771
20407-84-5,0.0,0.0,0.0,0.0,0.0,0.440386,0.0,0.0,0.440386,0.0
120021-96-7,0.0,0.0,0.0,0.0,0.0,0.055048,0.0,0.0,0.0,0.0
18479-58-8,0.0,0.165145,0.165145,0.165145,0.165145,0.165145,0.165145,0.165145,0.165145,0.0


In [354]:
final_OSA*10

Unnamed: 0,fruit,sweet,wood,nutty,sulphury,stale,peat,feinty,green/grassy,dried fruits
928-97-2,8.80771,0.0,0.0,0.0,8.80771,8.80771,0.0,0.0,8.80771,8.80771
20407-84-5,0.0,0.0,0.0,0.0,0.0,4.403855,0.0,0.0,4.403855,0.0
120021-96-7,0.0,0.0,0.0,0.0,0.0,0.550482,0.0,0.0,0.0,0.0
18479-58-8,0.0,1.651446,1.651446,1.651446,1.651446,1.651446,1.651446,1.651446,1.651446,0.0


In [355]:
list(final_OSA.sum())

[0.8807710121010885,
 0.1651445647689541,
 0.1651445647689541,
 0.1651445647689541,
 1.0459155768700426,
 1.541349271176905,
 0.1651445647689541,
 0.1651445647689541,
 1.486301082920587,
 0.8807710121010885]

In [63]:
#normalizer.fit_transform(np.array(final_OSA.sum()).reshape(1,- 1))*10

array([[3.24509566, 0.60845544, 0.60845544, 0.60845544, 3.85355109,
        5.6789174 , 0.60845544, 0.60845544, 5.47609892, 3.24509566]])

#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use Fuzzy logic to weigh OQ with expert knowledge/Intensity from Masspec


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile

*overall flavorprofile of predicted OSAs*

In [356]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=(normalizer.fit_transform(np.array(final_OSA.sum()).reshape(1,- 1))*10)[0],
  theta=list(final_OSA.columns),
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True
    ),
  ),
  showlegend=False
)

fig.show()

*comparing OSAs with panelist scores*

In [357]:
# filter out our 8 comparable OSAs 
comparable_OSAs = ["floral","fruit","wood","complexity","malt","sweet","peat","dried fruits"]
filtered_OSAs = pd.DataFrame(columns=comparable_OSAs)
# Populate the new DataFrame based on original DataFrame
for column in filtered_OSAs:
    if column in final_OSA.columns:
        filtered_OSAs[column] = final_OSA[column]
    else:
        filtered_OSAs[column] = 0

filtered_OSAs



Unnamed: 0,floral,fruit,wood,complexity,malt,sweet,peat,dried fruits
928-97-2,,0.880771,0.0,0,0,0.0,0.0,0.880771
20407-84-5,,0.0,0.0,0,0,0.0,0.0,0.0
120021-96-7,,0.0,0.0,0,0,0.0,0.0,0.0
18479-58-8,,0.0,0.165145,0,0,0.165145,0.165145,0.0


In [314]:
import plotly.graph_objects as go

panelist_scores = [40,30,50,40,30,40,30,40] 
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=[value / 10 for value in panelist_scores],
      theta=comparable_OSAs,
      fill='toself',
      name='Product A'
))
fig.add_trace(go.Scatterpolar(
      r=list(filtered_OSAs.sum()*10),
      theta=comparable_OSAs,
      fill='toself',
      name='Product B'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()