## Pipeline Flavor Profile

In [289]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv

In [290]:
'''
does convert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


In [291]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap

In [292]:
'''
Convert DataFrame to dictionary
'''
def dataframe_to_dict(df):
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     

In [293]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 80
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = set()
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    all_matches.add(index)
                    found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    found_match = found_match.drop_duplicates()
    return(found_match, found_match_per_entry)

In [294]:
'''
create a new matrix by finding matches and filtering right columns
'''
def create_matrix_from_dict(dictionary,df):
    matrix = pd.DataFrame(columns=list(df), index=ms_cas)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = current_val
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)
   

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [295]:
# DUMMY DATA
intensities = [80, 40, 5]
ms_cas =['(E)-3-hexenol', "coumarin",'citral']
mol_int = {'(E)-3-hexenol': 80, "coumarin": 40 ,'citral': 5}

In [296]:
# actual ms data
file_path = 'data/cas_intensities.csv'
data = pd.read_csv(file_path, delimiter=',')
# only use first repliate
data = data[data["replicate"]==1]
# use sample 920 repliacte 1
used_data = data[data["sample_key"]=="000920"]
used_data = used_data[used_data["replicate"]==1]
#filter columns that are 0
used_data = used_data.loc[:, (used_data != 0).any(axis=0)]
used_data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,920,1,194354200.0,1240854000.0,104829500.0,90873190.0,183122300.0,183122300.0,604309100.0,194354200.0,...,252885400.0,183122300.0,104829500.0,464570600.0,464570600.0,189377700.0,183122300.0,183122300.0,252885400.0,266212400.0


In [297]:

used_data = used_data.drop(columns=['sample_key', 'replicate'])
#largest_entries = used_data.iloc[0].nlargest(100)
largest_entries = used_data.iloc[0]

# Create a new DataFrame with only the largest entries
data_top50 = pd.DataFrame({'Intensities': largest_entries})
data_top50

Unnamed: 0,Intensities
6485-40-1,1.943542e+08
3033-23-6,1.240854e+09
21284-22-0,1.048295e+08
38427-78-0,9.087319e+07
495-61-4,1.831223e+08
...,...
713-95-1,1.893777e+08
20307-84-0,1.831223e+08
120021-96-7,1.831223e+08
586-62-9,2.528854e+08


In [298]:
ms_cas = list(data_top50.index)

In [299]:
intensities = list(data_top50['Intensities'])

<span style="color:#FBB714">I_vector</span> : dataframe with *molecules x intensities*

In [300]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector = pd.DataFrame(data= intensities,index = ms_cas, columns=["intensities"])
I_vector

Unnamed: 0,intensities
6485-40-1,1.943542e+08
3033-23-6,1.240854e+09
21284-22-0,1.048295e+08
38427-78-0,9.087319e+07
495-61-4,1.831223e+08
...,...
713-95-1,1.893777e+08
20307-84-0,1.831223e+08
120021-96-7,1.831223e+08
586-62-9,2.528854e+08


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

*Flavornet only*

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [301]:
'''
read flavornet data and save it in mol_to_OD
'''
# only using flavornet data
mol_to_OD = {}
molecules = []
descriptors = []
CAS_numbers= []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        CAS = row[1]
        OD = row[3]
        molecules.append(molecule)
        CAS_numbers.append(CAS)
        descriptors.append(OD)
        if CAS in mol_to_OD:
            mol_to_OD[CAS].add(OD)
        else:
            mol_to_OD[CAS]  = set([OD])


CAS_numbers = list(set(CAS_numbers))
#list with all molecules from flavornet
molecules = list(set(molecules))
# list of all individual descriptors/ODs
descriptors = list(set(descriptors))

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x ODs* - rigth now only flavornet database

In [302]:
D = pd.DataFrame(index=CAS_numbers, columns=descriptors)

In [303]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,baked,pea,roasted meat,cognac,truffle,biscuit,warm,spearmint,hawthorne,ester,...,alkane,lavender,crushed bug,grass,thiamin,urine,almond shell,tallow,nutmeg,lactone
31501-11-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97-47-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16721-39-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29926-41-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03.12.7554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87-91-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25779-13-9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34047-39-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3658-77-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [304]:
flavors = []
mol_flavors = {}
for mol in ms_cas:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]
    else:
        mol_flavors[mol] = {}

found_flavors = set(flavors)


<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 


In [305]:
found_flavors

{'acid',
 'alkane',
 'almond',
 'almond shell',
 'amine',
 'anise',
 'apple',
 'apple peel',
 'apple. rose',
 'apricot',
 'baked',
 'balsamic',
 'banana',
 'basil',
 'beet',
 'biscuit',
 'bitter almond',
 'black currant',
 'boiled vegetable',
 'box tree',
 'bread',
 'broccoli',
 'brown sugar',
 'burnt',
 'burnt sugar',
 'butter',
 'butterscotch',
 'cabbage',
 'camomile',
 'camphor',
 'caramel',
 'caraway',
 'cardboard',
 'carrot',
 'cat',
 'celery',
 'cheese',
 'chemical',
 'cinnamon',
 'citrus',
 'clove',
 'cocoa',
 'coconut',
 'coffee',
 'cognac',
 'cologne',
 'cooked meat',
 'cooked potato',
 'coriander',
 'cotton candy',
 'coumarin',
 'cream',
 'crushed bug',
 'cucumber',
 'curry',
 'dill',
 'dust',
 'earth',
 'ester',
 'ether',
 'fat',
 'fecal',
 'fennel',
 'fish',
 'flower',
 'foxy',
 'fresh',
 'fried',
 'fruit',
 'garlic',
 'gasoline',
 'geranium',
 'grape',
 'grapefruit',
 'grass',
 'green',
 'green bean',
 'green leaf',
 'green pepper',
 'green tea',
 'hawthorne',
 'hazelnut',


<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [306]:
mol_flavors

{'6485-40-1': {'mint'},
 '3033-23-6': {'rose', 'sweet'},
 '21284-22-0': {'green tea', 'herb', 'spice'},
 '38427-78-0': {'terpentine'},
 '495-61-4': {'balsamic'},
 '29873-99-2': {'green', 'oil', 'wood'},
 '6909-30-4': {'green'},
 '99-49-0': {'basil', 'caraway', 'fennel', 'mint'},
 '2244-16-8': {'caraway'},
 '876-17-5': {'flower', 'green'},
 '5989-27-5': {'citrus', 'mint'},
 '4674-50-4': {'grapefruit'},
 '2243-33-6': {'dill'},
 '10307-61-6': {'apple'},
 '69064-37-5': {'oil'},
 '13419-69-7': {'fat', 'must'},
 '928-95-0': {'green', 'leaf', 'walnut'},
 '18829-56-6': {'cucumber', 'fat', 'green'},
 '53448-07-0': {'fat', 'green', 'soap'},
 '928-97-2': {'fresh', 'moss'},
 '1197-07-5': {'caraway', 'solvent'},
 '14371-10-9': {'cinnamon', 'paint'},
 '09.04.5948': {'herb', 'warm'},
 '106-28-5': {'muguet'},
 '5273-85-8': {'flower', 'spice'},
 '5932-68-3': {'flower'},
 '4959-35-7': {'citrus', 'fresh'},
 '34995-77-2': {'flower'},
 '39638-67-0': {'coconut', 'flower'},
 '7212-40-0': {'fresh', 'mint'},
 

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [307]:
# filters only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,baked,pea,roasted meat,cognac,truffle,biscuit,warm,spearmint,hawthorne,ester,...,alkane,lavender,crushed bug,grass,thiamin,urine,almond shell,tallow,nutmeg,lactone
31501-11-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97-47-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16721-39-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29926-41-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03.12.7554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87-91-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25779-13-9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34047-39-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3658-77-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 3. OD ----------> OQ


### Whisky Research institute

In [308]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
Research_OQ_OD =  Research_data.drop(columns=Research_data.columns[0])
Research_OSA_OQ = Research_data.drop(columns=Research_data.columns[2])

In [309]:
OD_OQ = dict_to_matrix(dataframe_to_dict(Research_OQ_OD)).T
OQ_OSA = dict_to_matrix(dataframe_to_dict(Research_OSA_OQ)).T
# filter out rows with only 0
OD_OQ = OD_OQ.loc[(OD_OQ.iloc[:,1:] != 0).any(axis=1)]
OQ_OSA = OQ_OSA.loc[(OQ_OSA.iloc[:,1:] != 0).any(axis=1)]


<span style="color:#FBB714">OD_OQ</span>: Dataframe - all ODs x OQs from research institute

In [310]:
OD_OQ

Unnamed: 0,Burnt,Smoky,Medicinal,Leathery,Tobacco,Sweaty,Dry cereals,Wet cereals,leafy,Herbal,...,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Geranium leaves,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
malt extract,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
stale tobacco,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tomato plants,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
bovril,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
engine oil,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Malt vinegar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nutmeg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Green peppers,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<span style="color:#FBB714">OQ_OSA</span>: Dataframe - all OQs x OSAs from research institute

In [311]:
#OQ_OSA

In [312]:
# find matching OQs to found ODs from massspec x flavornet
matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)

In [313]:
# get only important columns
matching_OQs = matching_OQs.loc[:, (matching_OQs != 0).any(axis=0)]
matching_OQs

Unnamed: 0,Sweaty,Dry cereals,Green vegetables,Fresh flowers,Fresh fruit,Citrus,Artificial fruit flavourings Citrus,Vanilla,Toffee,Nuts,...,Cooked vegetables,Struck match,Earthy,Fruity,Solvently,Soapy,Sweet,Woody,Spicy,Oily
Roses,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sweet,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Spicy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Woody,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Oily,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
grapefruit,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Apples,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cucumber,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Soapy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Solvently,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [314]:
# to know which OD comes from which molecule
matching_OQs_dict

{'6485-40-1': set(),
 '3033-23-6': {'Roses', 'Sweet'},
 '21284-22-0': {'Spicy'},
 '38427-78-0': set(),
 '495-61-4': set(),
 '29873-99-2': {'Oily', 'Woody'},
 '6909-30-4': set(),
 '99-49-0': set(),
 '2244-16-8': set(),
 '876-17-5': set(),
 '5989-27-5': set(),
 '4674-50-4': {'grapefruit'},
 '2243-33-6': set(),
 '10307-61-6': {'Apples'},
 '69064-37-5': {'Oily'},
 '13419-69-7': set(),
 '928-95-0': set(),
 '18829-56-6': {'cucumber'},
 '53448-07-0': {'Soapy'},
 '928-97-2': set(),
 '1197-07-5': {'Solvently'},
 '14371-10-9': {'cinnamon'},
 '09.04.5948': set(),
 '106-28-5': set(),
 '5273-85-8': {'Spicy'},
 '5932-68-3': set(),
 '4959-35-7': set(),
 '34995-77-2': set(),
 '39638-67-0': set(),
 '7212-40-0': set(),
 '07.11.5258': set(),
 '80041-01-6': set(),
 '13474-59-4': {'Woody'},
 '23726-91-2': {'Apples'},
 '3779-61-1': {'Sweet'},
 '4128-17-0': {'Oily'},
 '1117-52-8': set(),
 '21661-99-4': set(),
 '56805-23-3': set(),
 '92356-06-4': set(),
 '15186-51-3': set(),
 '125811-37-2': set(),
 '928-96-1'

In [315]:
# to know which OD comes from which molecule
#remove empty entries
matching_OQs_dict = {key: value for key, value in matching_OQs_dict.items() if value}
matching_OQs_dict

{'3033-23-6': {'Roses', 'Sweet'},
 '21284-22-0': {'Spicy'},
 '29873-99-2': {'Oily', 'Woody'},
 '4674-50-4': {'grapefruit'},
 '10307-61-6': {'Apples'},
 '69064-37-5': {'Oily'},
 '18829-56-6': {'cucumber'},
 '53448-07-0': {'Soapy'},
 '1197-07-5': {'Solvently'},
 '14371-10-9': {'cinnamon'},
 '5273-85-8': {'Spicy'},
 '13474-59-4': {'Woody'},
 '23726-91-2': {'Apples'},
 '3779-61-1': {'Sweet'},
 '4128-17-0': {'Oily'},
 '928-96-1': {'Gassy'},
 '53398-85-9': {'Sweet'},
 '31501-11-8': {'Fruity '},
 '15456-69-6': {'Soapy'},
 '18679-18-0': {'Sweet'},
 '55013-32-6': {'Spicy'},
 '562-74-3': {'nutmeg'},
 '1197-15-5': {'Spicy'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '15051-81-7': {'Sweet', 'Woody'},
 '75853-49-5': {'Sweaty', 'Sweet'},
 '2497-25-8': {'oranges'},
 '3913-71-1': {'oranges'},
 '3913-81-3': {'oranges'},
 '20407-84-5': {'Sweet'},
 '104-76-7': {'Roses'},
 '100-71-0': {'Gassy'},
 '928-94-9': {'Fruity '},
 '624-41-9': {'Fruity '},
 '628-99-9': {'cucumber'},
 '2463-53-8': {'cucumber'

<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [316]:
OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs)

In [317]:
OQs

Unnamed: 0,Sweaty,Dry cereals,Green vegetables,Fresh flowers,Fresh fruit,Citrus,Artificial fruit flavourings Citrus,Vanilla,Toffee,Nuts,...,Cooked vegetables,Struck match,Earthy,Fruity,Solvently,Soapy,Sweet,Woody,Spicy,Oily
6485-40-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3033-23-6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
21284-22-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
38427-78-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
495-61-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
20307-84-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
120021-96-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
586-62-9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 4.1 OQ ------ Binary Matrix -----> OSA


In [318]:
OQ_OSA

Unnamed: 0,peaty,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Toffee,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Coconut,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Soapy,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Artificial fruit flavourings Citrus,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
leafy,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Green vegetables,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Unperfumed soap,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
cereal,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Woody,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
feinty,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [319]:
# find matching OQs to found ODs from massspec x flavornet
matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)

In [320]:
#TODO: remove redundant entries

In [321]:
matching_OSAs

Unnamed: 0,peaty,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Sweet,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Fresh flowers,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Spicy,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Woody,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
Citrus,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Green vegetables,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Soapy,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Solvently,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
Struck match,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
Sweaty,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<span style="color:#FBB714">OSAs</span>: Dataframe - OQs x OSAs

In [322]:
OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs)

In [323]:
OSAs

Unnamed: 0,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
6485-40-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3033-23-6,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
21284-22-0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
38427-78-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
495-61-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
20307-84-0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
120021-96-7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
586-62-9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [324]:
I_vector

Unnamed: 0,intensities
6485-40-1,1.943542e+08
3033-23-6,1.240854e+09
21284-22-0,1.048295e+08
38427-78-0,9.087319e+07
495-61-4,1.831223e+08
...,...
713-95-1,1.893777e+08
20307-84-0,1.831223e+08
120021-96-7,1.831223e+08
586-62-9,2.528854e+08


In [325]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized

array([[0.0182754 ],
       [0.11667928],
       [0.00985727],
       [0.00854493],
       [0.01721925],
       [0.01721925],
       [0.05682404],
       [0.0182754 ],
       [0.0182754 ],
       [0.11667928],
       [0.02377917],
       [0.00854493],
       [0.02377917],
       [0.00782137],
       [0.02503232],
       [0.01445839],
       [0.00883519],
       [0.01482754],
       [0.02156321],
       [0.00883519],
       [0.05682404],
       [0.00951258],
       [0.05682404],
       [0.00985727],
       [0.02282212],
       [0.00912609],
       [0.05682404],
       [0.04368423],
       [0.02806551],
       [0.05682404],
       [0.11667928],
       [0.02806551],
       [0.01721925],
       [0.01816901],
       [0.02377917],
       [0.00465916],
       [0.00335162],
       [0.02251762],
       [0.01482754],
       [0.02426643],
       [0.0182754 ],
       [0.0182754 ],
       [0.00883519],
       [0.02503232],
       [0.04368423],
       [0.01780745],
       [0.11667928],
       [0.012

In [326]:
final_OSA = OSAs*I_vector_normalized
final_OSA

Unnamed: 0,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
6485-40-1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3033-23-6,0.0,0.0,0.0,0.116679,0.0,0.0,0.0,0.116679,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
21284-22-0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.009857,0.0,0.0,0.0,0.0
38427-78-0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
495-61-4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.017807,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
20307-84-0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.017219,0.0,0.000000,0.0,0.0,0.0,0.0
120021-96-7,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
586-62-9,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use Fuzzy logic to weigh OQ with expert knowledge/Intensity from Masspec


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile

In [327]:
final_OSA.sum()*10

feinty           1.277607
cereal           1.049584
Green/grassy     4.739577
Floral           6.407959
Fruity           0.522292
Solvently        0.796996
Soapy            2.449260
Sweet           15.839665
Woody           10.115602
Nutty            0.675892
Spicy           11.188353
Oily             0.338081
Sour             0.165527
Sulphury         0.506790
Stale            2.359720
dtype: float64

In [328]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=(np.array(final_OSA.sum()*10)),
  theta=list(final_OSA.columns),
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
       range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()

In [329]:
final_OSA.columns

Index(['feinty', 'cereal', 'Green/grassy', 'Floral', 'Fruity ', 'Solvently',
       'Soapy', 'Sweet', 'Woody', 'Nutty', 'Spicy', 'Oily', 'Sour', 'Sulphury',
       'Stale'],
      dtype='object')

In [330]:
# filter out our 8 comparable OSAs 
comparable_OSAs = ["floral","fruit","wood","complexity","malt","sweet","peat","dried fruits"]
filtered_OSAs = pd.DataFrame(columns=comparable_OSAs)
threshold = 80
# Populate the new DataFrame based on original DataFrame
for column in final_OSA.columns:
    for comparable_OSA in comparable_OSAs:
        if fuzz.ratio(str(column).lower(), comparable_OSA) >= threshold:
            filtered_OSAs[comparable_OSA] = final_OSA[column]

#filtered_OSAs



In [331]:
import plotly.graph_objects as go

panelist_scores = [40,30,50,40,30,40,30]
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=[value / 10 for value in panelist_scores],
      theta=comparable_OSAs,
      fill='toself',
      name='Product A'
))
fig.add_trace(go.Scatterpolar(
      r=list(filtered_OSAs.sum()*10),
      theta=comparable_OSAs,
      fill='toself',
      name='Product B'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [332]:
#only compare only columns from final_OSA
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=[0,4,3,4,5,0],
    theta=list(final_OSA.columns),
    fill='toself',
    name="panelist"
))

fig.add_trace(go.Scatterpolar(
  r=(np.array(final_OSA.sum()*10)),
  theta=list(final_OSA.columns),
  fill='toself',
  name= "MS_prediction"))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [333]:

fig = go.Figure(data=go.Scatterpolar(
  r=[0,4,3,4,5,0],
  theta=["Green","floral","fruit","sweet","wood","spicy"],
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()