## Pipeline Flavor Profile

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv



In [2]:
'''
does convert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


In [3]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap

In [4]:
'''
Convert DataFrame to dictionary
'''
def dataframe_to_dict(df):
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     

In [5]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 80
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = set()
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    all_matches.add(index)
                    found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    found_match = found_match.drop_duplicates()
    return(found_match, found_match_per_entry)

In [6]:
'''
create a new matrix by finding matches and filtering right columns
'''
def create_matrix_from_dict(dictionary,df):
    matrix = pd.DataFrame(columns=list(df), index=ms_cas)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = current_val
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)
   

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [16]:
# DUMMY DATA
intensities = [80, 40, 5]
ms_cas =['(E)-3-hexenol', "coumarin",'citral']
mol_int = {'(E)-3-hexenol': 80, "coumarin": 40 ,'citral': 5}

In [81]:
# actual ms data
file_path = 'data/cas_intensities.csv'
data = pd.read_csv(file_path, delimiter=',')
# only use first repliate
data = data[data["replicate"]==1]

In [82]:
data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,2.528854e+08,1.831223e+08,1.048295e+08,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,1.875943e+08,1.913892e+08,1.086362e+08,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,2.299686e+08,1.932470e+08,1.523761e+08,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,2.857401e+08,2.531031e+08,1.654314e+08,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,1.823817e+08,1.855528e+08,1.090820e+08,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,2.592504e+08,1.471158e+08,9.689814e+07,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,3.308782e+08,2.881055e+08,1.905095e+08,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,3.798451e+08,2.186666e+08,1.829394e+08,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,2.587659e+08,3.349072e+08,1.353320e+08,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08


In [85]:
sample_key = "000920"

In [86]:
used_data = data[data["replicate"]==1]
used_data = used_data[used_data["sample_key"]==sample_key]
#filter columns that are 0
used_data = used_data.loc[:, (used_data != 0).any(axis=0)]
used_data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,920,1,194354200.0,1240854000.0,104829500.0,90873190.0,183122300.0,183122300.0,604309100.0,194354200.0,...,252885400.0,183122300.0,104829500.0,464570600.0,464570600.0,189377700.0,183122300.0,183122300.0,252885400.0,266212400.0


In [79]:
'''# find rows with biggest variances
df = used_data
row_combinations = list(itertools.combinations(df.index, 2))
distances = [abs(df.loc[i[0]] - df.loc[i[1]]).sum() for i in row_combinations]
max_distance_indices = row_combinations[distances.index(max(distances))]
print(df.loc[max_distance_indices[0]])
print(df.loc[max_distance_indices[1]])
#=> sample: 000879 and 000373'''

'# find rows with biggest variances\ndf = used_data\nrow_combinations = list(itertools.combinations(df.index, 2))\ndistances = [abs(df.loc[i[0]] - df.loc[i[1]]).sum() for i in row_combinations]\nmax_distance_indices = row_combinations[distances.index(max(distances))]\nprint(df.loc[max_distance_indices[0]])\nprint(df.loc[max_distance_indices[1]])\n#=> sample: 000879 and 000373'

In [87]:
used_data = used_data.drop(columns=['sample_key', 'replicate'])
#largest_entries = used_data.iloc[0].nlargest(100)
largest_entries = used_data.iloc[0].nlargest(50)

# Create a new DataFrame with only the largest entries
data_top50 = pd.DataFrame({'Intensities': largest_entries})
data_top50

Unnamed: 0,Intensities
3033-23-6,1240854000.0
876-17-5,1240854000.0
07.11.5258,1240854000.0
21662-09-9,1240854000.0
491-04-3,1240854000.0
562-74-3,1240854000.0
470-67-7,1240854000.0
470-82-6,1240854000.0
2497-25-8,1240854000.0
3913-71-1,1240854000.0


In [19]:
ms_cas = list(data_top50.index)

In [20]:
intensities = list(data_top50['Intensities'])

<span style="color:#FBB714">I_vector</span> : dataframe with *molecules x intensities*

In [21]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector = pd.DataFrame(data= intensities,index = ms_cas, columns=["intensities"])
I_vector

Unnamed: 0,intensities
3033-23-6,1240854000.0
876-17-5,1240854000.0
07.11.5258,1240854000.0
21662-09-9,1240854000.0
491-04-3,1240854000.0
562-74-3,1240854000.0
470-67-7,1240854000.0
470-82-6,1240854000.0
2497-25-8,1240854000.0
3913-71-1,1240854000.0


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

*Flavornet only*

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [22]:
'''
read flavornet data and save it in mol_to_OD
'''
# only using flavornet data
mol_to_OD = {}
molecules = []
descriptors = []
CAS_numbers= []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        CAS = row[1]
        OD = row[3]
        molecules.append(molecule)
        CAS_numbers.append(CAS)
        descriptors.append(OD)
        if CAS in mol_to_OD:
            mol_to_OD[CAS].add(OD)
        else:
            mol_to_OD[CAS]  = set([OD])


CAS_numbers = list(set(CAS_numbers))
#list with all molecules from flavornet
molecules = list(set(molecules))
# list of all individual descriptors/ODs
descriptors = list(set(descriptors))

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x ODs* - rigth now only flavornet database

In [23]:
D = pd.DataFrame(index=CAS_numbers, columns=descriptors)

In [24]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,apple peel,tomato leaf,coconut,coumarin,dill,crushed bug,amine,burnt sugar,smoke,wine,...,urine,tallow,medicine,jasmine,cream,fruit,fried,caramel,grapefruit,must
25343-57-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6125-24-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
589-75-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1197-07-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
106-68-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538-86-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4586-22-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
638-67-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31501-11-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


match molecules from MS data with flavors

In [25]:
flavors = []
mol_flavors = {}
for mol in ms_cas:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]
    else:
        mol_flavors[mol] = {}

found_flavors = set(flavors)


<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 


In [26]:
found_flavors

{'anise',
 'balsamic',
 'beet',
 'butterscotch',
 'camphor',
 'caramel',
 'citrus',
 'coriander',
 'cucumber',
 'earth',
 'fat',
 'flower',
 'fresh',
 'fruit',
 'geranium',
 'green',
 'herb',
 'lavender',
 'medicine',
 'metal',
 'mint',
 'mold',
 'must',
 'nutmeg',
 'oil',
 'orange',
 'phenol',
 'roast',
 'rose',
 'rubber',
 'smoke',
 'spice',
 'sweet',
 'tallow',
 'turpentine',
 'vanilla',
 'wood'}


<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [27]:
mol_flavors

{'3033-23-6': {'rose', 'sweet'},
 '876-17-5': {'flower', 'green'},
 '07.11.5258': {'flower'},
 '21662-09-9': {'green', 'must'},
 '491-04-3': {'herb'},
 '562-74-3': {'must', 'nutmeg', 'turpentine'},
 '470-67-7': {'spice'},
 '470-82-6': {'mint', 'sweet'},
 '2497-25-8': {'orange', 'tallow'},
 '3913-71-1': {'orange', 'tallow'},
 '3913-81-3': {'orange', 'tallow'},
 '4861-58-9': {'fruit', 'sweet'},
 '586-82-3': {'must'},
 '105683-99-6': {'cucumber', 'green', 'tallow'},
 '147159-48-6': {'cucumber', 'green', 'tallow'},
 '29887-38-5': {'herb'},
 '507-70-0': {'camphor'},
 '465-31-6': {'camphor'},
 '15537-55-0': {'balsamic'},
 '106-23-0': {'fat'},
 '619-01-2': {'mint', 'spice'},
 '128386-31-2': {'metal'},
 '1632-73-1': {'camphor'},
 '106-24-1': {'geranium', 'rose'},
 '124-76-5': {'camphor', 'must'},
 '18675-35-9': {'spice', 'wood'},
 '16750-94-0': {'rose'},
 '5944-20-7': {'rose'},
 '498-16-8': {'herb'},
 '78-70-6': {'flower', 'lavender'},
 '89-80-5': {'fresh', 'green', 'mint'},
 '10458-14-7': {'f

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [28]:
# filters only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,smoke,coriander,nutmeg,orange,mint,beet,butterscotch,anise,rubber,wood,...,geranium,turpentine,fresh,spice,mold,tallow,medicine,fruit,caramel,must
18479-68-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14575-74-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
07.11.5258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
470-67-7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
470-82-6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
619-01-2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1189-09-9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
124-76-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3913-71-1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
138-87-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### 3. OD ----------> OQ


### Whisky Research institute

In [29]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
Research_OQ_OD =  Research_data.drop(columns=Research_data.columns[0])
Research_OSA_OQ = Research_data.drop(columns=Research_data.columns[2])

In [30]:
OD_OQ = dict_to_matrix(dataframe_to_dict(Research_OQ_OD)).T
OQ_OSA = dict_to_matrix(dataframe_to_dict(Research_OSA_OQ)).T
# filter out rows with only 0
OD_OQ = OD_OQ.loc[(OD_OQ.iloc[:,1:] != 0).any(axis=1)]
OQ_OSA = OQ_OSA.loc[(OQ_OSA.iloc[:,1:] != 0).any(axis=1)]


<span style="color:#FBB714">OD_OQ</span>: Dataframe - all ODs x OQs from research institute

In [31]:
OD_OQ

Unnamed: 0,Burnt,Smoky,Medicinal,Leathery,Tobacco,Sweaty,Dry cereals,Wet cereals,leafy,Herbal,...,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Honeycomb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Butterscotch,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Almond,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Malt vinegar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
resinous,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
limes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ice ceam,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wet iron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
porridge,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


<span style="color:#FBB714">OQ_OSA</span>: Dataframe - all OQs x OSAs from research institute

In [32]:
#OQ_OSA

In [33]:
# find matching OQs to found ODs from massspec x flavornet
matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)

In [34]:
# get only important columns
matching_OQs = matching_OQs.loc[:, (matching_OQs != 0).any(axis=0)]
matching_OQs

Unnamed: 0,Green vegetables,Fresh flowers,Citrus,Toffee,Dried spice,Fruity,Sweet,Woody,Spicy,Oily
Sweet,0,0,0,0,0,0,1,0,0,0
Roses,0,1,0,0,0,0,0,0,0,0
nutmeg,0,0,0,0,1,0,0,0,0,0
Spicy,0,0,0,0,0,0,0,0,1,0
oranges,0,0,1,0,0,0,0,0,0,0
Fruity,0,0,0,0,0,1,0,0,0,0
cucumber,1,0,0,0,0,0,0,0,0,0
Woody,0,0,0,0,0,0,0,1,0,0
Oily,0,0,0,0,0,0,0,0,0,1
Butterscotch,0,0,0,1,0,0,0,0,0,0


In [35]:
# to know which OD comes from which molecule
matching_OQs_dict

{'3033-23-6': {'Roses', 'Sweet'},
 '876-17-5': set(),
 '07.11.5258': set(),
 '21662-09-9': set(),
 '491-04-3': set(),
 '562-74-3': {'nutmeg'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '2497-25-8': {'oranges'},
 '3913-71-1': {'oranges'},
 '3913-81-3': {'oranges'},
 '4861-58-9': {'Fruity ', 'Sweet'},
 '586-82-3': set(),
 '105683-99-6': {'cucumber'},
 '147159-48-6': {'cucumber'},
 '29887-38-5': set(),
 '507-70-0': set(),
 '465-31-6': set(),
 '15537-55-0': set(),
 '106-23-0': set(),
 '619-01-2': {'Spicy'},
 '128386-31-2': set(),
 '1632-73-1': set(),
 '106-24-1': {'Roses'},
 '124-76-5': set(),
 '18675-35-9': {'Spicy', 'Woody'},
 '16750-94-0': {'Roses'},
 '5944-20-7': {'Roses'},
 '498-16-8': set(),
 '78-70-6': {'lavender'},
 '89-80-5': set(),
 '10458-14-7': set(),
 '106-25-2': {'Sweet'},
 '619-62-5': set(),
 '18479-68-0': {'Fruity '},
 '16721-39-4': set(),
 '91-10-1': set(),
 '17699-16-0': {'Woody'},
 '14575-74-7': set(),
 '98-55-5': {'Oily'},
 '138-87-4': set(),
 '541-58-2': set(),


In [36]:
# to know which OD comes from which molecule
#remove empty entries
matching_OQs_dict = {key: value for key, value in matching_OQs_dict.items() if value}
matching_OQs_dict

{'3033-23-6': {'Roses', 'Sweet'},
 '562-74-3': {'nutmeg'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '2497-25-8': {'oranges'},
 '3913-71-1': {'oranges'},
 '3913-81-3': {'oranges'},
 '4861-58-9': {'Fruity ', 'Sweet'},
 '105683-99-6': {'cucumber'},
 '147159-48-6': {'cucumber'},
 '619-01-2': {'Spicy'},
 '106-24-1': {'Roses'},
 '18675-35-9': {'Spicy', 'Woody'},
 '16750-94-0': {'Roses'},
 '5944-20-7': {'Roses'},
 '78-70-6': {'lavender'},
 '106-25-2': {'Sweet'},
 '18479-68-0': {'Fruity '},
 '17699-16-0': {'Woody'},
 '98-55-5': {'Oily'},
 '20407-84-5': {'Sweet'},
 '1189-09-9': {'Fruity '},
 '3943-74-6': {'Butterscotch'}}

<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [37]:
OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs)

In [59]:
OQs

Unnamed: 0,Green vegetables,Fresh flowers,Citrus,Toffee,Dried spice,Fruity,Sweet,Woody,Spicy,Oily
3033-23-6,0,1,0,0,0,0,1,0,0,0
876-17-5,0,0,0,0,0,0,0,0,0,0
07.11.5258,0,0,0,0,0,0,0,0,0,0
21662-09-9,0,0,0,0,0,0,0,0,0,0
491-04-3,0,0,0,0,0,0,0,0,0,0
562-74-3,0,0,0,0,1,0,0,0,0,0
470-67-7,0,0,0,0,0,0,0,0,1,0
470-82-6,0,0,0,0,0,0,1,0,0,0
2497-25-8,0,0,1,0,0,0,0,0,0,0
3913-71-1,0,0,1,0,0,0,0,0,0,0


#### 4.1 OQ ------ Binary Matrix -----> OSA


In [39]:
OQ_OSA

Unnamed: 0,peaty,feinty,cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Herbal,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Perfumed,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Spicy,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Metallic,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
leafy,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Solvently,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
Honey,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Citrus,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Struck match,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
Peppery,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [40]:
# find matching OQs to found ODs from massspec x flavornet
matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)

In [41]:
#TODO: remove redundant entries

In [65]:
print(matching_OSAs)

                  peaty  feinty  cereal  Green/grassy  Floral  Fruity   \
Fresh flowers         0       0       0             0       1        0   
Sweet                 0       0       0             0       0        0   
Dried spice           0       0       0             0       0        0   
Citrus                0       0       0             0       0        1   
Green vegetables      0       0       0             1       0        0   
Woody                 0       0       0             0       0        0   

                  Solvently  Soapy  Sweet  Woody  Nutty  Spicy  Oily  Sour  \
Fresh flowers             0      0      0      0      0      0     0     0   
Sweet                     0      0      1      0      0      0     0     0   
Dried spice               0      0      0      0      0      1     0     0   
Citrus                    0      0      0      0      0      0     0     0   
Green vegetables          0      0      0      0      0      0     0     0   
Woody        

In [63]:
print(matching_OSAs.loc[:, (matching_OSAs != 0).any(axis=0)])

                  Green/grassy  Floral  Fruity   Sweet  Woody  Spicy
Fresh flowers                0       1        0      0      0      0
Sweet                        0       0        0      1      0      0
Dried spice                  0       0        0      0      0      1
Citrus                       0       0        1      0      0      0
Green vegetables             1       0        0      0      0      0
Woody                        0       0        0      0      1      0


<span style="color:#FBB714">OSAs</span>: Dataframe - OQs x OSAs

In [43]:
OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs)

In [56]:
print(OSAs)

             Green/grassy  Floral  Fruity   Sweet  Woody  Spicy
3033-23-6               0       1        0      1      0      0
876-17-5                0       0        0      0      0      0
07.11.5258              0       0        0      0      0      0
21662-09-9              0       0        0      0      0      0
491-04-3                0       0        0      0      0      0
562-74-3                0       0        0      0      0      1
470-67-7                0       0        0      0      0      0
470-82-6                0       0        0      1      0      0
2497-25-8               0       0        1      0      0      0
3913-71-1               0       0        1      0      0      0
3913-81-3               0       0        1      0      0      0
4861-58-9               0       0        0      1      0      0
586-82-3                0       0        0      0      0      0
105683-99-6             1       0        0      0      0      0
147159-48-6             1       0       

In [45]:
I_vector

Unnamed: 0,intensities
3033-23-6,1240854000.0
876-17-5,1240854000.0
07.11.5258,1240854000.0
21662-09-9,1240854000.0
491-04-3,1240854000.0
562-74-3,1240854000.0
470-67-7,1240854000.0
470-82-6,1240854000.0
2497-25-8,1240854000.0
3913-71-1,1240854000.0


In [46]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized

array([[0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14760054],
       [0.14697561],
       [0.14697561],
       [0.09529923],
       [0.09529923],
       [0.09529923],
       [0.09529923],
       [0.095

In [47]:
final_OSA = OSAs*I_vector_normalized
final_OSA

Unnamed: 0,Green/grassy,Floral,Fruity,Sweet,Woody,Spicy
3033-23-6,0.0,0.147601,0.0,0.147601,0.0,0.0
876-17-5,0.0,0.0,0.0,0.0,0.0,0.0
07.11.5258,0.0,0.0,0.0,0.0,0.0,0.0
21662-09-9,0.0,0.0,0.0,0.0,0.0,0.0
491-04-3,0.0,0.0,0.0,0.0,0.0,0.0
562-74-3,0.0,0.0,0.0,0.0,0.0,0.147601
470-67-7,0.0,0.0,0.0,0.0,0.0,0.0
470-82-6,0.0,0.0,0.0,0.147601,0.0,0.0
2497-25-8,0.0,0.0,0.147601,0.0,0.0,0.0
3913-71-1,0.0,0.0,0.147601,0.0,0.0,0.0


#### 4.2 OQ ------ Fuzzy Logic -----> OSA
Use Fuzzy logic to weigh OQ with expert knowledge/Intensity from Masspec


#### 4.3 Comparison BM and FL

#### 5. Show Flavourprofile

In [48]:
final_OSA.sum()*10

Green/grassy    2.952011
Floral          5.904022
Fruity          4.428016
Sweet           6.857014
Woody           2.952011
Spicy           1.476005
dtype: float64

In [49]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=(np.array(final_OSA.sum()*10)),
  theta=list(final_OSA.columns),
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
       range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()

In [50]:
final_OSA.columns

Index(['Green/grassy', 'Floral', 'Fruity ', 'Sweet', 'Woody', 'Spicy'], dtype='object')

In [51]:
# filter out our 8 comparable OSAs 
comparable_OSAs = ["floral","fruit","wood","complexity","malt","sweet","peat","dried fruits"]
filtered_OSAs = pd.DataFrame(columns=comparable_OSAs)
threshold = 80
# Populate the new DataFrame based on original DataFrame
for column in final_OSA.columns:
    for comparable_OSA in comparable_OSAs:
        if fuzz.ratio(str(column).lower(), comparable_OSA) >= threshold:
            filtered_OSAs[comparable_OSA] = final_OSA[column]

#filtered_OSAs



In [52]:
import plotly.graph_objects as go

panelist_scores = [40,30,50,40,30,40,30]
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=[value / 10 for value in panelist_scores],
      theta=comparable_OSAs,
      fill='toself',
      name='Product A'
))
fig.add_trace(go.Scatterpolar(
      r=list(filtered_OSAs.sum()*10),
      theta=comparable_OSAs,
      fill='toself',
      name='Product B'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [53]:
#only compare only columns from final_OSA
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=[0,4,3,4,5,0],
    theta=list(final_OSA.columns),
    fill='toself',
    name="panelist"
))

fig.add_trace(go.Scatterpolar(
  r=(np.array(final_OSA.sum()*10)),
  theta=list(final_OSA.columns),
  fill='toself',
  name= "MS_prediction"))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [54]:

fig = go.Figure(data=go.Scatterpolar(
  r=[0,4,3,4,5,0],
  theta=["Green","floral","fruit","sweet","wood","spicy"],
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()