## Pipeline Flavor Profile

In [5]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv
import numpy as np
import matplotlib.pyplot as plt


In [6]:
'''
does convert a pandas dataframe into a matrix where 
keys = row name
values = col name
data = either 0 or 1 
'''
def dict_to_matrix(dictionary):
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


In [7]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap

In [8]:
'''
Convert DataFrame to dictionary
'''
def dataframe_to_dict(df):
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     

In [9]:
'''
find match between dictionary entries and the rows of a dataframe
'''
def find_matching_entries(dictionary,df):
    threshold = 80
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = set()
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    all_matches.add(index)
                    found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    found_match = found_match.drop_duplicates()
    return(found_match, found_match_per_entry)

In [10]:
'''
create a new matrix by finding matches and filtering right columns
'''
def create_matrix_from_dict(dictionary,df, index):
    matrix = pd.DataFrame(columns=list(df), index=index)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = set(current_val)
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)
   

#### 1. MS Data ----- Massbank -----> Molecules

-> Chris

save which intensities belong to which molecule and give each the % 
2 approaches for one molecule with multiple OQs
1.  each gets same % 
2.  divide % by count of OQs per molecule

<span style="color:lightblue">mol_int</span> : dict where key is the molecule name and value is the intensites from the mass spec data

In [11]:
# DUMMY DATA
intensities = [80, 40, 5]
ms_cas =['(E)-3-hexenol', "coumarin",'citral']
mol_int = {'(E)-3-hexenol': 80, "coumarin": 40 ,'citral': 5}

In [107]:
# actual ms data
file_path = 'data/cas_intensities.csv'
data = pd.read_csv(file_path, delimiter=',')
# only use first repliate
data = data[data["replicate"]==1]

In [108]:
data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,2.528854e+08,1.831223e+08,1.048295e+08,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,1.875943e+08,1.913892e+08,1.086362e+08,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,2.299686e+08,1.932470e+08,1.523761e+08,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,2.857401e+08,2.531031e+08,1.654314e+08,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,1.823817e+08,1.855528e+08,1.090820e+08,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,2.592504e+08,1.471158e+08,9.689814e+07,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,3.308782e+08,2.881055e+08,1.905095e+08,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,3.798451e+08,2.186666e+08,1.829394e+08,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,2.587659e+08,3.349072e+08,1.353320e+08,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08


In [14]:
data_dist = data.copy()
data_dist["sum"] = data_dist.apply(lambda row:row[2:].sum(), axis=1)

In [15]:
data_dist

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3,sum
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,1.831223e+08,1.048295e+08,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08,1.885973e+11
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,1.913892e+08,1.086362e+08,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08,1.901274e+11
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,1.932470e+08,1.523761e+08,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08,2.038954e+11
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,2.531031e+08,1.654314e+08,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08,2.279643e+11
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,1.855528e+08,1.090820e+08,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08,1.825964e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,1.471158e+08,9.689814e+07,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08,2.626522e+11
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,2.881055e+08,1.905095e+08,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08,4.142515e+11
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,2.186666e+08,1.829394e+08,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08,2.783030e+11
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,3.349072e+08,1.353320e+08,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08,2.168361e+11


In [16]:
data_dist['median']= data_dist.apply(lambda row:row[2:-1].median(), axis=1)
data_dist['mean']= data_dist.apply(lambda row:row[2:-2].mean(), axis=1)

In [17]:
data_dist

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3,sum,median,mean
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08,1.885973e+11,1.716748e+08,2.576465e+08
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08,1.901274e+11,1.875943e+08,2.597369e+08
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08,2.038954e+11,1.775292e+08,2.785456e+08
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08,2.279643e+11,2.247833e+08,3.114267e+08
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08,1.825964e+11,1.552483e+08,2.494486e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08,2.626522e+11,1.849320e+08,3.588144e+08
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08,4.142515e+11,3.308782e+08,5.659174e+08
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08,2.783030e+11,2.430886e+08,3.801953e+08
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08,2.168361e+11,1.994832e+08,2.962242e+08


In [18]:
data_norm = data_dist.apply(lambda row: row[2:-3]/row['mean'],axis=1)

In [19]:
data_norm.index = data_dist['sample_key']

In [20]:
data_norm2 = data_norm.apply(lambda col:col/col.median(),axis=0)

In [22]:
sample_key = "000920"

In [23]:
used_data = data_norm2[data_norm2.index==sample_key]
#filter columns that are 0
used_data.fillna(0, inplace=True)
used_data = used_data.loc[:, (used_data != 0).any(axis=0)]
used_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  used_data.fillna(0, inplace=True)


Unnamed: 0_level_0,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,2244-16-8,876-17-5,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
sample_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
920,1.158771,0.895271,0.946861,0.81983,1.523904,1.523904,1.089262,1.158771,1.158771,0.895271,...,1.192345,1.523904,0.946861,1.289678,1.289678,1.012158,1.523904,1.523904,1.192345,1.124045


In [111]:
data

Unnamed: 0,sample_key,replicate,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
0,000920,1,1.943542e+08,1.240854e+09,1.048295e+08,9.087319e+07,1.831223e+08,1.831223e+08,6.043091e+08,1.943542e+08,...,2.528854e+08,1.831223e+08,1.048295e+08,4.645706e+08,4.645706e+08,1.893777e+08,1.831223e+08,1.831223e+08,2.528854e+08,2.662124e+08
3,000922,1,1.549112e+08,1.029110e+09,1.086362e+08,1.181268e+08,1.913892e+08,1.913892e+08,6.519549e+08,1.549112e+08,...,1.875943e+08,1.913892e+08,1.086362e+08,4.505443e+08,4.505443e+08,2.602672e+08,1.913892e+08,1.913892e+08,1.875943e+08,2.698756e+08
6,000927,1,1.688820e+08,1.102763e+09,1.523761e+08,1.616408e+08,1.932470e+08,1.932470e+08,7.387225e+08,1.688820e+08,...,2.299686e+08,1.932470e+08,1.523761e+08,4.690569e+08,4.690569e+08,2.387369e+08,1.932470e+08,1.932470e+08,2.299686e+08,2.549133e+08
9,000911,1,1.759171e+08,9.992782e+08,1.654314e+08,1.787409e+08,2.531031e+08,2.531031e+08,8.800410e+08,1.759171e+08,...,2.857401e+08,2.531031e+08,1.654314e+08,4.831389e+08,4.831389e+08,3.724425e+08,2.531031e+08,2.531031e+08,2.857401e+08,3.321024e+08
12,000925,1,1.517116e+08,1.068380e+09,1.090820e+08,1.262307e+08,1.855528e+08,1.855528e+08,6.077461e+08,1.517116e+08,...,1.823817e+08,1.855528e+08,1.090820e+08,4.520255e+08,4.520255e+08,2.344530e+08,1.855528e+08,1.855528e+08,1.823817e+08,2.454637e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,000596,1,2.413772e+08,2.413874e+09,9.689814e+07,9.213682e+07,1.471158e+08,1.471158e+08,6.822826e+08,2.413772e+08,...,2.592504e+08,1.471158e+08,9.689814e+07,5.506742e+08,5.506742e+08,1.736489e+08,1.471158e+08,1.471158e+08,2.592504e+08,3.138085e+08
759,000804,1,3.452753e+08,3.397598e+09,1.905095e+08,2.577871e+08,2.881055e+08,2.881055e+08,1.122339e+09,3.452753e+08,...,3.308782e+08,2.881055e+08,1.905095e+08,5.763415e+08,5.763415e+08,3.355894e+08,2.881055e+08,2.881055e+08,3.308782e+08,5.460406e+08
762,000853,1,3.740657e+08,1.855998e+09,1.829394e+08,1.613276e+08,2.186666e+08,2.186666e+08,6.303671e+08,3.740657e+08,...,3.798451e+08,2.186666e+08,1.829394e+08,2.239344e+08,2.239344e+08,1.979400e+08,2.186666e+08,2.186666e+08,3.798451e+08,3.843281e+08
765,000931,1,1.714774e+08,1.085542e+09,1.353320e+08,1.542927e+08,3.349072e+08,3.349072e+08,8.858262e+08,1.714774e+08,...,2.587659e+08,3.349072e+08,1.353320e+08,4.699361e+08,4.699361e+08,3.719480e+08,3.349072e+08,3.349072e+08,2.587659e+08,3.572876e+08


In [24]:
'''# find rows with biggest variances
df = used_data
row_combinations = list(itertools.combinations(df.index, 2))
distances = [abs(df.loc[i[0]] - df.loc[i[1]]).sum() for i in row_combinations]
max_distance_indices = row_combinations[distances.index(max(distances))]
print(df.loc[max_distance_indices[0]])
print(df.loc[max_distance_indices[1]])
#=> sample: 000879 and 000373'''

'# find rows with biggest variances\ndf = used_data\nrow_combinations = list(itertools.combinations(df.index, 2))\ndistances = [abs(df.loc[i[0]] - df.loc[i[1]]).sum() for i in row_combinations]\nmax_distance_indices = row_combinations[distances.index(max(distances))]\nprint(df.loc[max_distance_indices[0]])\nprint(df.loc[max_distance_indices[1]])\n#=> sample: 000879 and 000373'

In [25]:
ms_cas = list(used_data.columns)

<span style="color:#FBB714">I_vector</span> : dataframe with *molecules x intensities*

In [26]:
#I_vector = pd.DataFrame(data = mol_int, index = ["intensities"]).T
I_vector =pd.DataFrame({'Intensities': used_data.iloc[0]})

I_vector

Unnamed: 0,Intensities
6485-40-1,1.158771
3033-23-6,0.895271
21284-22-0,0.946861
38427-78-0,0.819830
495-61-4,1.523904
...,...
713-95-1,1.012158
20307-84-0,1.523904
120021-96-7,1.523904
586-62-9,1.192345


#### 2. Molecules ----- Flavornet -----> OD 
Find ODs by matching molecules with databases using Flavornet

<span style="color:lightblue">CAS_to_odorants.csv</span> : table with following columns - molecule, CAS, Mol wt, odorant


##### table manipualtion done in R

```r
library(rlang)
library(tidyverse)

CAS_to_odorants <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/CAS_to_mol.txt", sep = "\t", header = FALSE)
OD_to_mol <- read.table("~/Bioinformatik_20.21/Bachelorarbeit/OD_mol.txt", sep = "\t", header = FALSE)

OD_to_mol_sep <- OD_to_mol %>%
  separate_rows(V2, sep = ";\\s*") %>%
  mutate(V2 = trimws(V2))
OD_to_mol_final <- OD_to_mol_sep[, c("V2","V1")]
colnames(OD_to_mol_final) <- c("molecule", "OD")

CAS_to_odorants_sep <- CAS_to_odorants %>%
  separate_rows(V3, sep = ";\\s*") %>%
  mutate(V3 = trimws(V3))
colnames(CAS_to_odorants_sep) <- c("CAS","Mol Wt", "molecule")


merged_df <- merge(CAS_to_odorants_sep, OD_to_mol_final, by = "molecule")
write.csv(merged_df, file = "~/Bioinformatik_20.21/Bachelorarbeit/CAS_mol_OD.csv", row.names = FALSE)
```

*Flavornet only*

<span style="color:lightblue">mol_to_OD</span> : dict where key is the molecule and the descriptors for these molecules are the values

In [27]:
'''
read flavornet data and save it in mol_to_OD
'''
# only using flavornet data
mol_to_OD = {}
descriptors = []
CAS_numbers= []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        CAS = row[1]
        OD = row[3]
        CAS_numbers.append(CAS)
        descriptors.append(OD)
        if CAS in mol_to_OD:
            mol_to_OD[CAS].add(OD)
        else:
            mol_to_OD[CAS]  = set([OD])


CAS_numbers = list(set(CAS_numbers))
# list of all individual descriptors/ODs
descriptors = list(set(descriptors))

<span style="color:#FBB714">D</span>: Dataframe with all *molecules x ODs* - rigth now only flavornet database

In [28]:
D = pd.DataFrame(index=CAS_numbers, columns=descriptors)

In [29]:
len(CAS_numbers)

727

In [30]:
for col in D.columns:
    for index, row in D.iterrows():
        # Check if value in  current row matches any entry in the dictionary
        if index in mol_to_OD.keys() and col in mol_to_OD[index]:
            # Set to 1
            D.at[index, col] = 1

D = D.fillna(0)
D

Unnamed: 0,apple. rose,medicine,camphor,cardboard,thiamin,grape,green pepper,sweet,rose,truffle,...,strawberry,foxy,cognac,mustard,pine,peanut butter,magnolia,meat broth,paper,bread
4861-58-9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
59121-25-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
928-96-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15423-57-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115-99-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108-50-9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2311-46-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497-25-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122-97-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


match molecules from MS data with flavors

In [31]:
len(ms_cas)

658

In [32]:
flavors = []
mol_flavors = {}
for mol in ms_cas:
    if mol in mol_to_OD.keys():
        flavors += mol_to_OD[mol]
        mol_flavors[mol] = mol_to_OD[mol]
    else:
        mol_flavors[mol] = {}

found_flavors = set(flavors)



<span style="color:lightblue">mol_flavors</span>: contains the molecule from where each OD is coming from 

In [114]:
mol_flavors

{'6485-40-1': {'mint'},
 '3033-23-6': {'rose', 'sweet'},
 '21284-22-0': {'green tea', 'herb', 'spice'},
 '38427-78-0': {'terpentine'},
 '495-61-4': {'balsamic'},
 '29873-99-2': {'green', 'oil', 'wood'},
 '6909-30-4': {'green'},
 '99-49-0': {'basil', 'caraway', 'fennel', 'mint'},
 '2244-16-8': {'caraway'},
 '876-17-5': {'flower', 'green'},
 '5989-27-5': {'citrus', 'mint'},
 '4674-50-4': {'grapefruit'},
 '2243-33-6': {'dill'},
 '10307-61-6': {'apple'},
 '69064-37-5': {'oil'},
 '13419-69-7': {'fat', 'must'},
 '928-95-0': {'green', 'leaf', 'walnut'},
 '18829-56-6': {'cucumber', 'fat', 'green'},
 '53448-07-0': {'fat', 'green', 'soap'},
 '928-97-2': {'fresh', 'moss'},
 '1197-07-5': {'caraway', 'solvent'},
 '14371-10-9': {'cinnamon', 'paint'},
 '09.04.5948': {'herb', 'warm'},
 '106-28-5': {'muguet'},
 '5273-85-8': {'flower', 'spice'},
 '5932-68-3': {'flower'},
 '4959-35-7': {'citrus', 'fresh'},
 '34995-77-2': {'flower'},
 '39638-67-0': {'coconut', 'flower'},
 '7212-40-0': {'fresh', 'mint'},
 

<span style="color:lightblue">found_flavors</span>: list of the ODs found from the identified molecules 


In [34]:
found_flavors

{'acid',
 'alkane',
 'almond',
 'almond shell',
 'amine',
 'anise',
 'apple',
 'apple peel',
 'apple. rose',
 'apricot',
 'baked',
 'balsamic',
 'banana',
 'basil',
 'beet',
 'biscuit',
 'bitter almond',
 'black currant',
 'boiled vegetable',
 'box tree',
 'bread',
 'broccoli',
 'brown sugar',
 'burnt',
 'burnt sugar',
 'butter',
 'butterscotch',
 'cabbage',
 'camomile',
 'camphor',
 'caramel',
 'caraway',
 'cardboard',
 'carrot',
 'cat',
 'celery',
 'cheese',
 'chemical',
 'cinnamon',
 'citrus',
 'clove',
 'cocoa',
 'coconut',
 'coffee',
 'cognac',
 'cologne',
 'cooked meat',
 'cooked potato',
 'coriander',
 'cotton candy',
 'coumarin',
 'cream',
 'crushed bug',
 'cucumber',
 'curry',
 'dill',
 'dust',
 'earth',
 'ester',
 'ether',
 'fat',
 'fecal',
 'fennel',
 'fish',
 'flower',
 'foxy',
 'fresh',
 'fried',
 'fruit',
 'garlic',
 'gasoline',
 'geranium',
 'grape',
 'grapefruit',
 'grass',
 'green',
 'green bean',
 'green leaf',
 'green pepper',
 'green tea',
 'hawthorne',
 'hazelnut',

<span style="color:#FBB714">OD</span>: Dataframe - *molecules x ODs*

In [35]:
# filters only relevant rows - only those ODs coming from indentified molecules
OD = D[(D.index).isin(I_vector.index)]
# get only important/matching columns
OD = OD.loc[:, (OD != 0).any(axis=0)]
OD 

Unnamed: 0,apple. rose,medicine,camphor,cardboard,thiamin,grape,green pepper,sweet,rose,truffle,...,strawberry,foxy,cognac,mustard,pine,peanut butter,magnolia,meat broth,paper,bread
4861-58-9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
59121-25-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
928-96-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15423-57-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115-99-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108-50-9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2311-46-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497-25-8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122-97-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 3. OD ----------> OQ


### Whisky Research institute

In [36]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
Research_OQ_OD =  Research_data.drop(columns=Research_data.columns[0])


In [37]:
OD_OQ = dict_to_matrix(dataframe_to_dict(Research_OQ_OD)).T

<span style="color:#FBB714">OD_OQ</span>: Dataframe - all ODs x OQs from research institute

In [38]:
OD_OQ

Unnamed: 0,Burnt,Smoky,Medicinal,Leathery,Tobacco,Sweaty,Dry cereals,Wet cereals,leafy,Herbal,...,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
treacle,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brazil nuts,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cabbage,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sour,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
fresh peas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
roasted beans,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fruit cake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Green/grassy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCP,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# find matching OQs to found ODs from massspec x flavornet
matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)

In [40]:
# get only important columns
matching_OQs = matching_OQs.loc[:, (matching_OQs != 0).any(axis=0)]
matching_OQs

Unnamed: 0,Burnt,Sweaty,Dry cereals,Green vegetables,Fresh flowers,Fresh fruit,Citrus,Artificial fruit flavourings,Vanilla,Toffee,...,Cooked vegetables,Struck match,Earthy,Fruity,Solvently,Soapy,Sweet,Woody,Spicy,Oily
Sweet,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Roses,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spicy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Oily,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Woody,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
grapefruit,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Apples,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cucumber,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Soapy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Solvently,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [41]:
len(matching_OQs)

26

In [42]:
# to know which OD comes from which molecule
#remove empty entries
matching_OQs_dict = {key: value for key, value in matching_OQs_dict.items() if value}
matching_OQs_dict

{'3033-23-6': {'Roses', 'Sweet'},
 '21284-22-0': {'Spicy'},
 '29873-99-2': {'Oily', 'Woody'},
 '4674-50-4': {'grapefruit'},
 '10307-61-6': {'Apples'},
 '69064-37-5': {'Oily'},
 '18829-56-6': {'cucumber'},
 '53448-07-0': {'Soapy'},
 '1197-07-5': {'Solvently'},
 '14371-10-9': {'cinnamon'},
 '5273-85-8': {'Spicy'},
 '13474-59-4': {'Woody'},
 '23726-91-2': {'Apples'},
 '3779-61-1': {'Sweet'},
 '4128-17-0': {'Oily'},
 '928-96-1': {'Gassy'},
 '53398-85-9': {'Sweet'},
 '31501-11-8': {'Fruity'},
 '15456-69-6': {'Soapy'},
 '18679-18-0': {'Sweet'},
 '55013-32-6': {'Spicy'},
 '562-74-3': {'nutmeg'},
 '1197-15-5': {'Spicy'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '15051-81-7': {'Sweet', 'Woody'},
 '75853-49-5': {'Sweaty', 'Sweet'},
 '2497-25-8': {'oranges'},
 '3913-71-1': {'oranges'},
 '3913-81-3': {'oranges'},
 '20407-84-5': {'Sweet'},
 '104-76-7': {'Roses'},
 '100-71-0': {'Gassy'},
 '928-94-9': {'Fruity'},
 '624-41-9': {'Fruity'},
 '628-99-9': {'cucumber'},
 '2463-53-8': {'cucumber'},


<span style="color:#FBB714">OQs</span>: Dataframe - ODs x OQs

In [43]:
OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs,matching_OQs_dict.keys())

In [44]:
OQs

Unnamed: 0,Burnt,Sweaty,Dry cereals,Green vegetables,Fresh flowers,Fresh fruit,Citrus,Artificial fruit flavourings,Vanilla,Toffee,...,Cooked vegetables,Struck match,Earthy,Fruity,Solvently,Soapy,Sweet,Woody,Spicy,Oily
3033-23-6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
21284-22-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29873-99-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4674-50-4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10307-61-6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13466-78-9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
483-76-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
713-95-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
20307-84-0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [45]:
OQ_dict

{'3033-23-6': {'Fresh flowers', 'Sweet'},
 '21284-22-0': {'Spicy'},
 '29873-99-2': {'Oily', 'Woody'},
 '4674-50-4': {'Citrus'},
 '10307-61-6': {'Fresh fruit'},
 '69064-37-5': {'Oily'},
 '18829-56-6': {'Green vegetables'},
 '53448-07-0': {'Soapy'},
 '1197-07-5': {'Solvently'},
 '14371-10-9': {'Dried spice'},
 '5273-85-8': {'Spicy'},
 '13474-59-4': {'Woody'},
 '23726-91-2': {'Fresh fruit'},
 '3779-61-1': {'Sweet'},
 '4128-17-0': {'Oily'},
 '928-96-1': {'Struck match '},
 '53398-85-9': {'Sweet'},
 '31501-11-8': {'Fruity'},
 '15456-69-6': {'Soapy'},
 '18679-18-0': {'Sweet'},
 '55013-32-6': {'Spicy'},
 '562-74-3': set(),
 '1197-15-5': {'Spicy'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '15051-81-7': {'Sweet', 'Woody'},
 '75853-49-5': {'Sweaty', 'Sweet'},
 '2497-25-8': set(),
 '3913-71-1': set(),
 '3913-81-3': set(),
 '20407-84-5': {'Sweet'},
 '104-76-7': {'Fresh flowers'},
 '100-71-0': {'Struck match '},
 '928-94-9': {'Fruity'},
 '624-41-9': {'Fruity'},
 '628-99-9': {'Green vegetabl

#### 4.  OQ -----------> OSA


<span style="color:#FBB714">OQ_OSA</span>: Dataframe - all OQs x OSAs from research institute

In [46]:
Research_OSA_OQ = Research_data.drop(columns=Research_data.columns[2])
OQ_OSA = dict_to_matrix(dataframe_to_dict(Research_OSA_OQ)).T

 #### 4.1 Binary Matrix

In [47]:
# find matching OQs to found ODs from massspec x flavornet
matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)

In [48]:
len(matching_OSAs_dict)

326

In [49]:
matching_OSAs = matching_OSAs.loc[:, (matching_OSAs != 0).any(axis=0)]
matching_OSAs

Unnamed: 0,Peaty,Feinty,Cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
Fresh flowers,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Sweet,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Spicy,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Woody,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
Citrus,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Green vegetables,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Soapy,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Solvently,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
Struck match,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
Sweaty,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [50]:
matching_OSAs_dict=  {key: value for key, value in matching_OSAs_dict.items() if value}

In [51]:
matching_OSAs_dict

{'3033-23-6': {'Fresh flowers', 'Sweet'},
 '21284-22-0': {'Spicy'},
 '29873-99-2': {'Woody'},
 '4674-50-4': {'Citrus'},
 '10307-61-6': {'Fresh fruit'},
 '18829-56-6': {'Green vegetables'},
 '53448-07-0': {'Soapy'},
 '1197-07-5': {'Solvently'},
 '14371-10-9': {'Dried spice'},
 '5273-85-8': {'Spicy'},
 '13474-59-4': {'Woody'},
 '23726-91-2': {'Fresh fruit'},
 '3779-61-1': {'Sweet'},
 '928-96-1': {'Struck match '},
 '53398-85-9': {'Sweet'},
 '31501-11-8': {'Fruity'},
 '15456-69-6': {'Soapy'},
 '18679-18-0': {'Sweet'},
 '55013-32-6': {'Spicy'},
 '1197-15-5': {'Spicy'},
 '470-67-7': {'Spicy'},
 '470-82-6': {'Sweet'},
 '15051-81-7': {'Sweet', 'Woody'},
 '75853-49-5': {'Sweaty', 'Sweet'},
 '20407-84-5': {'Sweet'},
 '104-76-7': {'Fresh flowers'},
 '100-71-0': {'Struck match '},
 '928-94-9': {'Fruity'},
 '624-41-9': {'Fruity'},
 '628-99-9': {'Green vegetables'},
 '2463-53-8': {'Green vegetables'},
 '60784-31-8': {'Green vegetables'},
 '123-96-6': {'Earthy'},
 '4861-58-9': {'Fruity', 'Sweet'},
 

<span style="color:#FBB714">OSAs</span>: Dataframe - OQs x OSAs

In [52]:
OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs,ms_cas)

In [53]:
print(OSAs)

             Peaty  Feinty  Cereal  Green/grassy  Floral  Fruity  Solvently  \
6485-40-1        0       0       0             0       0       0          0   
3033-23-6        0       0       0             0       1       0          0   
21284-22-0       0       0       0             0       0       0          0   
38427-78-0       0       0       0             0       0       0          0   
495-61-4         0       0       0             0       0       0          0   
...            ...     ...     ...           ...     ...     ...        ...   
713-95-1         0       0       0             0       0       0          0   
20307-84-0       0       0       0             0       0       0          0   
120021-96-7      0       0       0             0       0       0          0   
586-62-9         0       0       0             0       0       0          0   
710-04-3         0       0       0             0       0       0          0   

             Soapy  Sweet  Woody  Nutty  Spicy  Oil

In [54]:
I_vector

Unnamed: 0,Intensities
6485-40-1,1.158771
3033-23-6,0.895271
21284-22-0,0.946861
38427-78-0,0.819830
495-61-4,1.523904
...,...
713-95-1,1.012158
20307-84-0,1.523904
120021-96-7,1.523904
586-62-9,1.192345


In [55]:
'''from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
I_vector_normalized = normalizer.fit_transform(I_vector.T).T
I_vector_normalized'''

'from sklearn.preprocessing import Normalizer\nnormalizer = Normalizer()\nI_vector_normalized = normalizer.fit_transform(I_vector.T).T\nI_vector_normalized'

In [56]:
I_vector_normalized=np.array(I_vector)

In [57]:
OSAs

Unnamed: 0,Peaty,Feinty,Cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
6485-40-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3033-23-6,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
21284-22-0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
38427-78-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
495-61-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
20307-84-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
120021-96-7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
586-62-9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
OSAs.sum(axis=0)

Peaty            1
Feinty           9
Cereal           5
Green/grassy    12
Floral          16
Fruity           2
Solvently        3
Soapy           12
Sweet           67
Woody           46
Nutty            5
Spicy           43
Oily             3
Sour             2
Sulphury         4
Stale           14
dtype: int64

In [59]:
I_vector_normalized

array([[1.15877085],
       [0.89527142],
       [0.94686057],
       [0.81983014],
       [1.52390393],
       [1.52390393],
       [1.08926226],
       [1.15877085],
       [1.15877085],
       [0.89527142],
       [1.1923447 ],
       [0.81983014],
       [1.1923447 ],
       [0.78076356],
       [1.12404518],
       [1.04212523],
       [0.80542014],
       [1.05563826],
       [1.02242923],
       [0.80542014],
       [1.08926226],
       [1.1523065 ],
       [1.08926226],
       [0.94686057],
       [0.73453306],
       [0.95986932],
       [1.08926226],
       [1.289678  ],
       [1.15289343],
       [1.08926226],
       [0.89527142],
       [1.15289343],
       [1.52390393],
       [0.95170974],
       [1.1923447 ],
       [0.7083681 ],
       [0.68370289],
       [1.1459109 ],
       [1.05563826],
       [0.82295116],
       [1.15877085],
       [1.15877085],
       [0.80542014],
       [1.12404518],
       [1.289678  ],
       [1.01215793],
       [0.89527142],
       [0.889

In [60]:
OSAs

Unnamed: 0,Peaty,Feinty,Cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
6485-40-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3033-23-6,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
21284-22-0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
38427-78-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
495-61-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
20307-84-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
120021-96-7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
586-62-9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [115]:
final_OSA = OSAs*I_vector_normalized
final_OSA

Unnamed: 0,Peaty,Feinty,Cereal,Green/grassy,Floral,Fruity,Solvently,Soapy,Sweet,Woody,Nutty,Spicy,Oily,Sour,Sulphury,Stale
6485-40-1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3033-23-6,0.0,0.0,0.0,0.0,0.895271,0.0,0.0,0.0,0.895271,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
21284-22-0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.946861,0.0,0.0,0.0,0.0
38427-78-0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
495-61-4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713-95-1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.012158,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
20307-84-0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.523904,0.0,0.000000,0.0,0.0,0.0,0.0
120021-96-7,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
586-62-9,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


In [116]:
final_OSA.sum(axis=0)

Peaty            0.707071
Feinty           9.706490
Cereal           4.490041
Green/grassy    13.187119
Floral          15.000616
Fruity           2.109508
Solvently        3.188952
Soapy           11.399934
Sweet           71.329696
Woody           58.895719
Nutty            5.520885
Spicy           47.365698
Oily             3.213341
Sour             2.173851
Sulphury         4.160303
Stale           13.913653
dtype: float64

#### 4.2 Fuzzy Logic 
Use Fuzzy logic to weight OQ with expert knowledge/Intensity from Masspec


In [63]:
# list with OSA and corresponding OQs -> look at inner circle of flavorwheel
OSAs_corresponding_OQs = dataframe_to_dict(Research_OSA_OQ.drop_duplicates())
OSAs_corresponding_OQs

{'Peaty': ['Burnt', 'Smoky', 'Medicinal', 'Peaty'],
 'Feinty': ['Leathery', 'Tobacco', 'Sweaty', 'Feinty'],
 'Cereal': ['Dry cereals', 'Wet cereals', 'Cereal'],
 'Green/grassy': ['leafy', 'Herbal', 'Green vegetables', 'Green/grassy'],
 'Floral': ['Fresh flowers', 'Perfumed', 'Floral'],
 'Fruity': ['Fresh fruit',
  'Dried fruit',
  'Citrus',
  'Artificial fruit flavourings',
  'Fruity'],
 'Solvently': ['Nail varnish remover', 'Solvently'],
 'Soapy': ['Unperfumed soap', 'Soapy'],
 'Sweet': ['Vanilla', 'Honey', 'Toffee', 'Sweet'],
 'Woody': ['New wood', 'Okay', 'Woody'],
 'Nutty': ['Coconut', 'Nuts', 'Toasted', 'Nutty'],
 'Spicy': ['Dried spice', 'Peppery', 'Spicy'],
 'Oily': ['Buttery ', 'Waxy', 'Oils/fats', 'Oily'],
 'Sour': ['Vinegary', 'Cheesy', 'Vomit', 'Sour'],
 'Sulphury': ['Cooked vegetables ',
  'Rubbery',
  'Struck match ',
  'Decaying',
  'Meaty ',
  'Sulphury'],
 'Stale': ['Earthy', 'Musty', 'Metallic', 'Stale']}

In [64]:
# all found OQs for our mass spec data
found_OQs =set(item for sublist in OQ_dict.values() for item in sublist)
found_OQs

{'Artificial fruit flavourings',
 'Burnt',
 'Buttery ',
 'Citrus',
 'Cooked vegetables ',
 'Dried spice',
 'Dry cereals',
 'Earthy',
 'Fresh flowers',
 'Fresh fruit',
 'Fruity',
 'Green vegetables',
 'Nuts',
 'Oily',
 'Soapy',
 'Solvently',
 'Spicy',
 'Struck match ',
 'Sweaty',
 'Sweet',
 'Toasted',
 'Toffee',
 'Vanilla',
 'Vinegary',
 'Vomit',
 'Woody'}

1. normalize OQs

In [65]:
CAS_intensities_OQs = I_vector[I_vector.index.isin(OSA_dict.keys())]
CAS_intensities_OQs.loc[:,'OQs'] = OQ_dict.copy()
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CAS_intensities_OQs.loc[:,'OQs'] = OQ_dict.copy()


In [66]:
CAS_intensities_OQs

Unnamed: 0,Intensities,OQs
3033-23-6,0.895271,"{Sweet, Fresh flowers}"
21284-22-0,0.946861,{Spicy}
29873-99-2,1.523904,"{Woody, Oily}"
4674-50-4,0.819830,{Citrus}
10307-61-6,0.780764,{Fresh fruit}
...,...,...
30021-74-0,1.523904,"{Spicy, Woody}"
515-17-3,1.523904,{Woody}
483-76-1,1.523904,{Woody}
713-95-1,1.012158,"{Sweet, Fruity}"


add up all intensities per OQ 

intensity_per_OQ: dict that contains the OQ and the summed up intensities

In [67]:
intensity_per_OQ = {}
for found_OQ in found_OQs:
    for index, row in CAS_intensities_OQs.iterrows():
        if found_OQ in row["OQs"]:
            if found_OQ in intensity_per_OQ:
                intensity_per_OQ[found_OQ] += row["Intensities"]
            else:
                intensity_per_OQ[found_OQ] = row["Intensities"]

In [68]:
'''
normalizes the OQs by dividing each row/cell by the overall intensities, meaning the sum of all intensities
'''
def normalize_OQs(df_intensities):
    normalized_df = pd.DataFrame(columns=["normalized_intensities"])
    overall_intensities = df_intensities["intensities"].sum()
    normalized_df["normalized_intensities"] = df_intensities/overall_intensities
    return(normalized_df)

In [69]:
OQ_intensities = pd.DataFrame(intensity_per_OQ.values(), index = intensity_per_OQ.keys(), columns=["intensities"])

In [70]:
OQ_intensities

Unnamed: 0,intensities
Toasted,5.520885
Fresh flowers,15.000616
Soapy,11.399934
Citrus,2.109508
Dried spice,3.147415
Vanilla,0.851058
Buttery,3.213341
Cooked vegetables,6.44217
Oily,3.785323
Solvently,3.188952


In [71]:
normalized_OQ_intensities = normalize_OQs(OQ_intensities)
normalized_OQ_intensities

Unnamed: 0,normalized_intensities
Toasted,0.014524
Fresh flowers,0.039462
Soapy,0.02999
Citrus,0.00555
Dried spice,0.00828
Vanilla,0.002239
Buttery,0.008453
Cooked vegetables,0.016947
Oily,0.009958
Solvently,0.008389


2. Looking at each OSA indiviudally to find intensity

In [72]:
'''
find all OQs that are corresponding to one specific OSA
'''
def find_OQ_per_OSA_with_intensities(OSA):
    OQ_per_OSA = {}
    for consisting_OQs in OSAs_corresponding_OQs[OSA]:
        if consisting_OQs in normalized_OQ_intensities.index:
            OQ_per_OSA[consisting_OQs] = normalized_OQ_intensities.loc[consisting_OQs]
    # normalize so that all intensities per OSA sums up to 1
    df_OQ_per_OSA = pd.DataFrame(OQ_per_OSA).T
    if len(OQ_per_OSA)==0:
        df_OQ_per_OSA['normalized_intensities']=0
    df_OQ_per_OSA = df_OQ_per_OSA.div(df_OQ_per_OSA['normalized_intensities'].sum())
    return(df_OQ_per_OSA)

equation_2:
The function µ(𝑥) is defined as:

$$
\mu(𝑥) =
\begin{cases}
0 & \text{if } 𝑥 ≤ 𝑎_1 \\
\frac{𝑥−𝑎_1}{𝑎_2−𝑎_1} & \text{if } 𝑎_1 < 𝑥 ≤ 𝑎_2 \\
1 & \text{if } 𝑎_2 < 𝑥 ≤ 𝑎_3 \\
\frac{𝑎_4−𝑥}{𝑎_4−𝑎_3} & \text{if } 𝑎_3 < 𝑥 ≤ 𝑎_4 \\
0 & \text{if } 𝑎_4 < 𝑥 \\
\end{cases}
$$


In [73]:
#most of the time -> TODO: optimize these with cross validation  
a1 = 0
a2 = 0
a3 = 0.33
a4 = 1

In [74]:
def equation_2(x,a1,a2,a3,a4):
    if x <= a1:
        µ = 0
    elif a1 < x and x <= a2:
        µ = (x-a1)/(a2-a1)
    elif a2 < x and x <= a3:
        µ = 1
    elif a3 < x and x <= a4:
        µ = (a4-x)/(a4-a3)
    elif a4 < x:
        µ = 0
    return µ

equation3: 

𝐼𝑛𝑡𝑒𝑛𝑠𝑖𝑡𝑦(𝑂𝑆𝐴) = 10 × ∏^i 1 µ(𝑂𝑄𝑖)

In [75]:
def equation_3(µ_OQs_list):
    intensity_OSA = 10 * np.prod(µ_OQs_list)
    return(intensity_OSA)


e.g. with OSA sweet

In [76]:
# e.g sweet consists of: OQs ['Vanilla', 'Honey', 'Toffee', 'Sweet']
OSAs_corresponding_OQs['Sweet']

['Vanilla', 'Honey', 'Toffee', 'Sweet']

In [77]:
# sweet 
sweet = find_OQ_per_OSA_with_intensities('Sweet')

In [78]:
sweet

Unnamed: 0,normalized_intensities
Vanilla,0.011264
Toffee,0.044664
Sweet,0.944072


In [79]:
'''OQ1 = equation_2(sweet.loc["Sweet"]["normalized_intensities"],a1,a2,0.33,a4)
OQ2= equation_2(sweet.loc["Vanilla"]["normalized_intensities"],a1,a2,0.033,a4)
OQ3= equation_2(sweet.loc["Toffee"]["normalized_intensities"],a1,a2,0.043,a4)
equation_3([OQ1,OQ2,OQ3]) '''

'OQ1 = equation_2(sweet.loc["Sweet"]["normalized_intensities"],a1,a2,0.33,a4)\nOQ2= equation_2(sweet.loc["Vanilla"]["normalized_intensities"],a1,a2,0.033,a4)\nOQ3= equation_2(sweet.loc["Toffee"]["normalized_intensities"],a1,a2,0.043,a4)\nequation_3([OQ1,OQ2,OQ3]) '

for all OSAs

In [80]:
all_OSAs = list(OSAs_corresponding_OQs.keys())

In [81]:
OSAs_int = {}
for OSA in all_OSAs: 
   OSAs_int[OSA] = 0
   current_OSA = find_OQ_per_OSA_with_intensities(OSA)
   OQ_int = []
   for OQ in current_OSA.index:
      OQ_int.append(equation_2(current_OSA.loc[OQ]["normalized_intensities"],a1,a2,a3,a4))
   OSAs_int[OSA] = equation_3(OQ_int)

In [82]:
OSAs_int

{'Peaty': 0.0,
 'Feinty': 0.0,
 'Cereal': 0.0,
 'Green/grassy': 0.0,
 'Floral': 0.0,
 'Fruity': 3.164679833024881,
 'Solvently': 0.0,
 'Soapy': 0.0,
 'Sweet': 0.8347461479030455,
 'Woody': 0.0,
 'Nutty': 5.367910263443227,
 'Spicy': 0.9299830471788246,
 'Oily': 5.5319706680036,
 'Sour': 3.410833820602648,
 'Sulphury': 5.311206563347821,
 'Stale': 0.0}

In [83]:
df_OSAs_int = pd.DataFrame(OSAs_int.values(), index = OSAs_int.keys(), columns = ["Intensities"])

In [84]:
df_OSAs_int

Unnamed: 0,Intensities
Peaty,0.0
Feinty,0.0
Cereal,0.0
Green/grassy,0.0
Floral,0.0
Fruity,3.16468
Solvently,0.0
Soapy,0.0
Sweet,0.834746
Woody,0.0


In [85]:
plot_fuzzy_OSA = df_OSAs_int[df_OSAs_int['Intensities'] != 0]
plot_fuzzy_OSA.index = ['Fruity', 'Sweet', 'Nutty', 'Spicy', 'Oily', 'Sour', 'Sulphury']

In [86]:
plot_fuzzy_OSA.index

Index(['Fruity', 'Sweet', 'Nutty', 'Spicy', 'Oily', 'Sour', 'Sulphury'], dtype='object')

40,30,50,40,30,40,30,40,60
40,30,50,30,40,30,50,20,40
40,30,50,30,40,30,50,20,40
floral,fruit,wood,complexity,malt,sweet,peat,dried_fruits,aftertaste,date

[3,4,0,0,0,0,0]

optimization with cross validation LOO

In [87]:
plot_fuzzy_OSA

Unnamed: 0,Intensities
Fruity,3.16468
Sweet,0.834746
Nutty,5.36791
Spicy,0.929983
Oily,5.531971
Sour,3.410834
Sulphury,5.311207


In [88]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=(plot_fuzzy_OSA['Intensities']),
  theta=list(plot_fuzzy_OSA.index),
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
       range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()

#### 4.3 Comparison BM and FL

In [89]:
np.array(final_OSA.sum())[12:15]

array([3.21334096, 2.17385084, 4.160303  ])

In [90]:
final_OSA.columns

Index(['Peaty', 'Feinty', 'Cereal', 'Green/grassy', 'Floral', 'Fruity',
       'Solvently', 'Soapy', 'Sweet', 'Woody', 'Nutty', 'Spicy', 'Oily',
       'Sour', 'Sulphury', 'Stale'],
      dtype='object')

In [91]:
panelist_data_df.loc['000920']

NameError: name 'panelist_data_df' is not defined

In [None]:
plot_fuzzy_OSA.index

Index(['Fruity', 'Sweet', 'Nutty', 'Spicy', 'Oily', 'Sour', 'Sulphury'], dtype='object')

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
  r=(plot_fuzzy_OSA["Intensities"]),
  theta=list(plot_fuzzy_OSA.index),
  fill="toself",
  name='ms fuzzy'
))
fig.add_trace(go.Scatterpolar(
  r=([2.1095081445804507, 71.32969620151374,5.520885037968301, 47.365698207768276,3.21334096, 2.17385084, 4.160303]),
  theta=list(plot_fuzzy_OSA.index),
  fill="toself",
  name='ms binary'
))
fig.add_trace(go.Scatterpolar(
  r=([3,4,0,0,0,0,0]),
  theta=list(plot_fuzzy_OSA.index),
  fill="toself",
  name='panelist 1'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
       range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()

#### 5. Show Flavourprofile

In [92]:
# look at panelist 1
panelist_data_df = pd.read_table("data/vlb_data_combined.txt", delimiter=',', index_col = 1)
panelist_data_df = panelist_data_df[panelist_data_df["panelist"]==1]
panelist_data_df = panelist_data_df.drop(columns=["panelist", "date"])

In [93]:
overlap_OSAs = ["Floral", "Fruity", "Sweet", "Woody", "Peaty"]

In [94]:
def get_panelist_profile(sample_key):
    panelist_profile=[]
    for OSA in overlap_OSAs :
        panelist_profile.append(int(panelist_data_df.loc[sample_key][OSA]/10))
    return panelist_profile

In [95]:
def find_overlapping_profiles(final_OSA,comparable_OSAs):
    # show only overlap meaning: floral, fruit, sweet, wood, peat
    new_series = pd.Series() 
    threshold = 80
    for comparable_OSA in comparable_OSAs:
        for index in final_OSA.index:
            if fuzz.ratio(str(index).lower(), comparable_OSA) >= threshold:
                new_series[comparable_OSA] = final_OSA[index]
                break
            else:
                new_series[comparable_OSA] = 0
    return(new_series)


In [96]:
import plotly.graph_objects as go

#only compare only columns from final_OSA
def compare_profiles(final_OSA, panelists):
    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=panelists,
        theta=list(final_OSA.index),
        fill='toself',
        name="panelist"
    ))

    fig.add_trace(go.Scatterpolar(
    r=(np.array(final_OSA)),
    theta=list(final_OSA.index),
    fill='toself',
    name= "MS_prediction"))

    fig.update_layout(
    polar=dict(
        radialaxis=dict(
        visible=True,
        range=[0, 10]
        )),
    showlegend=False
    )

    fig.show()

In [97]:
def compare_visual(OSA_plotting,sample_key):
    OSA_compare = find_overlapping_profiles(OSA_plotting, overlap_OSAs)
    panelist_compare = get_panelist_profile(sample_key)
    compare_profiles(OSA_compare.T, panelist_compare)

In [98]:
final_OSA.sum()

Peaty            0.707071
Feinty           9.706490
Cereal           4.490041
Green/grassy    13.187119
Floral          15.000616
Fruity           2.109508
Solvently        3.188952
Soapy           11.399934
Sweet           71.329696
Woody           58.895719
Nutty            5.520885
Spicy           47.365698
Oily             3.213341
Sour             2.173851
Sulphury         4.160303
Stale           13.913653
dtype: float64

In [99]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatterpolar(
  r=(np.array(final_OSA.sum())),
  theta=list(final_OSA.columns),
  fill='toself'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
       range=[0, 100]
    ),
  ),
  showlegend=False
)

fig.show()

In [100]:
# filter out our 8 comparable OSAs 
comparable_OSAs = ["floral","fruit","wood","complexity","malt","sweet","peat","dried fruits"]
filtered_OSAs = pd.DataFrame(columns=comparable_OSAs)
threshold = 80
# Populate the new DataFrame based on original DataFrame
for column in final_OSA.columns:
    for comparable_OSA in comparable_OSAs:
        if fuzz.ratio(str(column).lower(), comparable_OSA) >= threshold:
            filtered_OSAs[comparable_OSA] = final_OSA[column]

#filtered_OSAs



In [101]:
import plotly.graph_objects as go

panelist_scores = [40,30,50,40,30,40,30]
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
      r=[value / 10 for value in panelist_scores],
      theta=comparable_OSAs,
      fill='toself',
      name='Product A'
))
fig.add_trace(go.Scatterpolar(
      r=list(filtered_OSAs.sum()*10),
      theta=comparable_OSAs,
      fill='toself',
      name='Product B'
))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [102]:
#only compare only columns from final_OSA
fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=[0,4,3,4,5,0],
    theta=list(final_OSA.columns),
    fill='toself',
    name="panelist"
))

fig.add_trace(go.Scatterpolar(
  r=(np.array(final_OSA.sum())),
  theta=list(final_OSA.columns),
  fill='toself',
  name= "MS_prediction"))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=False
)

fig.show()

In [106]:
compare_visual(final_OSA.sum()/10,"000920")





In [117]:
compare_visual(final_OSA.mean(axis=0)*70,"000920")





In [105]:

fig = go.Figure()
fig.add_trace(go.Scatterpolar(
  r=[0,4,3,4,5,0],
  theta=["Green","floral","fruit","sweet","wood","spicy"],
  fill='toself'
))


fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    ),
  ),
  showlegend=False
)

fig.show()