In [191]:
import pandas as pd
import csv
from fuzzywuzzy import process
from fuzzywuzzy import fuzz


*Flavornet data*

In [192]:
# only using flavornet data
mol_to_OD = {}
CAS_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        CAS = row[1]
        OD = row[3]
        molecules.append(molecule)
        descriptors.append(OD)
        if CAS in mol_to_OD:
            mol_to_OD[CAS].add(OD)
        else:
            mol_to_OD[CAS]  = set([OD])

molecules = list(set(molecules))
descriptors = list(set(descriptors))

In [193]:
# key = CAS (727), value = odorant (197)
# 727 cas numbers
mol_to_OD

{'6485-40-1': {'mint'},
 '3033-23-6': {'rose', 'sweet'},
 '21284-22-0': {'green tea', 'herb', 'spice'},
 '38427-78-0': {'terpentine'},
 '495-61-4': {'balsamic'},
 '29873-99-2': {'green', 'oil', 'wood'},
 '6909-30-4': {'green'},
 '99-49-0': {'basil', 'caraway', 'fennel', 'mint'},
 '2244-16-8': {'caraway'},
 '876-17-5': {'flower', 'green'},
 '5989-27-5': {'citrus', 'mint'},
 '4674-50-4': {'grapefruit'},
 '2243-33-6': {'dill'},
 '10307-61-6': {'apple'},
 '69064-37-5': {'oil'},
 '85761-70-2': {'green', 'leaf'},
 '6728-26-3': {'apple', 'fat', 'green', 'leaf', 'rancid'},
 '13419-69-7': {'fat', 'must'},
 '928-95-0': {'green', 'leaf', 'walnut'},
 '18829-56-6': {'cucumber', 'fat', 'green'},
 '1576-96-1': {'mushroom'},
 '53448-07-0': {'fat', 'green', 'soap'},
 '928-97-2': {'fresh', 'moss'},
 '1197-07-5': {'caraway', 'solvent'},
 '14371-10-9': {'cinnamon', 'paint'},
 '09.04.5948': {'herb', 'warm'},
 '106-28-5': {'muguet'},
 '5273-85-8': {'flower', 'spice'},
 '5932-68-3': {'flower'},
 '4959-35-7':

*Research data*

In [194]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
OSA_research = Research_data['OSA'].unique()
OQ_research = Research_data['OQ'].unique()
OD_research = Research_data['OD'].unique()

In [195]:
Research_OD_OSA = Research_data.drop(columns=Research_data.columns[1])

In [196]:
flavors_list = []

for cas, flavors in mol_to_OD.items():
    for flavor in flavors:
        flavors_list.append({'CAS': cas, 'OD': flavor})

CAS_OD = pd.DataFrame(flavors_list)

In [255]:
# Function to perform fuzzy matching
def fuzzy_match(row):
    match = Research_OD_OSA[Research_OD_OSA['OD'].apply(lambda x: fuzz.ratio(row['OD'], str(x).lower())) >= 70]
    if len(match) > 0:
        return pd.Series([row['CAS'], match.iloc[0]['OD'], match.iloc[0]['OSA']])
    else:
        return pd.Series([row['CAS'], None, None])
    
# Apply fuzzy matching
result_df = CAS_OD.apply(fuzzy_match, axis=1)
result_df.columns = ['CAS', 'OD', 'OSA']


In [259]:
# remove Nones
result_df = result_df.dropna()
print(result_df)

              CAS            OD     OSA
1       3033-23-6        Sweaty  Feinty
2       3033-23-6         Roses  Floral
5      21284-22-0         Spicy   Spicy
9      29873-99-2         Woody   Woody
10     29873-99-2          Oily    Oily
...           ...           ...     ...
1331    2825-91-4  coconut milk   Nutty
1332     713-95-1        Sweaty  Feinty
1333     713-95-1        Fruity  Fruity
1334   20307-84-0         Woody   Woody
1335  120021-96-7          Oily    Oily

[566 rows x 3 columns]


In [260]:
# create df so that CAS numbers are key and their corresponding OSAs values
cas_osa_dict = result_df.groupby('CAS')['OSA'].agg(list).to_dict()
# we could find flavors for our 302 of our available 702 CAS numbers
len(cas_osa_dict)

436

In [261]:
# ms data
file_path = 'data/cas_intensities.csv'
data = pd.read_csv(file_path, delimiter=',')
# only use first repliate
data = data[data["replicate"]==1]
data_dist = data.copy()
# Remove columns where all entries are 0
data_dist = data_dist.loc[:, (data_dist != 0).any(axis=0)]
data_dist = data_dist.drop(columns=['sample_key', 'replicate'])
# normalize each cas number by dividing it with its mean
# todo zscore ?
data_norm = data_dist/data_dist.median()
data_norm.index = data['sample_key']

In [262]:
used_data = data_norm[data_norm.index=="000920"]
#filter columns that are 0
used_data.fillna(0)
used_data = used_data.loc[:, (used_data != 0).any(axis=0)]
used_data

Unnamed: 0_level_0,6485-40-1,3033-23-6,21284-22-0,38427-78-0,495-61-4,29873-99-2,6909-30-4,99-49-0,2244-16-8,876-17-5,...,13466-78-9,483-76-1,36564-42-8,705-86-2,2825-91-4,713-95-1,20307-84-0,120021-96-7,586-62-9,710-04-3
sample_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
920,0.87758,0.671727,0.728071,0.650334,1.147413,1.147413,0.846854,0.87758,0.87758,0.671727,...,0.97034,1.147413,0.728071,1.016826,1.016826,0.848177,1.147413,1.147413,0.97034,0.884654


In [263]:
intensities = used_data.T
intensities.columns=['Intensity']

In [264]:
# 727 CAS numbers from flavornet 
# cas_osa_dict => 302 CAS numbers could be matched with a flavor 
# 658 CAS numbers are in our ms data 
# only 270 overlapp dont know why should be 302
len(set(cas_osa_dict.keys()).intersection(used_data.columns))

398

In [265]:
filtered_data = used_data[set(cas_osa_dict.keys()).intersection(used_data.columns)]

  filtered_data = used_data[set(cas_osa_dict.keys()).intersection(used_data.columns)]


In [266]:
CAS_OD_OSA_filtered = result_df[result_df['CAS'].isin( filtered_data.columns)]


In [267]:
CAS_OD_OSA_Int = pd.merge(CAS_OD_OSA_filtered, intensities, left_on='CAS', right_index=True, how = 'left')

In [268]:
final_OSAs = CAS_OD_OSA_Int.groupby('OSA')['Intensity'].sum().reset_index()
final_OSAs

Unnamed: 0,OSA,Intensity
0,Cereal,3.622373
1,Feinty,56.545254
2,Floral,14.625754
3,Fruity,93.426694
4,Green/grassy,15.868273
5,Nutty,44.877557
6,Oily,23.210559
7,Peaty,0.586593
8,Soapy,8.956356
9,Solvently,2.514264


In [269]:
flavor_counts = result_df['OSA'].value_counts()

In [271]:
flavor_counts

Fruity          124
Feinty           74
Woody            58
Nutty            54
Spicy            52
Sulphury         37
Stale            35
Oily             28
Sweet            21
Floral           20
Sour             19
Green/grassy     19
Soapy            13
Cereal            6
Solvently         5
Peaty             1
Name: OSA, dtype: int64

In [272]:
final_OSAs

Unnamed: 0,OSA,Intensity
0,Cereal,3.622373
1,Feinty,56.545254
2,Floral,14.625754
3,Fruity,93.426694
4,Green/grassy,15.868273
5,Nutty,44.877557
6,Oily,23.210559
7,Peaty,0.586593
8,Soapy,8.956356
9,Solvently,2.514264


In [273]:
# Merge the DataFrames on 'OSA'
merged_OSA_norm = final_OSAs.merge(flavor_counts, left_on='OSA', right_index=True)
merged_OSA_norm= merged_OSA_norm.drop(columns='OSA_x')
merged_OSA_norm['norm'] = merged_OSA_norm['Intensity']/merged_OSA_norm['OSA_y']

In [274]:
# look at panelist 1
panelist_data_df = pd.read_table("data/vlb_data_combined.txt", delimiter=',', index_col = 1)
panelist_data_df = panelist_data_df[panelist_data_df["panelist"]==1]
panelist_data_df = panelist_data_df.drop(columns=["panelist", "date"])

In [275]:
merged_OSA_norm['panelist'] = merged_OSA_norm['OSA'].map(panelist_data_df.loc['000920'])


In [276]:
merged_OSA_norm

Unnamed: 0,OSA,Intensity,OSA_y,norm,panelist
0,Cereal,3.622373,6,0.603729,
1,Feinty,56.545254,74,0.764125,
2,Floral,14.625754,20,0.731288,40.0
3,Fruity,93.426694,124,0.753441,30.0
4,Green/grassy,15.868273,19,0.835172,
5,Nutty,44.877557,54,0.831066,
6,Oily,23.210559,28,0.828949,
7,Peaty,0.586593,1,0.586593,30.0
8,Soapy,8.956356,13,0.68895,
9,Solvently,2.514264,5,0.502853,


todo: for all