### checking if data fits together
#### check if odorants match witch flavornet data to at least 80 % 


In [2]:
import pandas as pd
import csv
from fuzzywuzzy import fuzz




*Research data*

In [3]:
Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
OSA_research = Research_data['OSA'].unique()
OQ_research = Research_data['OQ'].unique()
OD_research = Research_data['OD'].unique()

*Flavornet data*

In [6]:
# only using flavornet data
mol_to_OD = {}
molecules = []
descriptors = []
with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
    reader = csv.reader(tabfile, delimiter=',')
    for row in reader: 
        molecule = row[0]
        OD = row[3]
        molecules.append(molecule)
        descriptors.append(OD)
        if molecule in mol_to_OD:
            mol_to_OD[molecule].add(OD)
        else:
            mol_to_OD[molecule]  = set([OD])

molecules = list(set(molecules))
descriptors = list(set(descriptors))

In [11]:
len(OD_research)

142

In [9]:
len(descriptors)

197

In [13]:
'''
find overlapp between to lists with a threshold (=70)
'''
def overlapping_elements(list1, list2, threshold=70):
    overlap = []
    overlap_tuples= []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
                overlap_tuples.append((elem1,elem2))
    return overlap_tuples

In [14]:
overlapping_pairs = overlapping_elements(OD_research,descriptors)


In [15]:
overlapping_pairs

[('stale tobacco', 'tobacco'),
 ('Sweaty', 'sweat'),
 ('popcorn', 'popcorn'),
 ('tomato plants', 'tomato leaf'),
 ('Green peppers', 'green pepper'),
 ('cucumber', 'cucumber'),
 ('lettuce', 'lettuce'),
 ('violets', 'violet'),
 ('lavender', 'lavender'),
 ('Apples', 'apple'),
 ('pears', 'pear'),
 ('pears', 'pea'),
 ('bananas', 'banana'),
 ('Lemons', 'lemon'),
 ('oranges', 'orange'),
 ('grapefruit', 'grapefruit'),
 ('esters', 'ester'),
 ('esters', 'ether'),
 ('vanilla pods', 'vanilla'),
 ('custard', 'mustard'),
 ('Butterscotch', 'butterscotch'),
 ('brown sugar', 'brown sugar'),
 ('brown sugar', 'burnt sugar'),
 ('sawdust', 'dust'),
 ('resinous', 'resin'),
 ('toasted oak', 'roasted meat'),
 ('coconut milk', 'hot milk'),
 ('coconut milk', 'coconut'),
 ('Almond', 'almond'),
 ('Almond', 'lemon'),
 ('hazelnuts', 'hazelnut'),
 ('cocoa', 'cocoa'),
 ('coffee', 'coffee'),
 ('roasted beans', 'roasted meat'),
 ('roasted beans', 'roasted nut'),
 ('roasted beans', 'roast beef'),
 ('Clove', 'clove'),
 (

In [16]:
len(overlapping_pairs)

59

In [66]:
len(overlapping_pairs)/len(OD_research)

0.375

##### another database is needed to match odorants from research institute

### panelist data
#### get panelist data with largest variance

In [3]:
import pandas as pd
file_path = 'data/vlb_data_combined.txt'
data = pd.read_csv(file_path, delimiter=',')
# only use first repliate
data

Unnamed: 0,panelist,samplekey,floral,fruit,wood,complexity,malt,sweet,peat,dried_fruits,aftertaste,date
0,1,000656,50,20,60,50,30,30,50,40,40,2023-01-12
1,1,000657,30,40,60,40,40,30,10,30,60,2023-01-12
2,1,000658,30,50,50,50,40,40,10,40,70,2023-01-12
3,1,000661,60,70,60,50,50,60,0,40,60,2023-01-12
4,1,000662,40,60,50,50,50,40,0,30,60,2023-01-12
...,...,...,...,...,...,...,...,...,...,...,...,...
1241,3,000926,40,30,40,40,30,20,40,20,60,2022-10-17
1242,3,000927,40,70,60,70,40,50,0,40,80,2022-10-17
1243,3,000929,20,60,30,40,30,50,0,40,30,2022-10-17
1244,3,000930,10,30,60,70,30,70,0,40,60,2022-10-17


In [8]:
import itertools
# find rows with biggest variances
df = data.drop(columns=["panelist","samplekey","date"])
row_combinations = list(itertools.combinations(df.index, 2))
distances = [abs(df.loc[i[0]] - df.loc[i[1]]).sum() for i in row_combinations]
max_distance_indices = row_combinations[distances.index(max(distances))]
print(df.loc[max_distance_indices[0]])
print(df.loc[max_distance_indices[1]])
#=> sample: 000673 and 000794 but both not in masspec

floral          60
fruit           20
wood            50
complexity      30
malt            20
sweet           20
peat            80
dried_fruits    10
aftertaste      40
Name: 12, dtype: int64
floral          40
fruit           60
wood            80
complexity      70
malt            60
sweet           70
peat             0
dried_fruits    60
aftertaste      80
Name: 159, dtype: int64


In [11]:
data.iloc[159]

panelist                 2
samplekey           000794
floral                  40
fruit                   60
wood                    80
complexity              70
malt                    60
sweet                   70
peat                     0
dried_fruits            60
aftertaste              80
date            2023-01-12
Name: 159, dtype: object

In [10]:
data.iloc[12]

panelist                 1
samplekey           000673
floral                  60
fruit                   20
wood                    50
complexity              30
malt                    20
sweet                   20
peat                    80
dried_fruits            10
aftertaste              40
date            2023-01-12
Name: 12, dtype: object