In [85]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import csv
from sklearn.preprocessing import Normalizer

def dict_to_matrix(dictionary):
    '''
    Does convert a pandas dataframe into a matrix where 
    Parameters
    ----------
    dictionary : dict,
        keys = row name
        values = col name
    data : int, either 0 or 1 
    '''
    unique_values = set(val for sublist in dictionary.values() for val in sublist)
    data_dict = {key: {val: 1 if val in values else 0 for val in unique_values} for key, values in dictionary.items()}
    df = pd.DataFrame(data_dict).T
    return(df)


def overlapping_elements(list1, list2, threshold=70):
    '''
    Find overlapp between to lists with a threshold (=70)
    
    Parameters
    ----------
    list1: list
    list2: list
    threshold: int, fixed to 70
    '''
    overlap = []
    for elem1 in list1:
        for elem2 in list2:
            ratio = fuzz.ratio(elem1, elem2)
            if ratio >= threshold:
                overlap.append(elem1)
    return overlap


def dataframe_to_dict(df):
    '''
    Convert DataFrame to dictionary

    Parameters
    ----------
    df: dataframe
    '''
    result_dict = {}
    for index, row in df.iterrows():
        if row[0] in result_dict:
            result_dict[row[0]].append(row[1])
        else:
            result_dict[row[0]] = [row[1]]
    return result_dict     


def find_matching_entries(dictionary,df):
    '''
    Find match between dictionary entries and the rows of a dataframe
    
    Parameters:
    -----------
    dictionary: dict
    df: dataframe
    '''
    threshold = 80
    found_match = []
    found_match_per_entry = {}
    for key, value in dictionary.items():
        all_matches = set()
        for index, row in df.iterrows():
            for keyword in value:
                if fuzz.ratio(keyword, str(index).lower()) >= threshold:
                    all_matches.add(index)
                    found_match.append(row)
        found_match_per_entry[key] = all_matches
    found_match = pd.DataFrame(found_match)
    found_match = found_match.drop_duplicates()
    #remove empty entries
    '''found_match = found_match.loc[:, (found_match != 0).any(axis=0)]
    found_match_per_entry = {key: value for key, value in found_match_per_entry.items() if value}
    '''
    return(found_match, found_match_per_entry)


def create_matrix_from_dict(dictionary,df,ms_cas):
    '''
    Create a new matrix by finding matches and filtering right columns
    
    Parameters:
    -----------
    dictionary: dict
    df: dataframe
    '''
    matrix = pd.DataFrame(columns=list(df), index=ms_cas)
    final_dict = {}
    for key, value in dictionary.items():
        current_val = []
        for v in value:
            for index, row in df.iterrows():
                if index == v:
                    current_val.append(','.join(row.loc[row.eq(1)].index.tolist()))
                    # todo: maybe exception -> do change if two ones in a row
                    matrix.loc[key][row.loc[row.eq(1)].index.tolist()] = 1
        final_dict[key] = set(current_val)
    matrix = matrix.fillna(0)
    matrix = matrix.loc[:, (matrix != 0).any(axis=0)]
    return(final_dict, matrix)


def read_massspec_data(file_path, replicate, sample_key, n):
    ''''
    Get a dataframe with cas numbers and largest intensities (n times)
    
    Parameters:
    ----------
    file_path: string, path to file
    replicate: int, 1 2 or 3
    sample_key: str
    n: int, nlargest
    '''
    data = pd.read_csv(file_path, delimiter=',')
    data = data[data["sample_key"]== sample_key]
    data = data[data["replicate"] == replicate]
    # filter columns that are 0 
    data = data.loc[:, (data != 0).any(axis=0)]
    data = data.drop(columns=['sample_key', 'replicate'])
    largest_entries = data.iloc[0].nlargest(n)
    data_top_entries = pd.DataFrame({'Intensities':largest_entries})
    return (data_top_entries)


def flavornet_dataframe():
    '''
    read flavornet data and save it in mol_to_OD
    '''
    mol_to_OD = {}
    descriptors = []
    CAS_numbers= []
    with open('./data/CAS_mol_OD.csv', 'r',encoding='utf-8') as tabfile:
        reader = csv.reader(tabfile, delimiter=',')
        for row in reader: 
            molecule = row[0]
            CAS = row[1]
            OD = row[3]
            CAS_numbers.append(CAS)
            descriptors.append(OD)
            if CAS in mol_to_OD:
                mol_to_OD[CAS].add(OD)
            else:
                mol_to_OD[CAS]  = set([OD])
    CAS_numbers = list(set(CAS_numbers))
    descriptors = list(set(descriptors))
    D = pd.DataFrame(index=CAS_numbers, columns=descriptors)
    for col in D.columns:
        for index, row in D.iterrows():
            # Check if value in  current row matches any entry in the dictionary
            if index in mol_to_OD.keys() and col in mol_to_OD[index]:
                # Set to 1
                D.at[index, col] = 1
    D = D.fillna(0)
    return (D,mol_to_OD)


def get_ms_cas(sample_key, n):
    replicate = 1
    file_path = 'data/cas_intensities.csv'
    top_cas = read_massspec_data(file_path, replicate, sample_key,n)
    ms_cas = list(top_cas.index)
    I_vector = top_cas
    return(ms_cas, I_vector)

def match_mol_with_flavors(ms_cas,mol_to_OD):
    flavors = []
    mol_flavors = {}
    for mol in ms_cas:
        if mol in mol_to_OD.keys():
            flavors += mol_to_OD[mol]
            mol_flavors[mol] = mol_to_OD[mol]
        else:
            mol_flavors[mol] = {}
    found_flavors = set(flavors)
    return(found_flavors,mol_flavors)

def OD_X_OQ():
    Research_data = pd.read_csv('./data/swri_flavorwheel.csv', sep=';')
    Research_OQ_OD =  Research_data.drop(columns=Research_data.columns[0])
    Research_OSA_OQ = Research_data.drop(columns=Research_data.columns[2])
    OD_OQ = dict_to_matrix(dataframe_to_dict(Research_OQ_OD)).T
    OQ_OSA = dict_to_matrix(dataframe_to_dict(Research_OSA_OQ)).T
    # filter out rows with only 0
    OD_OQ = OD_OQ.loc[(OD_OQ.iloc[:,1:] != 0).any(axis=1)]
    OQ_OSA = OQ_OSA.loc[(OQ_OSA.iloc[:,1:] != 0).any(axis=1)]
    return(OD_OQ,OQ_OSA)


def I_Vector(I_vector, ms_cas,OSAs):
    normalizer = Normalizer()
    I_vector_normalized = normalizer.fit_transform(I_vector.T).T
    final_OSA = OSAs*I_vector_normalized
    final_OSA_plotting = final_OSA.sum()*10
    return (final_OSA_plotting)


In [86]:
D, mol_to_OD = flavornet_dataframe()
OD_OQ, OQ_OSA = OD_X_OQ()

In [93]:
def main(sample_key,n):
    ms_cas,I_vector = get_ms_cas(sample_key,n)
    
    found_flavors,mol_flavors = match_mol_with_flavors(ms_cas,mol_to_OD)
    print("1) ODs found")
    
    # find matching OQs to found ODs from massspec x flavornet
    matching_OQs, matching_OQs_dict = find_matching_entries(mol_flavors, OD_OQ)
    OQ_dict, OQs = create_matrix_from_dict(matching_OQs_dict, matching_OQs,ms_cas)
    print("2) OQs found")

    # find matching OQs to found ODs from massspec x flavornet
    matching_OSAs, matching_OSAs_dict = find_matching_entries(OQ_dict, OQ_OSA)
    OSA_dict, OSAs = create_matrix_from_dict(matching_OSAs_dict, matching_OSAs,ms_cas)
    print("3) OSAs found")

    final_OSA_plotting = I_Vector(I_vector, ms_cas, OSAs)
    return OSAs, I_vector,final_OSA_plotting

In [192]:
n = 500
OSA_sample_000920, intensities_000920, OSA_plotting_000920 = main("000920",n)
OSA_sample_000586, intensities_000586, OSA_plotting_000586= main("000586",n)
OSA_sample_000636, intensities_000636, OSA_plotting_000636 = main("000636",n)


1) ODs found
2) OQs found
3) OSAs found
1) ODs found
2) OQs found
3) OSAs found
1) ODs found
2) OQs found
3) OSAs found


In [193]:
OSA_plotting_000920

feinty           1.049422
cereal           1.053850
Green/grassy     4.758840
Floral           5.908917
Fruity           4.999598
Solvently        0.721703
Soapy            2.459215
Sweet           14.679889
Woody           10.047294
Nutty            0.492403
Spicy            1.171535
Oily             0.172035
Sour             0.701266
Sulphury         0.281796
Stale            1.982450
dtype: float64

In [194]:
OSA_plotting_000586

feinty           1.140632
cereal           0.794678
Green/grassy     4.382607
Floral           6.383876
Fruity           4.643741
Solvently        0.702364
Soapy            3.945067
Sweet           14.824697
Woody           10.355813
Nutty            0.649029
Spicy            1.140950
Oily             0.418161
Sour             0.621812
Sulphury         0.762601
Stale            2.854883
dtype: float64

In [195]:
OSA_plotting_000636

feinty           0.966572
cereal           0.863722
Green/grassy     4.536102
Floral           6.328094
Fruity           4.781889
Solvently        0.634837
Soapy            3.242430
Sweet           14.445475
Woody            9.322571
Nutty            0.553897
Spicy            1.184107
Oily             0.346244
Sour             0.515766
Sulphury         0.811548
Stale            2.767769
dtype: float64

##### *compare only overlap*

meaning only those OSAs: "floral", "fruit", "sweet", "wood", "peat"

In [196]:
overlap_OSAs = ["floral", "fruit", "sweet", "wood", "peat"]
OSA_compare_000920 = find_overlapping_profiles(OSA_plotting_000920, overlap_OSAs)
panelist_compare_000920 = [4,3,4,5,0]
compare_profiles(OSA_compare_000920.T, panelist_compare_000920)





In [200]:
OSA_compare_000636 = find_overlapping_profiles(OSA_plotting_000636, overlap_OSAs)
panelist_compare_000636 = [4,5,4,5,0]
compare_profiles(OSA_compare_000636, panelist_compare_000636)






In [202]:
OSA_sample_000633, intensities_000633, OSA_plotting_000633 = main("000633",n)


1) ODs found
2) OQs found
3) OSAs found


In [204]:
OSA_compare_000633 = find_overlapping_profiles(OSA_plotting_000633, overlap_OSAs)
panelist_compare_000633 = [3,4,9,3,4]
#["floral", "fruit", "sweet", "wood", "peat"]
compare_profiles(OSA_compare_000633, panelist_compare_000633)






In [153]:
# def find_overlapping_profiles(final_OSA,comparable_OSAs):
#     # show only overlap meaning: floral, fruit, sweet, wood, peat
#     filtered_OSAs = pd.DataFrame(columns=comparable_OSAs, index = ["intensities"])
#     threshold = 80
#     for index in final_OSA.index:
#         for comparable_OSA in comparable_OSAs:
#             if fuzz.ratio(str(index).lower(), comparable_OSA) >= threshold:
#                 filtered_OSAs[comparable_OSA] = final_OSA[index]
#     return(filtered_OSAs)


In [179]:

def find_overlapping_profiles(final_OSA,comparable_OSAs):
    # show only overlap meaning: floral, fruit, sweet, wood, peat
    new_series =pd.Series() 
    threshold = 80
    for comparable_OSA in comparable_OSAs:
        for index in final_OSA.index:
            if fuzz.ratio(str(index).lower(), comparable_OSA) >= threshold:
                new_series[comparable_OSA] = final_OSA[index]
                break
            else:
                new_series[comparable_OSA] = 0
    return(new_series)


*OSAs from panelist:* floral, fruit, wood, complexity, malt, sweet, peat, dried_fruits

*OSAs from flavorwheel:* floral,fruity, solventy, soapy, sweet, woody, nutty, spicy, oily, sour, sulphury, stale, peaty, feinty, cereal, green/grassy 

*Overlapp:* floral, fruit,sweet, wood, peat

In [52]:
#['Green/grassy', 'Floral', 'Fruity ', 'Sweet', 'Woody', 'Spicy']
panelist_scores_000920 = [0,4,3,4,5,0]
panelist_scores_000920 = []
panelist_scores_000920 = []


##### *compare OSAs with available panelist data for found MS-OSAs*

meaning MS-OSA flavorprofile + a part of the panelist profile

In [98]:
compare_profiles(OSA_plotting_000920, panelist_scores_000920)

In [155]:
import plotly.graph_objects as go

#only compare only columns from final_OSA
def compare_profiles(final_OSA, panelists):
    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=panelists,
        theta=list(final_OSA.index),
        fill='toself',
        name="panelist"
    ))

    fig.add_trace(go.Scatterpolar(
    r=(np.array(final_OSA)),
    theta=list(final_OSA.index),
    fill='toself',
    name= "MS_prediction"))

    fig.update_layout(
    polar=dict(
        radialaxis=dict(
        visible=True,
        range=[0, 10]
        )),
    showlegend=False
    )

    fig.show()

#### todo: compare panelist data with available MS - OSAs