In [1]:
%load_ext autoreload
%autoreload 2
import os
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
from rickpy import ProgressBar
import pyrfume

In [59]:
# Root directory of the pyrfume data files; every path below is built from it.
DATA = pyrfume.DATA_DIR
# Maps a dataset label to the set of CIDs found in that dataset.
cids = dict()

## From Sigma Fragrance and Flavor Catalog (2014)

In [60]:
# Read the Sigma catalog table and record the distinct values of its 'CID'
# column under 'sigma-2014'. 0 is left out (presumably a placeholder for a
# compound with no CID -- confirm).
file_path = DATA / 'sigma_2014' / 'sigma.csv'
df = pd.read_csv(file_path)
cids['sigma-2014'] = set(df['CID']).difference({0})

## Dravnieks Atlas of Odor Character

In [61]:
# Read the Dravnieks table and record the distinct values of its 'CID'
# column under 'dravnieks-1985', leaving out 0.
file_path = DATA / 'dravnieks_1985' / 'dravnieks.csv'
df = pd.read_csv(file_path)
cids['dravnieks-1985'] = set(df['CID']).difference({0})

## Abraham et al, 2013

In [62]:
# Distinct non-zero 'CID' values from the Abraham table.
# NOTE(review): the directory says 2011 while the key says 2013 -- confirm
# which year is intended.
file_path = DATA / 'abraham_2011' / 'abraham-2011-with-CIDs.csv'
df = pd.read_csv(file_path)
cids['abraham-2013'] = set(df['CID']).difference({0})

## Bushdid et al, 2014

In [63]:
# Read the Bushdid table and record the distinct values of its 'CID'
# column under 'bushdid-2014', leaving out 0.
file_path = DATA / 'bushdid_2014' / 'bushdid.csv'
df = pd.read_csv(file_path)
cids['bushdid-2014'] = set(df['CID']).difference({0})

## Chae et al, 2019

In [64]:
# Read the Chae odorant table and record the distinct values of its 'CID'
# column under 'chae-2019', leaving out 0.
file_path = DATA / 'chae_2019' / 'odorants.csv'
df = pd.read_csv(file_path)
cids['chae-2019'] = set(df['CID']).difference({0})

## Prestwick

In [65]:
# Distinct non-zero 'CID' values from the Prestwick table.
file_path = DATA / 'prestwick' / 'prestwick.csv'
df = pd.read_csv(file_path)
cids['prestwick'] = set(df['CID']).difference({0})

## GRAS

In [66]:
# Distinct non-zero 'CID' values from the GRAS table.
file_path = DATA / 'GRAS' / 'gras.csv'
df = pd.read_csv(file_path)
cids['gras'] = set(df['CID']).difference({0})

## Sobel Lab (Weiss 2012, Snitz 2013)

In [67]:
# Distinct non-zero 'CID' values from the Snitz table, stored under the
# 'sobel-2013' label.
file_path = DATA / 'snitz_2013' / 'snitz.csv'
df = pd.read_csv(file_path)
cids['sobel-2013'] = set(df['CID']).difference({0})

## Leffingwell

In [68]:
# Distinct non-zero 'CID' values from the company_x_2019 table, stored
# under the 'leffingwell' label.
file_path = DATA / 'company_x_2019' / 'company_x.csv'
df = pd.read_csv(file_path)
cids['leffingwell'] = set(df['CID']).difference({0})

## Davison

In [69]:
# Distinct non-zero 'CID' values from the Davison-Katz table.
# index_col=0 makes the file's first column the row index, so 'CID' must be
# one of the remaining columns.
file_path = DATA / 'davison_2007' / 'davison-katz.csv'
df = pd.read_csv(file_path, index_col=0)
cids['davison-2007'] = set(df['CID']).difference({0})

## FragranceDB (FDB)

In [70]:
# Distinct non-zero 'CID' values from the FragranceDB CID list
# (parsed with read_csv defaults, so the file needs a 'CID' header).
file_path = DATA / 'fragrancedb' / 'FragranceDB_CIDs.txt'
df = pd.read_csv(file_path)
cids['fragrance-db'] = set(df['CID']).difference({0})

## Mainland

In [71]:
# Distinct non-zero 'CID' values from the Mainland odor cabinet table.
file_path = DATA / 'cabinets' / 'Mainland Odor Cabinet with CIDs.csv'
df = pd.read_csv(file_path)
cids['mainland-cabinet'] = set(df['CID']).difference({0})

In [72]:
# Distinct non-zero 'CID' values from the Mainland intensity odorant table.
file_path = DATA / 'mainland_intensity' / 'mainland-intensity-odorant-info.csv'
df = pd.read_csv(file_path)
cids['mainland-intensity'] = set(df['CID']).difference({0})

In [73]:
# Distinct non-zero 'CID' values from the tab-separated Mainland 2015 odor
# table. Rows with no CID are dropped first and the rest are cast to int.
file_path = DATA / 'mainland_2015' / 'Odors.tsv'
df = pd.read_csv(file_path, sep='\t')
cids['mainland-receptors'] = set(df['CID'].dropna().astype(int)).difference({0})

## Enantiomers

In [74]:
# Distinct non-zero 'CID' values from the enantiomers table.
file_path = DATA / 'shadmany' / 'enantiomers.csv'
df = pd.read_csv(file_path)
cids['enantiomers'] = set(df['CID']).difference({0})

## Haddad (just the clusters)

In [75]:
# Distinct non-zero 'CID' values from the Haddad cluster table.
file_path = DATA / 'haddad_2008' / 'haddad-clusters.csv'
df = pd.read_csv(file_path)
cids['haddad-2008'] = set(df['CID']).difference({0})

## U19 PIs

In [76]:
# NOTE(review): consider moving this import up to the notebook's first cell
# alongside the other imports.
from rickpy import get_sheet

# Identifiers of the two spreadsheets passed to get_sheet below.
gerkin_sheet = '1PlU4zHyRXtcI7Y-O6xYtlIyKoKk8hX1I9zfx8KFELdc'
u19_sheet = '1B2sEj9pCk2_zS1X1Cg2ulAB4E_BWPboJBSvH4Gwc8fs'

# One table per lab, each fetched by (sheet id, worksheet name) and indexed
# by its 'CID' column. The keys become the dataset labels in `cids`.
dfs = {
    'gerkin-cabinet': get_sheet(gerkin_sheet, 'gerkin-compounds').set_index('CID'),
    'smith-cabinet': get_sheet(gerkin_sheet, 'smith-compounds').set_index('CID'),
    'rinberg-glomeruli': get_sheet(u19_sheet, 'rinberg').set_index('CID'),
    'fleischmann-cabinet': get_sheet(u19_sheet, 'fleischmann').set_index('CID'),
    'datta-cabinet': get_sheet(u19_sheet, 'datta').set_index('CID'),
    'bozza-cabinet': get_sheet(u19_sheet, 'bozza').set_index('CID'),
}

In [77]:
# The sheet tables are indexed by CID, so each table's index (minus 0)
# is that dataset's CID set.
for name, df in dfs.items():
    cids[name] = set(df.index).difference({0})

## Goodscents

In [78]:
# Distinct non-zero 'CID' values from the Goodscents CID list.
# index_col=False stops pandas from using the first column as the row index.
file_path = DATA / 'goodscents' / 'goodscents_cids.txt'
df = pd.read_csv(file_path, index_col=False)
cids['goodscents'] = set(df['CID']).difference({0})

## Arctander

In [79]:
# Distinct non-zero 'CID' values from the Arctander CID list.
# index_col=False stops pandas from using the first column as the row index.
file_path = DATA / 'arctander_1960' / 'arctander_cids.txt'
df = pd.read_csv(file_path, index_col=False)
cids['arctander-1960'] = set(df['CID']).difference({0})

## Flavornet

In [80]:
# Distinct non-zero 'CID' values from the Flavornet table.
file_path = DATA / 'flavornet' / 'flavornet.csv'
df = pd.read_csv(file_path)
cids['flavornet'] = set(df['CID']).difference({0})

## Scott et al, 2014

In [81]:
# Distinct non-zero 'CID' values from the Scott 2014 data table.
file_path = DATA / 'scott_2014' / 'data.csv'
df = pd.read_csv(file_path)
cids['scott-2014'] = set(df['CID']).difference({0})

## Superscent

In [82]:
# Distinct non-zero 'CID' values from the Superscent CID list
# (parsed with read_csv defaults, so the file needs a 'CID' header).
file_path = DATA / 'superscent' / 'superscent_cids.txt'
df = pd.read_csv(file_path)
cids['superscent'] = set(df['CID']).difference({0})

## SenseLab

In [83]:
# Distinct non-zero 'CID' values from the SenseLab table.
file_path = DATA / 'senselab' / 'senselab.csv'
df = pd.read_csv(file_path)
cids['senselab'] = set(df['CID']).difference({0})

In [85]:
# Wakayama 2019 intensity data (this cell has no markdown header of its own).
# The file is tab-separated; keep the distinct non-zero 'CID' values.
file_path = DATA / 'wakayama_2019' / 'wakayama-intensity_with-CIDs.txt'
df = pd.read_csv(file_path, sep='\t')
cids['wakayama-2019'] = set(df['CID']).difference({0})

## Save

In [86]:
# Persist the whole label -> CID-set dictionary as a pickle under odorants/.
file_path = DATA / 'odorants' / 'cids.pkl'
with open(file_path, mode='wb') as f:
    pickle.dump(cids, f)

## Load

In [87]:
# Disabled: reads a pickle back from `file_path` into `cids2`.
# NOTE: pickle.load can execute arbitrary code, so only load files you
# wrote yourself.
#with open(file_path, 'rb') as f:
#    cids2 = pickle.load(f)

## Merge

In [89]:
# Build a CID x dataset membership table and write it to odorants/all_cids.csv.
# Rows are every CID seen in any dataset (sorted); columns are the dataset
# labels in `cids` (sorted); a cell holds 1 if that dataset lists the CID and
# 0 otherwise.
union_cids = set().union(*cids.values())

# Create the table already filled with integer zeros. Building an empty frame
# and calling .fillna(0) goes through object dtype and depends on fillna's
# implicit downcast, which recent pandas versions deprecate.
all_cids = pd.DataFrame(0, index=sorted(union_cids), columns=sorted(cids))
all_cids.index.name = 'CID'

# Mark membership one dataset (column) at a time.
for key, members in cids.items():
    all_cids.loc[list(members), key] = 1

file_path = DATA / 'odorants' / 'all_cids.csv'
all_cids.to_csv(file_path)

In [90]:
# (rows, columns): one row per distinct CID, one column per label in `cids`.
all_cids.shape

(9764, 29)