# FooDB.ca Data Analysis

<a id = 'top'></a>

# Contents

- [Libraries and Functions]
- [Exploratory Data Analysis](#eda)
 - [Foods](#eda_foods)
 - [Enzymes](#eda_enzymes)
 - [Flavors](#eda_flavors)
 - [Health effects](#eda_health_effects)
 - [Nutrients](#eda_nutrients) (Dropped)
 - [Pathways](#eda_pathways)
 - [References](#eda_ref) (Dropped)
 - [Food taxonomies](#eda_food_tax)
 - [Compounds](#eda_compounds)
- [Dropping columns](#droppingcolumns)
 - [Foods](#dropcol_foods)
 - [Enzymes](#dropcol_enzymes)
 - [Flavors](#dropcol_flavors)
 - [Health effects](#dropcol_health_effects)
 - [Pathways](#dropcol_pathways)
 - [Food taxonomies](#dropcol_food_tax)
   - [Test query of morels](#test_query_ft)
 - [Compounds](#dropcol_compounds)
- [Dropping whole tables](#dropping_tables)

# Libraries and Functions

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
#For the ORM:
import sqlalchemy
from sqlalchemy import create_engine #To connect to the database
from sqlalchemy.orm import Session, sessionmaker #To interact with the database as object
from sqlalchemy import inspect #To look at tables and column names

In [None]:
#import plotly.plotly as py
#import plotly.figure_factory as ff

In [None]:
#needs plotly password
#df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/school_earnings.csv")

#table = ff.create_table(df)
#py.iplot(table, filename='jupyter-table1')

In [2]:
#Instantiate a session of the ORM
#Connect to the sqlite database file
engine = create_engine('sqlite:///foodb.db', echo = True)
#Bind a session factory to the engine
Session = sessionmaker(bind=engine)
#Instantiate a session
session = Session()

In [3]:
#An object to get a first look at the database
inspector = inspect(engine)

2019-03-17 09:22:55,781 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2019-03-17 09:22:55,781 INFO sqlalchemy.engine.base.Engine ()
2019-03-17 09:22:55,781 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2019-03-17 09:22:55,793 INFO sqlalchemy.engine.base.Engine ()


In [27]:
#Make a dict of list of names of all the columns in all the tables
#for easy reference

#Get list of all table names
table_names = inspector.get_table_names()

table_columns = dict()
#For each table name
for table in table_names:
    column_names = []
    #for each column in the table, get just the name
    for column in inspector.get_columns(table):
        column_names.append(column['name'])
    table_columns[table] = column_names

In [17]:
def get_value_from_cols(table, value):
    for column in table.columns:
        print(column, table[table[column] == value].shape[0])

<a id = 'eda'></a>

[(Back to top)](#top)

# Exploratory Data Analysis

## Trying to find the link from foods to compounds

In [28]:
table_names

['compound_alternate_parents',
 'compound_external_descriptors',
 'compound_substituents',
 'compound_synonyms',
 'compounds',
 'compounds_enzymes',
 'compounds_flavors',
 'compounds_health_effects',
 'compounds_pathways',
 'enzymes',
 'flavors',
 'food_taxonomies',
 'foodcomex_compound_providers',
 'foodcomex_compounds',
 'foods',
 'health_effects',
 'nutrients',
 'pathways',
 'references']

<a id = 'eda_foods'></a>

In [6]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [8]:
compounds = pd.read_sql_query('''SELECT c.* 
                                FROM compounds c
                                ;''', engine)
references = pd.read_sql_query('''SELECT r.* 
                                FROM [references] r
                                ;''', engine)
nutrients = pd.read_sql_query('''SELECT n.* 
                                FROM nutrients n
                                ;''', engine)

2019-03-17 09:25:21,500 INFO sqlalchemy.engine.base.Engine SELECT c.* 
                                FROM compounds c
                                ;
2019-03-17 09:25:21,500 INFO sqlalchemy.engine.base.Engine ()
2019-03-17 09:25:22,912 INFO sqlalchemy.engine.base.Engine SELECT r.* 
                                FROM [references] r
                                ;
2019-03-17 09:25:22,914 INFO sqlalchemy.engine.base.Engine ()
2019-03-17 09:25:23,130 INFO sqlalchemy.engine.base.Engine SELECT n.* 
                                FROM nutrients n
                                ;
2019-03-17 09:25:23,131 INFO sqlalchemy.engine.base.Engine ()


In [9]:
compounds.head()


Unnamed: 0,id,legacy_id,type,public_id,name,export,state,annotation_quality,description,cas_number,...,superklass,klass,subklass,direct_parent,molecular_framework,chembl_id,chemspider_id,meta_cyc_id,foodcomex,phytohub_id
0,1,1,SmallMoleculeCompound,FDB000001,Mulberrofuran P,True,,low,Constit. of Morus alba (white mulberry) [CCD],101365-02-0,...,Phenylpropanoids and polyketides,2-arylbenzofuran flavonoids,,2-arylbenzofuran flavonoids,Aromatic heteropolycyclic compounds,,,,,
1,4,4,SmallMoleculeCompound,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),True,,low,Constit. of the leaves of Nymphaea alba [CCD],350602-26-5,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
2,13,22,SmallMoleculeCompound,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),True,,low,Constit. of Phragmites australis [CCD],216692-08-9,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
3,14,23,SmallMoleculeCompound,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),True,liquid,low,,,...,,,,,,,,,,
4,22,36,SmallMoleculeCompound,FDB000022,Cyanidin 3-O-(6''-acetyl-arabinoside),True,,low,A polyphenol compound found in foods of plant ...,,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,


In [10]:
compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28771 entries, 0 to 28770
Columns: 106 entries, id to phytohub_id
dtypes: object(106)
memory usage: 23.3+ MB


In [24]:
get_value_from_cols(compounds, '')

id 0
legacy_id 9440
type 0
public_id 0
name 0
export 0
state 19602
annotation_quality 1300
description 5479
cas_number 10558
melting_point 21845
protein_formula 28771
protein_weight 28771
experimental_solubility 27885
experimental_logp 27881
hydrophobicity 28771
isoelectric_point 28771
metabolism 28770
kegg_compound_id 22120
pubchem_compound_id 10917
pubchem_substance_id 28766
chebi_id 26241
het_id 27797
uniprot_id 28771
uniprot_name 28771
genbank_id 28771
wikipedia_id 24269
synthesis_citations 28771
general_citations 28600
comments 28295
protein_structure_file_name 28771
protein_structure_content_type 28771
protein_structure_file_size 28771
protein_structure_updated_at 28771
msds_file_name 27318
msds_content_type 27318
msds_file_size 27318
msds_updated_at 27318
creator_id 28695
updater_id 22203
created_at 0
updated_at 0
phenolexplorer_id 27994
dfc_id 13712
hmdb_id 8011
duke_id 22578
drugbank_id 28078
bigg_id 27895
eafus_id 25821
knapsack_id 23957
boiling_point 26467
boiling_point_refe

In [None]:
phenolexplorer_id 27994
dfc_id 13712
hmdb_id 8011
duke_id 22578
drugbank_id 28078
bigg_id 27895
eafus_id 25821
knapsack_id 23957
flavornet_id 28040
goodscent_id 26173
superscent_id 28354

In [33]:
compounds['kegg_compound_id'].value_counts()[:5]

          22120
C00350      960
C00157      956
C00626      378
C00422      119
Name: kegg_compound_id, dtype: int64

In [25]:
compounds['structure_source'].value_counts()

HMDB_from_dfc          8463
HMDB                   7723
DFC                    4504
                       4311
MANUAL                 2285
BIOSPIDER               991
HMDB_from_biospider     284
MANUAL_from_darndt      165
FOODCOMEX                24
DataWrangler             21
Name: structure_source, dtype: int64

Could structure source be helpful here? Maybe workign together with the various id columns?

In [83]:
compounds['compound_source'].value_counts()

DFC               14733
HMDB               7765
DUKE               3108
MANUAL             1279
EAFUS               819
KNAPSACK            466
PHENOLEXPLORER      404
FLAVORNET           140
FOODCOMEX            24
SUPERSCENT           19
OTHER                10
TABLES                4
Name: compound_source, dtype: int64

In [85]:
compounds[compounds['dfc_id'] != '']

Unnamed: 0,id,legacy_id,type,public_id,name,export,state,annotation_quality,description,cas_number,...,superklass,klass,subklass,direct_parent,molecular_framework,chembl_id,chemspider_id,meta_cyc_id,foodcomex,phytohub_id
0,1,1,SmallMoleculeCompound,FDB000001,Mulberrofuran P,TRUE,,low,Constit. of Morus alba (white mulberry) [CCD],101365-02-0,...,Phenylpropanoids and polyketides,2-arylbenzofuran flavonoids,,2-arylbenzofuran flavonoids,Aromatic heteropolycyclic compounds,,,,,
1,4,4,SmallMoleculeCompound,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),TRUE,,low,Constit. of the leaves of Nymphaea alba [CCD],350602-26-5,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
2,13,22,SmallMoleculeCompound,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),TRUE,,low,Constit. of Phragmites australis [CCD],216692-08-9,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
13,52,69,SmallMoleculeCompound,FDB000052,Pinotin A,TRUE,,low,Isol. from red wine incl. Pinotage (CCD),663910-41-6,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Flavonoid-3-O-glycosides,Aromatic heteropolycyclic compounds,,10286568,,,
15,59,76,SmallMoleculeCompound,FDB000059,Peonidin 3-(6''-malonyl-glucoside),TRUE,,low,A polyphenol compound found in foods of plant ...,,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Anthocyanidin-3-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
25,78,95,SmallMoleculeCompound,FDB000078,Luteolinidin,TRUE,,low,Luteolinidin is a chemical compound belonging ...,1154-78-5,...,Phenylpropanoids and polyketides,Flavonoids,Hydroxyflavonoids,7-hydroxyflavonoids,Aromatic heteropolycyclic compounds,CHEMBL1275834,390308,CPD-11945,,
32,86,107,SmallMoleculeCompound,FDB000086,3-Hydroxyphloretin 2'-O-glucoside,TRUE,,low,,,...,,,,,,,,,,
33,87,108,SmallMoleculeCompound,FDB000087,3-Hydroxyphloretin,TRUE,,low,A polyphenol compound found in foods of plant ...,,...,Phenylpropanoids and polyketides,"Linear 1,3-diarylpropanoids",Chalcones and dihydrochalcones,Chalcones and dihydrochalcones,Aromatic homomonocyclic compounds,CHEMBL492818,9953627,,,
67,173,295,SmallMoleculeCompound,FDB000173,"Quercetin 3,4',7-triglucoside",TRUE,,low,Constit. of Allium cepa (red onion). Also obt....,133563-23-2,...,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,Flavonoid-7-O-glycosides,Aromatic heteropolycyclic compounds,,,,,
68,176,299,SmallMoleculeCompound,FDB000176,"3,7-Dimethylquercetin",TRUE,,low,Isol. from various plants incl. many Asteracea...,2/2/68,...,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,7-O-methylated flavonoids,Aromatic heteropolycyclic compounds,CHEMBL164861,4444090,345-TRIHYDROXY-37-DIMETHOXYFLAVONE,,


In [62]:
#Are any of the legacy_ids shared between compounds and nutrients?
#No.
compounds_legacy_ids = list(compounds['legacy_id'])
nutrients_legacy_ids = list(nutrients['legacy_id'])
shared_compnutr_legacy_ids = []
#for sub in subgroups:
#    if any(generic in sub for generic in generics):
#        generic_subgroups.append(sub)
for comp in compounds_legacy_ids:
    for nutr in nutrients_legacy_ids:
        if comp == nutr:
            shared_compnutr_legacy_ids.append(comp)

In [61]:
#38 shared compound/nutrient legacy ids, which means 38 compounds are nutrientsbbb
len(shared_compnutr_legacy_ids)

0

In [64]:
foodcomex_compounds = pd.read_sql_query('''SELECT fcc.* 
                            FROM foodcomex_compounds fcc
                            ;''', engine)

2019-03-17 10:09:31,085 INFO sqlalchemy.engine.base.Engine SELECT fcc.* 
                            FROM foodcomex_compounds fcc
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT fcc.* 
                            FROM foodcomex_compounds fcc
                            ;


2019-03-17 10:09:31,085 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [65]:
foodcomex_compounds.head()

Unnamed: 0,id,compound_id,origin,storage_form,maximum_quantity,storage_condition,contact_name,contact_address,contact_email,created_at,...,production_method_reference_text,production_method_reference_file_name,production_method_reference_content_type,production_method_reference_file_size,production_method_reference_updated_at,elemental_formula,minimum_quantity,quantity_units,available_spectra,storage_conditions
0,1,9021,commercial,solid,5,temperature: -80 oC,Dr. David Wishart,Departments of Computing Science and Biologica...,dwishart@ualberta.ca,2015-10-09 22:27:11,...,,,,,,,,g,,---\n\-80Â°C\: 1
1,2,12167,commercial,solid,6,temperature: -80 oC,Dr. David Wishart,Departments of Computing Science and Biologica...,dwishart@ualberta.ca,2015-10-09 22:27:12,...,,,,,,,,g,,---\n\-80Â°C\: 1
2,3,22309,commercial,solid,1,temperature: -80 oC,Dr. David Wishart,Departments of Computing Science and Biologica...,dwishart@ualberta.ca,2015-10-09 22:27:13,...,,,,,,,,g,,---\n\-80Â°C\: 1
3,4,22088,commercial,solid,8,temperature: -80 oC,Dr. David Wishart,Departments of Computing Science and Biologica...,dwishart@ualberta.ca,2015-10-09 22:27:13,...,,,,,,,,g,,---\n\-80Â°C\: 1
4,5,12686,commercial,solid,1,temperature: -80 oC,Dr. David Wishart,Departments of Computing Science and Biologica...,dwishart@ualberta.ca,2015-10-09 22:27:13,...,,,,,,,,g,,---\n\-80Â°C\: 1


In [67]:
foodcomex_compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 36 columns):
id                                          1044 non-null object
compound_id                                 1044 non-null object
origin                                      1044 non-null object
storage_form                                1044 non-null object
maximum_quantity                            1044 non-null object
storage_condition                           1044 non-null object
contact_name                                1044 non-null object
contact_address                             1044 non-null object
contact_email                               1044 non-null object
created_at                                  1044 non-null object
updated_at                                  1044 non-null object
export                                      1044 non-null object
purity                                      1044 non-null object
description                                 1044

In [69]:
get_value_from_cols(foodcomex_compounds, 'NULL')

id 0
compound_id 0
origin 0
storage_form 0
maximum_quantity 33
storage_condition 22
contact_name 23
contact_address 23
contact_email 23
created_at 0
updated_at 0
export 0
purity 1005
description 1005
spectra_details 1028
delivery_time 1005
stability 1005
admin_user_id 0
public_id 0
cas_number 0
taxonomy_class 0
taxonomy_family 0
experimental_logp 0
experimental_solubility 0
melting_point 0
food_of_origin 0
production_method_reference_text 0
production_method_reference_file_name 1043
production_method_reference_content_type 1043
production_method_reference_file_size 1043
production_method_reference_updated_at 1043
elemental_formula 0
minimum_quantity 0
quantity_units 0
available_spectra 1005
storage_conditions 0


In [74]:
#Do food comex compound compound_ids match the compounds.ids?
#Yes. This accounts for 1044 compounds.
compounds_from_foodcomex = pd.read_sql_query('''SELECT c.*
                FROM compounds c
                JOIN foodcomex_compounds fcc ON fcc.compound_id == c.id
                WHERE c.id == fcc.compound_id
                ;''', engine)

2019-03-17 10:15:17,791 INFO sqlalchemy.engine.base.Engine SELECT c.*
                FROM compounds c
                JOIN foodcomex_compounds fcc ON fcc.compound_id == c.id
                WHERE c.id == fcc.compound_id
                ;


INFO:sqlalchemy.engine.base.Engine:SELECT c.*
                FROM compounds c
                JOIN foodcomex_compounds fcc ON fcc.compound_id == c.id
                WHERE c.id == fcc.compound_id
                ;


2019-03-17 10:15:17,793 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [76]:
compounds_from_foodcomex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Columns: 106 entries, id to phytohub_id
dtypes: object(106)
memory usage: 864.6+ KB


In [30]:
pathways = pd.read_sql_query('''SELECT p.* 
                            FROM pathways p
                            ;''', engine)

2019-03-17 09:43:39,275 INFO sqlalchemy.engine.base.Engine SELECT p.* 
                            FROM pathways p
                            ;
2019-03-17 09:43:39,276 INFO sqlalchemy.engine.base.Engine ()


In [31]:
#Possible link between kegg_mag_id and compounds.kegg_compound_id, but only for about 3000 compounds
pathways.head()

Unnamed: 0,id,smpdb_id,kegg_map_id,name,created_at,updated_at
0,1,SMP00006,map00350,Tyrosine Metabolism,2015-02-23 22:40:48,2015-02-23 22:40:48
1,2,SMP00068,map00150,Androgen and Estrogen Metabolism,2015-02-23 22:40:58,2015-02-23 22:40:58
2,3,SMP00011,map00562,Inositol Metabolism,2015-02-23 22:41:14,2015-02-23 22:41:14
3,4,SMP00462,map00562,Inositol Phosphate Metabolism,2015-02-23 22:41:14,2015-02-23 22:41:14
4,5,SMP00012,map00350,Catecholamine Biosynthesis,2015-02-23 22:42:38,2015-02-23 22:42:38


In [34]:
nutrients = pd.read_sql_query('''SELECT n.* 
                            FROM nutrients n
                            ;''', engine)

2019-03-17 09:46:42,081 INFO sqlalchemy.engine.base.Engine SELECT n.* 
                            FROM nutrients n
                            ;
2019-03-17 09:46:42,091 INFO sqlalchemy.engine.base.Engine ()


In [40]:
nutrients

Unnamed: 0,id,legacy_id,type,public_id,name,export,state,annotation_quality,description,wikipedia_id,...,eafus_id,dfc_name,compound_source,metabolism,synthesis_citations,general_citations,creator_id,updater_id,created_at,updated_at
0,1,10930,Nutrient,FDBN00001,Fat,0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:42:10,2014-11-05 13:42:10
1,2,10946,Nutrient,FDBN00002,Proteins,0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:42:15,2014-11-05 13:42:15
2,3,16037,Nutrient,FDBN00003,Carbohydrate,0,,low,Carbohydrates (or saccharides) are organic com...,Carbohydrate,...,,,DUKE,,,,,,2014-11-05 13:44:06,2014-11-05 13:44:06
3,4,23404,Nutrient,FDBN00004,Fatty acids,0,,low,,,...,1335.0,,EAFUS,,,,,,2014-11-05 13:46:00,2014-11-05 13:46:00
4,5,11134,Nutrient,FDBN00005,Fiber (dietary),0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:47:36,2014-11-05 13:47:36
5,6,58893,Nutrient,FDBN00006,13:0,0,,low,,,...,,,TABLES,,,,,,2014-11-05 13:49:16,2014-11-05 13:49:16
6,7,58894,Nutrient,FDBN00007,14:1,0,,low,,,...,,,TABLES,,,,,,2014-11-05 13:49:19,2014-11-05 13:49:19
7,8,58895,Nutrient,FDBN00008,15:1,0,,low,,,...,,,TABLES,,,,,,2014-11-05 13:49:38,2014-11-05 13:49:38
8,9,58896,Nutrient,FDBN00009,16:1 c,0,,low,,,...,,,TABLES,,,,,,2014-11-05 13:49:52,2014-11-05 13:49:52
9,10,58897,Nutrient,FDBN00010,16:1 t,0,,low,,,...,,,TABLES,,,,,,2014-11-05 13:49:59,2014-11-05 13:49:59


In [36]:
nutrients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 23 columns):
id                     38 non-null object
legacy_id              38 non-null object
type                   38 non-null object
public_id              38 non-null object
name                   38 non-null object
export                 38 non-null object
state                  38 non-null object
annotation_quality     38 non-null object
description            38 non-null object
wikipedia_id           38 non-null object
comments               38 non-null object
dfc_id                 38 non-null object
duke_id                38 non-null object
eafus_id               38 non-null object
dfc_name               38 non-null object
compound_source        38 non-null object
metabolism             38 non-null object
synthesis_citations    38 non-null object
general_citations      38 non-null object
creator_id             38 non-null object
updater_id             38 non-null object
created_at     

In [39]:
get_value_from_cols(nutrients, 'NULL')

id 0
legacy_id 0
type 0
public_id 0
name 0
export 0
state 38
annotation_quality 0
description 37
wikipedia_id 37
comments 38
dfc_id 38
duke_id 33
eafus_id 37
dfc_name 38
compound_source 0
metabolism 38
synthesis_citations 38
general_citations 38
creator_id 38
updater_id 38
created_at 0
updated_at 0


In [11]:
references.head()

Unnamed: 0,id,ref_type,text,pubmed_id,link,title,creator_id,updater_id,created_at,updated_at,source_id,source_type
0,1,general,"Yannai, Shmuel. (2004) Dictionary of food comp...",,,,,,2015-02-23 22:39:47,2015-02-23 22:39:47,1,Compound
1,2,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:50,2015-02-23 22:39:50,22,Compound
2,3,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:50,2015-02-23 22:39:50,48,Compound
3,4,general,"de Villiers A, Vanhoenacker G, Majek P, Sandra...",,,,,,2015-02-23 22:39:52,2015-02-23 22:39:52,52,Compound
4,5,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:54,2015-02-23 22:39:54,59,Compound


In [12]:
references.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31791 entries, 0 to 31790
Data columns (total 12 columns):
id             31791 non-null object
ref_type       31791 non-null object
text           31791 non-null object
pubmed_id      31791 non-null object
link           31791 non-null object
title          31791 non-null object
creator_id     31791 non-null object
updater_id     31791 non-null object
created_at     31791 non-null object
updated_at     31791 non-null object
source_id      31791 non-null object
source_type    31791 non-null object
dtypes: object(12)
memory usage: 2.9+ MB


In [15]:
#14674 source ids in references
#references['source_id'].value_counts()
#Drop pubmed_id, link, title, creator_id, updater_id

In [81]:
foods[foods['name_scientific'] == 'Callinectes sapidus']

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,wikipedia_id_img,picture_content_type,picture_file_size,picture_updated_at,...,food_subgroup,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb
304,306,Blue crab,Callinectes sapidus,"Callinectes sapidus (from the Greek calli- = ""...",98696,Callinectes_sapidus,306.jpg,image/jpeg,69896,2012-04-20 09:32:04 UTC,...,Crustaceans,Type 1,2011-02-09 00:37:31 UTC,2017-03-20 21:26:27 UTC,,,False,specific,6763,True


## Foods table

In [78]:
foods = pd.read_sql_query('''SELECT f.* 
                            FROM foods f
                            ;''', engine)

2019-03-17 10:27:52,239 INFO sqlalchemy.engine.base.Engine SELECT f.* 
                            FROM foods f
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT f.* 
                            FROM foods f
                            ;


2019-03-17 10:27:52,241 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [27]:
foods.head()

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,wikipedia_id_img,picture_content_type,picture_file_size,picture_updated_at,...,food_subgroup,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,,Angelica,1.jpg,image/jpeg,111325,2012-04-20 09:29:57 UTC,...,Herbs,Type 1,2011-02-09 00:37:14 UTC,2017-06-27 17:13:48 UTC,,2.0,False,specific,357850.0,True
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,,Savoy cabbage,2.jpg,image/jpeg,155178,2012-04-20 09:39:54 UTC,...,Cabbages,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,1216010.0,True
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,,Tilia tomentosa,3.jpg,image/jpeg,56367,2012-04-20 09:41:25 UTC,...,Herbs,Type 1,2011-02-09 00:37:15 UTC,2015-10-02 21:24:39 UTC,,,False,specific,,True
3,4,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",506775.0,Kiwifruit,4.jpg,image/jpeg,110661,2012-04-20 09:32:21 UTC,...,Tropical fruits,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,3625.0,True
4,5,Allium (Onion),Allium,Allium haematochiton is a species of wild onio...,42634.0,Allium haematochiton,5.jpg,image/jpeg,341911,2012-04-20 09:37:44 UTC,...,Onion-family vegetables,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,4678.0,True


In [69]:
foods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 22 columns):
id                      907 non-null object
name                    907 non-null object
name_scientific         907 non-null object
description             907 non-null object
itis_id                 907 non-null object
wikipedia_id            907 non-null object
wikipedia_id_img        907 non-null object
picture_content_type    907 non-null object
picture_file_size       907 non-null object
picture_updated_at      907 non-null object
legacy_id               907 non-null object
food_group              907 non-null object
food_subgroup           907 non-null object
food_type               907 non-null object
created_at              907 non-null object
updated_at              907 non-null object
creator_id              907 non-null object
updater_id              907 non-null object
export_to_afcdb         907 non-null object
category                907 non-null object
ncbi_taxonomy_i

Columns to drop:

picture_updated_at	

created_at

updated_at

creator_id

updater_id

export_to_afcdb

export_to_foodb

In [24]:
foods.shape

(907, 22)

In [25]:
foods['name'].value_counts().sum(), foods['name'].value_counts().sum(), foods['id'].value_counts().sum()

(907, 907, 907)

### There are 907 foods in the database.

In [9]:
tables_columns['foods']

['id',
 'name',
 'name_scientific',
 'description',
 'itis_id',
 'wikipedia_id',
 'wikipedia_id_img',
 'picture_content_type',
 'picture_file_size',
 'picture_updated_at',
 'legacy_id',
 'food_group',
 'food_subgroup',
 'food_type',
 'created_at',
 'updated_at',
 'creator_id',
 'updater_id',
 'export_to_afcdb',
 'category',
 'ncbi_taxonomy_id',
 'export_to_foodb']

In [28]:
#alterante for food subgroup valuecounts
#pd.read_sql_query('''SELECT f.food_subgroup, 
#                        COUNT(f.name) foods
 #                       FROM foods f
 #                       GROUP BY f.food_subgroup
  #                      ORDER BY foods DESC
  #                  ;''', engine)

### Investigating Type 1 and Type 2

In [29]:
pd.read_sql_query('''SELECT f.food_subgroup, 
                        COUNT(f.food_subgroup) subgroup
                        FROM foods f
                        WHERE f.food_type == 'Type 1'
                        GROUP BY f.food_subgroup
                    ;''', engine)

2019-03-12 19:13:32,229 INFO sqlalchemy.engine.base.Engine SELECT f.food_subgroup, 
                        COUNT(f.food_subgroup) subgroup
                        FROM foods f
                        WHERE f.food_type == 'Type 1'
                        GROUP BY f.food_subgroup
                    ;
2019-03-12 19:13:32,230 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,food_subgroup,subgroup
0,Amphibians,1
1,Beans,14
2,Berries,55
3,Bovines,4
4,Cabbages,20
5,Caprae,1
6,Cereals,18
7,Cetaceans,3
8,Citrus,8
9,Cocoa,1


In [30]:
pd.read_sql_query('''SELECT f.food_subgroup, 
                        COUNT(f.food_subgroup) subgroup
                        FROM foods f
                        WHERE f.food_type == 'Type 2'
                        GROUP BY f.food_subgroup
                    ;''', engine)

2019-03-12 19:14:06,827 INFO sqlalchemy.engine.base.Engine SELECT f.food_subgroup, 
                        COUNT(f.food_subgroup) subgroup
                        FROM foods f
                        WHERE f.food_type == 'Type 2'
                        GROUP BY f.food_subgroup
                    ;
2019-03-12 19:14:06,830 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,food_subgroup,subgroup
0,Alcoholic beverages,2
1,American cuisine,3
2,Animal fats,4
3,Asian cuisine,2
4,Baby foods,1
5,Baking goods,13
6,Berber cuisine,1
7,Bread products,1
8,Candies,8
9,Cereal products,8


### Type 1 looks like unprocessed or unprepared foods. Type 2 foods are processed or prepared.

In [51]:
#foods[foods['food_subgroup'] == 'Marsupials']


In [None]:
#to drop: id 694 (subgroup Unclassified)
#id 685 (subgroup waters)

### Investigating NCBI Taxonomy ID

In [60]:
#Says no NA values but I see some in the head
foods['ncbi_taxonomy_id'].isna().sum()

0

In [72]:
foods['ncbi_taxonomy_id'].value_counts()b

           276
4072         6
3885         3
13427        3
135518       2
41679        2
4682         2
183260       2
190544       2
4679         2
3661         2
138011       2
37656        2
3714         2
16718        1
357850       1
34256        1
8113         1
66014        1
888065       1
9031         1
59166        1
37796        1
6565         1
9721         1
7797         1
167592       1
109171       1
2763         1
225387       1
          ... 
49390        1
33637        1
195615       1
397755       1
36066        1
8112         1
106975       1
36181        1
42229        1
9103         1
4071         1
334483       1
4558         1
32219        1
6763         1
29780        1
1042646      1
4039         1
119950       1
7935         1
8017         1
3891         1
13493        1
30850        1
485725       1
103480       1
24663        1
403101       1
4681         1
66656        1
Name: ncbi_taxonomy_id, Length: 613, dtype: int64

In [57]:
foods[foods['ncbi_taxonomy_id'] == '4072']

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,wikipedia_id_img,picture_content_type,picture_file_size,picture_updated_at,...,food_subgroup,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb
39,40,Pepper (C. annuum),Capsicum annuum,<i>Capsicum annuum</i> is a domesticated speci...,30492.0,Capsicum_annuum,40.jpg,image/jpeg,45222,2012-04-20 09:37:27 UTC,...,Fruit vegetables,Type 1,2011-02-09 00:37:17 UTC,2017-03-20 21:26:23 UTC,,2.0,False,specific,4072,True
891,909,Green bell pepper,Capsicum annuum,Green bell peppers are members of the domestic...,30492.0,Capsicum_annuum,428px-Green-Bell-Pepper.jpg,image/jpeg,26784,2015-02-26 18:37:27 UTC,...,Fruit vegetables,Type 1,2015-02-26 18:37:27 UTC,2017-03-20 21:26:33 UTC,,,True,specific,4072,True
892,910,Yellow bell pepper,Capsicum annuum,Yellow bell peppers are members of the domesti...,30492.0,Capsicum_annuum,bell_pepper_yellow_2.png,image/png,25165,2015-02-26 18:42:45 UTC,...,Fruit vegetables,Type 1,2015-02-26 18:42:45 UTC,2017-03-20 21:26:34 UTC,,,True,specific,4072,True
893,911,Orange bell pepper,Capsicum annuum,Orange bell peppers are members of the domesti...,30492.0,Capsicum_annuum,orange-bell-pepper-02.jpg,image/jpeg,6881,2015-02-26 18:44:18 UTC,...,Fruit vegetables,Type 1,2015-02-26 18:44:18 UTC,2017-03-20 21:26:34 UTC,,,True,specific,4072,True
894,912,Red bell pepper,Capsicum annuum,Red bell peppers are members of the domesticat...,30492.0,Capsicum_annuum,red-bell-pepper.jpg,image/jpeg,76376,2015-02-26 18:47:15 UTC,...,Fruit vegetables,Type 1,2015-02-26 18:47:16 UTC,2017-03-20 21:26:34 UTC,,,True,specific,4072,True
895,913,Italian sweet red pepper,Capsicum annuum,The Italian sweet pepper is a variety of the s...,,Italian sweet pepper,939px-Italian_sweet_peppers.jpg,image/jpeg,124244,2015-02-26 18:54:22 UTC,...,Fruit vegetables,Type 1,2015-02-26 18:54:23 UTC,2017-03-20 21:26:34 UTC,,,True,specific,4072,True


### Some ncbis are repeated but all foods seem to have one.

### Investigating Category

In [74]:
foods['category'].value_counts()

specific    881
generic      26
Name: category, dtype: int64

In [100]:
generics = list(foods['name'][foods['category'] == 'generic'])
subgroups = list(set(foods['food_subgroup']))

In [106]:
len(generics)

26

In [103]:
generic_subgroups = []

for sub in subgroups:
    if any(generic in sub for generic in generics):
        generic_subgroups.append(sub)

In [109]:
len(set(generic_subgroups))

27

### The "generic" category is just 26 of the subgroups with a bit more information about each.

[(Back to top)](#top)

# Exploratory data analysis


<a id = 'eda_enzymes'></a>

## Enzymes table

In [126]:
enzymes = pd.read_sql_query('''SELECT e.* 
                            FROM enzymes e
                            ;''', engine)

2019-03-16 12:41:30,408 INFO sqlalchemy.engine.base.Engine SELECT e.* 
                            FROM enzymes e
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT e.* 
                            FROM enzymes e
                            ;


2019-03-16 12:41:30,408 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [119]:
enzymes.shape

(1744, 32)

In [125]:
enzymes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1744 entries, 0 to 1743
Data columns (total 32 columns):
id                       1744 non-null object
name                     1744 non-null object
gene_name                1744 non-null object
description              1744 non-null object
go_classification        1744 non-null object
general_function         1744 non-null object
specific_function        1744 non-null object
pathway                  1744 non-null object
reaction                 1744 non-null object
cellular_location        1744 non-null object
signals                  1744 non-null object
transmembrane_regions    1744 non-null object
molecular_weight         1744 non-null object
theoretical_pi           1744 non-null object
locus                    1744 non-null object
chromosome               1744 non-null object
uniprot_name             1744 non-null object
uniprot_id               1744 non-null object
pdb_id                   1744 non-null object
genbank_protein_id 

In [137]:
enzymes['name'].value_counts().sum(), enzymes['gene_name'].value_counts().sum()

(1744, 1744)

### 1744 unique names and gene names. Linked to compounds.

### How many null values?

In [112]:
enzymes.head(3)

Unnamed: 0,id,name,gene_name,description,go_classification,general_function,specific_function,pathway,reaction,cellular_location,...,genatlas_id,hgnc_id,hprd_id,organism,general_citations,comments,creator_id,updater_id,created_at,updated_at
0,1,UDP-glucuronosyltransferase 2B28,UGT2B28,,,,,,,,...,,,,,,,,,2015-02-23 22:40:56,2015-02-23 22:40:56
1,2,Estrogen receptor beta,ESR2,,,,,,,,...,,,,,,,,,2015-02-23 22:40:57,2015-02-23 22:40:57
2,3,UDP-glucuronosyltransferase 2B4,UGT2B4,,,,,,,,...,,,,,,,,,2015-02-23 22:40:57,2015-02-23 22:40:57


In [117]:
null_enzyme_descriptions = enzymes['description'] == 'NULL'

In [118]:
null_enzyme_descriptions.sum()

1744

In [121]:
enzymes[enzymes['description'] == 'NULL'].shape

(1744, 32)

In [122]:
enzymes[enzymes['go_classification'] == 'NULL'].shape

(1744, 32)

In [123]:
enzymes[enzymes['general_function'] == 'NULL'].shape

(1744, 32)

In [124]:
enzymes[enzymes['specific_function'] == 'NULL'].shape

(1744, 32)

In [131]:
for column in enzymes.columns:
    print(column, enzymes[enzymes[column] == 'NULL'].shape[0])

id 0
name 0
gene_name 0
description 1744
go_classification 1744
general_function 1744
specific_function 1744
pathway 1744
reaction 1744
cellular_location 1744
signals 1744
transmembrane_regions 1744
molecular_weight 1744
theoretical_pi 1744
locus 1744
chromosome 1744
uniprot_name 1744
uniprot_id 0
pdb_id 1744
genbank_protein_id 1744
genbank_gene_id 1744
genecard_id 1744
genatlas_id 1744
hgnc_id 1744
hprd_id 1744
organism 1744
general_citations 1744
comments 1744
creator_id 1744
updater_id 1744
created_at 0
updated_at 0


### Drop all columns except id, name, and gene_name.

In [136]:
enzymes['gene_name'].value_counts().sum()

1744

[(Back to top)](#top)
# Exploratory data analysis

<a id = 'eda_flavors'></a>

## flavors table

In [155]:
flavors = pd.read_sql_query('''SELECT fl.* 
                            FROM flavors fl
                            ;''', engine)

2019-03-16 13:00:30,081 INFO sqlalchemy.engine.base.Engine SELECT fl.* 
                            FROM flavors fl
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT fl.* 
                            FROM flavors fl
                            ;


2019-03-16 13:00:30,083 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [146]:
flavors.head()

Unnamed: 0,id,name,flavor_group,category,created_at,updated_at,creator_id,updater_id
0,1,celery,vegetable,odor,2011-10-02 06:10:04,2011-10-02 06:10:04,,
1,2,corn,vegetable,odor,2011-10-02 06:10:04,2011-10-02 06:10:04,,
2,3,cucumber,vegetable,odor,2011-10-02 06:10:04,2011-10-02 06:10:04,,
3,4,horseradish,vegetable,odor,2011-10-02 06:10:05,2011-10-02 06:10:05,,
4,5,vegetable,vegetable,odor,2011-10-02 06:10:05,2011-10-02 06:10:05,,


In [151]:
flavors['flavor_group'].value_counts()

NULL             743
fruity            24
floral            14
balsamic          10
vegetable          8
fatty              7
wine_like          5
herbaceous         5
citrus             5
nutty              5
woody              4
chemical           3
meaty              3
green              3
earthy             3
spicy              2
musky              1
tobacco            1
smoky              1
minty              1
medicinal          1
camphoraceous      1
mossy              1
fishy              1
pepper             1
seedy              1
animal             1
soapy              1
Name: flavor_group, dtype: int64

In [153]:
flavors['category'].value_counts()

odor    856
Name: category, dtype: int64

### Drop all columns except id, name, and flavor_group
### Will have to see later how compounds are connected to these descriptions of flavor. Disappointed that there are 856 unique descriptors ('name') and very few useful flavor_group designations.

[(Back to top)](#top)
# Exploratory Data Analysis


<a id = 'eda_health_effects'></a>

## Health effects table

In [194]:
health_effects = pd.read_sql_query('''SELECT he.* 
                            FROM health_effects he
                            ;''', engine)

2019-03-16 13:25:32,755 INFO sqlalchemy.engine.base.Engine SELECT he.* 
                            FROM health_effects he
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT he.* 
                            FROM health_effects he
                            ;


2019-03-16 13:25:32,755 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [158]:
health_effects.shape

(1435, 10)

In [159]:
health_effects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1435 entries, 0 to 1434
Data columns (total 10 columns):
id                  1435 non-null object
name                1435 non-null object
description         1435 non-null object
chebi_name          1435 non-null object
chebi_id            1435 non-null object
created_at          1435 non-null object
updated_at          1435 non-null object
creator_id          1435 non-null object
updater_id          1435 non-null object
chebi_definition    1435 non-null object
dtypes: object(10)
memory usage: 112.2+ KB


In [160]:
health_effects.head()

Unnamed: 0,id,name,description,chebi_name,chebi_id,created_at,updated_at,creator_id,updater_id,chebi_definition
0,1,(+)-inotropic,An agent that alters the force or energy of mu...,,,2012-04-10 10:05:13,2015-11-11 02:32:19,,,
1,2,(-)-chronotropic,An agent that may change theÂ heartÂ rate by a...,,,2012-04-10 10:05:13,2015-11-11 02:32:19,,,
2,3,(-)-inotropic,An agent that alters the force or energy of mu...,,,2012-04-10 10:05:13,2015-11-11 02:32:19,,,
3,4,11beta-hydroxysteroid-dehydrogenase inhibitor,,enzyme inhibitor,23924.0,2012-04-10 10:05:13,2015-10-27 17:47:35,,,A compound or agent that combines with an enzy...
4,5,12-lipoxygenase inhibitor,,enzyme inhibitor,23924.0,2012-04-10 10:05:13,2015-10-27 17:47:36,,,A compound or agent that combines with an enzy...


In [161]:
for column in health_effects.columns:
    print(column, health_effects[health_effects[column] != 'NULL'].shape[0])

id 1435
name 1435
description 619
chebi_name 710
chebi_id 710
created_at 1435
updated_at 1435
creator_id 0
updater_id 0
chebi_definition 698


In [162]:
#Are the rows with a non-null chebi_id the same as the rows with a non-null chebi_name?
chebi_ids = health_effects[health_effects['chebi_id'] != 'NULL']

In [165]:
chebi_name = health_effects[health_effects['chebi_name'] != 'NULL']

In [167]:
trues = chebi_ids == chebi_name

### Drop created_at, updated_at, creator_id, updater_id

[(Back to top)](#top)
# Exploratory Data Analysis

<a id = 'eda_nutrients'></a>

## Nutrients table

In [55]:
nutrients = pd.read_sql_query('''SELECT n.* 
                            FROM nutrients n
                            ;''', engine)

2019-03-15 13:29:47,268 INFO sqlalchemy.engine.base.Engine SELECT n.* 
                            FROM nutrients n
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT n.* 
                            FROM nutrients n
                            ;


2019-03-15 13:29:47,268 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [67]:
nutrients.head()

Unnamed: 0,id,legacy_id,type,public_id,name,export,state,annotation_quality,description,wikipedia_id,...,eafus_id,dfc_name,compound_source,metabolism,synthesis_citations,general_citations,creator_id,updater_id,created_at,updated_at
0,1,10930,Nutrient,FDBN00001,Fat,0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:42:10,2014-11-05 13:42:10
1,2,10946,Nutrient,FDBN00002,Proteins,0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:42:15,2014-11-05 13:42:15
2,3,16037,Nutrient,FDBN00003,Carbohydrate,0,,low,Carbohydrates (or saccharides) are organic com...,Carbohydrate,...,,,DUKE,,,,,,2014-11-05 13:44:06,2014-11-05 13:44:06
3,4,23404,Nutrient,FDBN00004,Fatty acids,0,,low,,,...,1335.0,,EAFUS,,,,,,2014-11-05 13:46:00,2014-11-05 13:46:00
4,5,11134,Nutrient,FDBN00005,Fiber (dietary),0,,low,,,...,,,DUKE,,,,,,2014-11-05 13:47:36,2014-11-05 13:47:36


In [60]:
for column in nutrients.columns:
    print(column, nutrients[nutrients[column] == 'NULL'].shape[0])

id 0
legacy_id 0
type 0
public_id 0
name 0
export 0
state 38
annotation_quality 0
description 37
wikipedia_id 37
comments 38
dfc_id 38
duke_id 33
eafus_id 37
dfc_name 38
compound_source 0
metabolism 38
synthesis_citations 38
general_citations 38
creator_id 38
updater_id 38
created_at 0
updated_at 0


### Drop whole table.

[(Back to top)](#top)
# Exploratory data analysis

<a id = 'eda_pathways'></a>

## Pathways table

In [220]:
pathways = pd.read_sql_query('''SELECT pa.* 
                            FROM pathways pa
                            ;''', engine)

2019-03-16 13:39:36,133 INFO sqlalchemy.engine.base.Engine SELECT pa.* 
                            FROM pathways pa
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT pa.* 
                            FROM pathways pa
                            ;


2019-03-16 13:39:36,134 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [69]:
pathways.head(20)

Unnamed: 0,id,smpdb_id,kegg_map_id,name,created_at,updated_at
0,1,SMP00006,map00350,Tyrosine Metabolism,2015-02-23 22:40:48,2015-02-23 22:40:48
1,2,SMP00068,map00150,Androgen and Estrogen Metabolism,2015-02-23 22:40:58,2015-02-23 22:40:58
2,3,SMP00011,map00562,Inositol Metabolism,2015-02-23 22:41:14,2015-02-23 22:41:14
3,4,SMP00462,map00562,Inositol Phosphate Metabolism,2015-02-23 22:41:14,2015-02-23 22:41:14
4,5,SMP00012,map00350,Catecholamine Biosynthesis,2015-02-23 22:42:38,2015-02-23 22:42:38
5,6,SMP00008,map00360,Phenylalanine and Tyrosine Metabolism,2015-02-23 22:42:38,2015-02-23 22:42:38
6,7,SMP00019,,Transcription/Translation,2015-02-23 22:42:38,2015-02-23 22:42:38
7,8,SMP00028,map00232,Caffeine Metabolism,2015-02-23 22:42:49,2015-02-23 22:42:49
8,9,SMP00037,map00310,Lysine Degradation,2015-02-23 22:42:58,2015-02-23 22:42:58
9,10,SMP00016,map00640,Propanoate Metabolism,2015-02-23 22:43:04,2015-02-23 22:43:04


### Drop created_at, updated_at

[(Back to top)](#top)
# Exploratory data analysis

<a id = 'eda_ref'></a>

## References table

In [79]:
references = pd.read_sql_query('''SELECT r.* 
                            FROM [references] r
                            ;''', engine)

2019-03-15 13:42:27,331 INFO sqlalchemy.engine.base.Engine SELECT r.* 
                            FROM [references] r
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT r.* 
                            FROM [references] r
                            ;


2019-03-15 13:42:27,333 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [78]:
table_names

['compound_alternate_parents',
 'compound_external_descriptors',
 'compound_substituents',
 'compound_synonyms',
 'compounds',
 'compounds_enzymes',
 'compounds_flavors',
 'compounds_health_effects',
 'compounds_pathways',
 'enzymes',
 'flavors',
 'food_taxonomies',
 'foodcomex_compound_providers',
 'foodcomex_compounds',
 'foods',
 'foods_copy',
 'health_effects',
 'nutrients',
 'pathways',
 'references']

In [80]:
references.head()

Unnamed: 0,id,ref_type,text,pubmed_id,link,title,creator_id,updater_id,created_at,updated_at,source_id,source_type
0,1,general,"Yannai, Shmuel. (2004) Dictionary of food comp...",,,,,,2015-02-23 22:39:47,2015-02-23 22:39:47,1,Compound
1,2,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:50,2015-02-23 22:39:50,22,Compound
2,3,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:50,2015-02-23 22:39:50,48,Compound
3,4,general,"de Villiers A, Vanhoenacker G, Majek P, Sandra...",,,,,,2015-02-23 22:39:52,2015-02-23 22:39:52,52,Compound
4,5,general,"Neveu V, Perez-Jimenez J, Vos F, Crespy V, du ...",,,,,,2015-02-23 22:39:54,2015-02-23 22:39:54,59,Compound


In [81]:
references.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31791 entries, 0 to 31790
Data columns (total 12 columns):
id             31791 non-null object
ref_type       31791 non-null object
text           31791 non-null object
pubmed_id      31791 non-null object
link           31791 non-null object
title          31791 non-null object
creator_id     31791 non-null object
updater_id     31791 non-null object
created_at     31791 non-null object
updated_at     31791 non-null object
source_id      31791 non-null object
source_type    31791 non-null object
dtypes: object(12)
memory usage: 2.9+ MB


### Drop table.

[(Back to top)](#top)
# Exploratory data analysis

<a id = 'eda_food_tax'></a>

## food taxonomies table

In [82]:
food_taxonomies = pd.read_sql_query('''SELECT ft.* 
                            FROM food_taxonomies ft
                            ;''', engine)

2019-03-15 13:45:43,423 INFO sqlalchemy.engine.base.Engine SELECT ft.* 
                            FROM food_taxonomies ft
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT ft.* 
                            FROM food_taxonomies ft
                            ;


2019-03-15 13:45:43,425 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [83]:
food_taxonomies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 7 columns):
id                      919 non-null object
food_id                 919 non-null object
ncbi_taxonomy_id        919 non-null object
classification_name     919 non-null object
classification_order    919 non-null object
created_at              919 non-null object
updated_at              919 non-null object
dtypes: object(7)
memory usage: 50.3+ KB


In [87]:
food_taxonomies.head()

Unnamed: 0,id,food_id,ncbi_taxonomy_id,classification_name,classification_order,created_at,updated_at
0,1,1,357850,\Eukaryota\,1,2017-03-29 18:35:53,2017-03-29 18:35:53
1,2,1,357850,\Viridiplantae\,2,2017-03-29 18:35:53,2017-03-29 18:35:53
2,3,1,357850,\Streptophyta\,3,2017-03-29 18:35:53,2017-03-29 18:35:53
3,4,1,357850,\Embryophyta\,4,2017-03-29 18:35:53,2017-03-29 18:35:53
4,5,1,357850,\Tracheophyta\,5,2017-03-29 18:35:53,2017-03-29 18:35:53


In [93]:
food_taxonomies[food_taxonomies['classification_order'] == '16']

Unnamed: 0,id,food_id,ncbi_taxonomy_id,classification_name,classification_order,created_at,updated_at
15,16,1,357850,\apioid superclade\,16,2017-03-29 18:35:53,2017-03-29 18:35:53
33,34,19,72341,\Anthemideae\,16,2017-03-29 18:35:54,2017-03-29 18:35:54
51,52,37,3821,\Phaseoleae\,16,2017-03-29 18:35:54,2017-03-29 18:35:54
68,69,54,2708,\Citrus\,16,2017-03-29 18:35:54,2017-03-29 18:35:54
84,85,70,66014,\Cymbopogon\,16,2017-03-29 18:35:54,2017-03-29 18:35:54
100,101,86,4232,\Heliantheae alliance\,16,2017-03-29 18:35:55,2017-03-29 18:35:55
118,119,104,3869,\Genisteae\,16,2017-03-29 18:35:55,2017-03-29 18:35:55
135,136,121,4146,\Olea\,16,2017-03-29 18:35:55,2017-03-29 18:35:55
151,152,137,271192,\apioid superclade\,16,2017-03-29 18:35:56,2017-03-29 18:35:56
182,183,168,49988,\Mentheae\,16,2017-03-29 18:35:56,2017-03-29 18:35:56


In [99]:
food_taxonomies['food_id'].value_counts()

19     18
137    18
249    18
849    18
86     18
1      18
168    17
342    17
104    17
466    17
201    17
436    17
37     17
909    17
814    17
403    17
879    17
887    16
797    16
527    16
599    16
54     16
121    16
786    16
70     16
799    16
185    16
420    16
218    16
748    16
       ..
388    15
483    15
359    15
316    15
723    15
622    15
513    14
801    14
726    14
374    14
302    14
571    14
585    14
267    13
453    13
880    13
289    13
155    13
548    13
798    13
802    11
331    11
863    11
561    10
280     9
756     8
617     5
861     5
544     5
615     2
Name: food_id, Length: 64, dtype: int64

### Disappointed to discover that the full taxonomy is apparently only available for 64 foods in the foods table.

### Could be better organized to have a column for each level of the taxonomy, and a row for each food_id.

### Drop id, created_at, updated_at. 

Keep food_id INTEGER foreign key, ncbi_taxonomy_id INTEGER, classification_name TEXT, classification_order INTEGER

<a id = 'test_query_ft'></a>

### Test query for food taxonomies to see how they'll look

In [119]:
pd.read_sql_query('''SELECT f.name, f.name_scientific,
                            ft.classification_name, 
                            ft.classification_order
                        FROM food_taxonomies ft
                        JOIN foods f ON f.id == ft.food_id
                        WHERE ft.food_id == 756

                        ORDER BY ft.classification_order
                    ;''', engine)

2019-03-15 14:18:28,726 INFO sqlalchemy.engine.base.Engine SELECT f.name, f.name_scientific,
                            ft.classification_name, 
                            ft.classification_order
                        FROM food_taxonomies ft
                        JOIN foods f ON f.id == ft.food_id
                        WHERE ft.food_id == 756

                        ORDER BY ft.classification_order
                    ;


INFO:sqlalchemy.engine.base.Engine:SELECT f.name, f.name_scientific,
                            ft.classification_name, 
                            ft.classification_order
                        FROM food_taxonomies ft
                        JOIN foods f ON f.id == ft.food_id
                        WHERE ft.food_id == 756

                        ORDER BY ft.classification_order
                    ;


2019-03-15 14:18:28,736 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


Unnamed: 0,name,name_scientific,classification_name,classification_order
0,Morchella (Morel),Morchellaceae,\Eukaryota\,1
1,Morchella (Morel),Morchellaceae,\Fungi\,2
2,Morchella (Morel),Morchellaceae,\Dikarya\,3
3,Morchella (Morel),Morchellaceae,\Ascomycota\,4
4,Morchella (Morel),Morchellaceae,\Pezizomycotina\,5
5,Morchella (Morel),Morchellaceae,\Pezizomycetes\,6
6,Morchella (Morel),Morchellaceae,\Pezizales\,7
7,Morchella (Morel),Morchellaceae,\Morchellaceae\,8


In [None]:
#TEST 
UPDATE food_taxonomies
SET (order_01) = (SELECT LAG (order_01) OVER (order_01)
FROM food_taxonomies
CASE food_id WHEN (LAG(food_id) OVER (food_id) == food_id);

In [None]:
#THIS WORKS
UPDATE food_taxonomies
SET order_02 = '\Viridiplantae\'
WHERE food_id = 1;

In [None]:
#THIS WORKS 
UPDATE food_taxonomies
SET order_03 = (SELECT lag ( order_03, 1) OVER (ORDER BY order_03)
				FROM food_taxonomies
				WHERE food_id = 1)
				
WHERE food_id = 1;b

In [None]:
## Runs but sets all rows to null

UPDATE food_taxonomies
SET order_03 = (SELECT 
					CASE order_03 WHEN (lag ( order_03, 1) OVER (ORDER BY order_03) != NULL)
						THEN lag ( order_03, 1) OVER (ORDER BY order_03)
					END
				FROM food_taxonomies
				WHERE food_id = 1)
WHERE food_id = 1;



In [None]:
## Runs but doesn't do anything to any row

UPDATE food_taxonomies
SET order_03 = (SELECT 
					CASE order_03 WHEN (lag ( order_03, 1) OVER (ORDER BY order_03) != NULL)
						THEN lag ( order_03, 1) OVER (ORDER BY order_03)
					END
				FROM food_taxonomies
				WHERE food_id = 1)
WHERE food_id = 1
AND order_03 = NULL;

In [None]:
# Runs but doesn't do anything to any row
UPDATE food_taxonomies
SET order_03 = (SELECT lag ( order_03, 1) OVER (ORDER BY order_03)
				FROM food_taxonomies
				WHERE food_id = 1)
WHERE food_id = 1
AND order_03 = NULL;

In [None]:
#Sets all rows to Null
UPDATE food_taxonomies
SET order_03 = (SELECT CASE order_03
				WHEN order_03 == NULL
				THEN lag ( order_03, 1) OVER (ORDER BY order_03)
				END
				FROM food_taxonomies);

In [None]:
UPDATE food_taxonomies
SET order_03 = (SELECT CASE order_03
				WHEN order_03 == NULL
					AND (lag ( order_03, 1) OVER (ORDER BY order_03)) != NULL
				THEN lag ( order_03, 1) OVER (ORDER BY order_03)
				END
				FROM food_taxonomies);

In [None]:
#BREAKTHROUGH. Doesn't erase rows with data. Seems to set 845 out of 919 rows to null.
UPDATE food_taxonomies
SET order_03 = (SELECT lag ( order_03, 1) OVER (ORDER BY order_03)
				FROM food_taxonomies)
WHERE order_03 IS NULL;

In [None]:
#Does same as above.
#It knows to skip the row is isn't null, but it's inputting a Null value instead of the lag value.
UPDATE food_taxonomies
SET order_03 = (SELECT CASE order_03
					WHEN (lag ( order_03, 1) OVER (ORDER BY order_03)) IS NOT NULL
					THEN lag ( order_03, 1) OVER (ORDER BY order_03)
					END
					FROM food_taxonomies)
WHERE order_03 IS NULL;

In [None]:
Should I test just getting a lag value?

In [None]:
#This returns them but in the wrong order
SELECT order_03, lag ( order_03, 1, NULL) OVER (ORDER BY order_03)
FROM food_taxonomies
WHERE order_03 IS NOT NULL

In [None]:
#This returns them in the right order
SELECT lag ( order_03, 1, NULL) OVER (ORDER BY food_id)
FROM food_taxonomies
WHERE order_03 IS NOT NULL

In [None]:
[first line is null]
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Lophotrochozoa\
\Streptophyta\
\Chordata\
\Chordata\
\Streptophyta\
\Chordata\
\Ecdysozoa\
\Streptophyta\
\Chordata\
\Chordata\
\Streptophyta\
\Streptophyta\
\Chordata\
\Streptophyta\
\Chordata\
\Chordata\
\Chordata\
\Chordata\
\Lophotrochozoa\
\Chordata\
\Dikarya\
\Chordata\
\Chordata\
\Chordata\
\Lophotrochozoa\
\Streptophyta\
\Chordata\
\Chordata\
\Streptophyta\
\Dikarya\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Chordata\
\Streptophyta\
\Streptophyta\
\Streptophyta\
\Streptophyta\

In [None]:
#This returns 854 blanks. It skips the rows with content.
Why doesn't it print the content in the row when the lag has content?

SELECT lag ( order_03, 1, NULL) OVER (ORDER BY food_id)
FROM food_taxonomies
WHERE order_03 IS NULL

[(Back to top)](#top)
# Exploratory data analysis

<a id = 'eda_compounds'></a>

## compounds table

In [7]:
compounds = pd.read_sql_query('''SELECT c.* 
                            FROM compounds c
                            ;''', engine)

2019-03-16 11:06:04,975 INFO sqlalchemy.engine.base.Engine SELECT c.* 
                            FROM compounds c
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT c.* 
                            FROM compounds c
                            ;


2019-03-16 11:06:04,975 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [12]:
compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28771 entries, 0 to 28770
Columns: 106 entries, id to phytohub_id
dtypes: object(106)
memory usage: 23.3+ MB


In [15]:
for column in compounds.columns:
    print(column, compounds[compounds[column] != ''].shape[0])

id 28771
legacy_id 19331
type 28771
public_id 28771
name 28771
export 28771
state 9169
annotation_quality 27471
description 23292
cas_number 18213
melting_point 6926
protein_formula 0
protein_weight 0
experimental_solubility 886
experimental_logp 890
hydrophobicity 0
isoelectric_point 0
metabolism 1
kegg_compound_id 6651
pubchem_compound_id 17854
pubchem_substance_id 5
chebi_id 2530
het_id 974
uniprot_id 0
uniprot_name 0
genbank_id 0
wikipedia_id 4502
synthesis_citations 0
general_citations 171
comments 476
protein_structure_file_name 0
protein_structure_content_type 0
protein_structure_file_size 0
protein_structure_updated_at 0
msds_file_name 1453
msds_content_type 1453
msds_file_size 1453
msds_updated_at 1453
creator_id 76
updater_id 6568
created_at 28771
updated_at 28771
phenolexplorer_id 777
dfc_id 15059
hmdb_id 20760
duke_id 6193
drugbank_id 693
bigg_id 876
eafus_id 2950
knapsack_id 4814
boiling_point 2304
boiling_point_reference 2217
charge 64
charge_reference 0
density 773
densi

In [44]:
compound_columns_to_drop = []

In [45]:
for column in compounds.columns:
    if compounds[compounds[column] != ''].shape[0] < 23000:
        compound_columns_to_drop.append(column)

In [46]:
len(compound_columns_to_drop)

83

### Dropped almost all columns

# Exploratory data analysis
## compounds enzymes table

In [152]:
compounds_enzymes = pd.read_sql_query('''SELECT ce.* 
                            FROM compounds_enzymes ce
                            ;''', engine)

2019-03-16 12:58:57,770 INFO sqlalchemy.engine.base.Engine SELECT ce.* 
                            FROM compounds_enzymes ce
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT ce.* 
                            FROM compounds_enzymes ce
                            ;


2019-03-16 12:58:57,772 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [153]:
compounds_enzymes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105089 entries, 0 to 105088
Data columns (total 3 columns):
id             105089 non-null int64
compound_id    105089 non-null int64
enzyme_id      105089 non-null int64
dtypes: int64(3)
memory usage: 2.4 MB


In [137]:
compounds_enzymes.head()

Unnamed: 0,id,compound_id,enzyme_id,citations,created_at,updated_at,creator_id,updater_id
0,1,362,1,HMDB,2015-02-23 22:40:56,2015-02-23 22:40:56,,
1,2,362,2,HMDB,2015-02-23 22:40:57,2015-02-23 22:40:57,,
2,3,362,3,HMDB,2015-02-23 22:40:57,2015-02-23 22:40:57,,
3,4,362,4,HMDB,2015-02-23 22:40:57,2015-02-23 22:40:57,,
4,5,362,5,HMDB,2015-02-23 22:40:57,2015-02-23 22:40:57,,


### A given compound id may correspond to many enzyme_ids. This seems okay. The enzymes table has the names of all the enzymes corresponding to these ids. Need to set foreign key for enzyme_id.

### To drop: everything except id INTEGER PRIMARY KEY, compound_id INTEGER, enzyme id INTEGER FOREIGN KEY.

# Exploratory data analysis
## compounds flavors table

In [151]:
compounds_flavors = pd.read_sql_query('''SELECT cf.* 
                            FROM compounds_flavors cf
                            ;''', engine)

2019-03-16 12:58:22,986 INFO sqlalchemy.engine.base.Engine SELECT cf.* 
                            FROM compounds_flavors cf
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT cf.* 
                            FROM compounds_flavors cf
                            ;


2019-03-16 12:58:22,996 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [154]:
compounds_flavors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11613 entries, 0 to 11612
Data columns (total 10 columns):
id             11613 non-null object
compound_id    11613 non-null object
flavor_id      11613 non-null object
citations      11613 non-null object
created_at     11613 non-null object
updated_at     11613 non-null object
creator_id     11613 non-null object
updater_id     11613 non-null object
source_id      11613 non-null object
source_type    11613 non-null object
dtypes: object(10)
memory usage: 907.3+ KB


In [159]:
compounds_flavors.head(3)

Unnamed: 0,id,compound_id,flavor_id,citations,created_at,updated_at,creator_id,updater_id,source_id,source_type
0,1,11947,159,"# Arn, H, Acree TE. ���Flavornet: A database o...",2011-10-02 06:30:05 UTC,2015-10-23 23:29:45 UTC,,,11947,Compound
1,2,8298,213,"# Arn, H, Acree TE. ���Flavornet: A database o...",2011-10-02 06:30:05 UTC,2015-10-23 23:29:56 UTC,,,8298,Compound
2,3,8298,156,"# Arn, H, Acree TE. ���Flavornet: A database o...",2011-10-02 06:30:05 UTC,2015-10-23 23:29:45 UTC,,,8298,Compound


In [183]:
#test query to see how it will look to bring together compounds and flavors
pd.read_sql_query('''SELECT fl.name
                        FROM compounds_flavors cf
                        JOIN flavors fl ON fl.id = cf.flavor_id
                        WHERE cf.compound_id == '11947'
                    ;''', engine)

2019-03-16 13:14:17,294 INFO sqlalchemy.engine.base.Engine SELECT fl.name
                        FROM compounds_flavors cf
                        JOIN flavors fl ON fl.id = cf.flavor_id
                        WHERE cf.compound_id == '11947'
                    ;


INFO:sqlalchemy.engine.base.Engine:SELECT fl.name
                        FROM compounds_flavors cf
                        JOIN flavors fl ON fl.id = cf.flavor_id
                        WHERE cf.compound_id == '11947'
                    ;


2019-03-16 13:14:17,294 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


Unnamed: 0,name
0,fish
1,fishy
2,oily
3,rancid
4,sweaty
5,fruity


### 2816 compound_ids, 716 flavor ids.

### Only keep compound_id FOREIGN KEY, flavor_id FOREIGN KEy.

# Exploratory data analysis
## compounds health effects table

In [195]:
compounds_health_effects = pd.read_sql_query('''SELECT che.* 
                            FROM compounds_health_effects che
                            ;''', engine)

2019-03-16 13:25:55,526 INFO sqlalchemy.engine.base.Engine SELECT che.* 
                            FROM compounds_health_effects che
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT che.* 
                            FROM compounds_health_effects che
                            ;


2019-03-16 13:25:55,526 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [197]:
compounds_health_effects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11091 entries, 0 to 11090
Data columns (total 14 columns):
id                         11091 non-null object
compound_id                11091 non-null object
health_effect_id           11091 non-null object
orig_health_effect_name    11091 non-null object
orig_compound_name         11091 non-null object
orig_citation              11091 non-null object
citation                   11091 non-null object
citation_type              11091 non-null object
created_at                 11091 non-null object
updated_at                 11091 non-null object
creator_id                 11091 non-null object
updater_id                 11091 non-null object
source_id                  11091 non-null object
source_type                11091 non-null object
dtypes: object(14)
memory usage: 1.2+ MB


In [198]:
compounds_health_effects.head()

Unnamed: 0,id,compound_id,health_effect_id,orig_health_effect_name,orig_compound_name,orig_citation,citation,citation_type,created_at,updated_at,creator_id,updater_id,source_id,source_type
0,1,453,1,(+)-Inotropic,THEOPHYLLINE,,DUKE,DATABASE,4/10/12 14:16,11/5/14 14:49,,,453,Compound
1,2,2100,1,(+)-Inotropic,CAFFEINE,,DUKE,DATABASE,4/10/12 14:16,11/5/14 14:49,,,2100,Compound
2,3,9030,2,(-)-Chronotropic,BORNYL-ACETATE,,DUKE,DATABASE,4/10/12 14:16,11/5/14 14:49,,,9030,Compound
3,4,13577,2,(-)-Chronotropic,BORNEOL,,DUKE,DATABASE,4/10/12 14:16,11/5/14 14:49,,,13577,Compound
4,5,14619,2,(-)-Chronotropic,"1,8-CINEOLE",,DUKE,DATABASE,4/10/12 14:16,11/5/14 14:49,,,14619,Compound


In [207]:
compounds[compounds['id'] == 14619]

Unnamed: 0,id,name,description,moldb_formula,moldb_id
12177,14619,"1,8-Cineole","Occurs in eucalyptus, lavender, sage and many ...",C10H18O,35937


In [196]:
health_effects.head()

Unnamed: 0,id,name,description,chebi_name,chebi_id,chebi_definition
0,1,(+)-inotropic,An agent that alters the force or energy of mu...,,,
1,2,(-)-chronotropic,An agent that may change theÂ heartÂ rate by a...,,,
2,3,(-)-inotropic,An agent that alters the force or energy of mu...,,,
3,4,11beta-hydroxysteroid-dehydrogenase inhibitor,,enzyme inhibitor,23924.0,A compound or agent that combines with an enzy...
4,5,12-lipoxygenase inhibitor,,enzyme inhibitor,23924.0,A compound or agent that combines with an enzy...


### Only need to keep compound_id and health_effect_id. Brief comparison with compound table and health effect table shows that this table's "original health effect name" and "original compound name" columns aren't necessary. This info is contained in the health effects and compound tables.

# Exploratory data analysis
## compounds pathways table

In [219]:
compounds_pathways = pd.read_sql_query('''SELECT cp.* 
                            FROM compounds_pathways cp
                            ;''', engine)

2019-03-16 13:39:26,276 INFO sqlalchemy.engine.base.Engine SELECT cp.* 
                            FROM compounds_pathways cp
                            ;


INFO:sqlalchemy.engine.base.Engine:SELECT cp.* 
                            FROM compounds_pathways cp
                            ;


2019-03-16 13:39:26,276 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [222]:
compounds_pathways.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1604 entries, 0 to 1603
Data columns (total 7 columns):
id             1604 non-null object
compound_id    1604 non-null object
pathway_id     1604 non-null object
creator_id     1604 non-null object
updater_id     1604 non-null object
created_at     1604 non-null object
updated_at     1604 non-null object
dtypes: object(7)
memory usage: 87.8+ KB


In [223]:
compounds_pathways.head()

Unnamed: 0,id,compound_id,pathway_id,creator_id,updater_id,created_at,updated_at
0,1,316,1,,,2015-02-23 22:40:48,2015-02-23 22:40:48
1,2,362,2,,,2015-02-23 22:40:58,2015-02-23 22:40:58
2,3,374,3,,,2015-02-23 22:41:14,2015-02-23 22:41:14
3,4,374,4,,,2015-02-23 22:41:14,2015-02-23 22:41:14
4,5,433,1,,,2015-02-23 22:42:20,2015-02-23 22:42:20


### Only keep compound_id and pathway_id.

<a id = 'droppingcolumns'></a>

[(Back to top)](#top)

# Dropping columns

<a id = 'dropcol_foods'></a>

## Dropping columns from food table

In [104]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [23]:
#Look at columns in foods table
cur.execute('''PRAGMA table_info(foods)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'name', '', 0, None, 0),
 (2, 'name_scientific', '', 0, None, 0),
 (3, 'description', '', 0, None, 0),
 (4, 'itis_id', '', 0, None, 0),
 (5, 'wikipedia_id', '', 0, None, 0),
 (6, 'wikipedia_id_img', '', 0, None, 0),
 (7, 'picture_content_type', '', 0, None, 0),
 (8, 'picture_file_size', '', 0, None, 0),
 (9, 'picture_updated_at', '', 0, None, 0),
 (10, 'legacy_id', '', 0, None, 0),
 (11, 'food_group', '', 0, None, 0),
 (12, 'food_subgroup', '', 0, None, 0),
 (13, 'food_type', '', 0, None, 0),
 (14, 'created_at', '', 0, None, 0),
 (15, 'updated_at', '', 0, None, 0),
 (16, 'creator_id', '', 0, None, 0),
 (17, 'updater_id', '', 0, None, 0),
 (18, 'export_to_afcdb', '', 0, None, 0),
 (19, 'category', '', 0, None, 0),
 (20, 'ncbi_taxonomy_id', '', 0, None, 0),
 (21, 'export_to_foodb', '', 0, None, 0)]

In [12]:
#Create a new foods table
cur.execute('''CREATE TABLE foods_copy
                (id INTEGER PRIMARY KEY, name TEXT, name_scientific TEXT, 
                description TEXT, wikipedia_id TEXT, wikipedia_id_img TEXT,
                food_group TEXT, food_subgroup TEXT, food_type TEXT, 
                category TEXT, ncbi_taxonomy_id INTEGER)
            ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [14]:
#Insert values into columns of new table
cur.execute('''INSERT INTO foods_copy
                    (id, name, name_scientific, 
                    description, wikipedia_id, wikipedia_id_img,
                    food_group, food_subgroup, food_type, 
                    category, ncbi_taxonomy_id)
                SELECT id, name, name_scientific, 
                    description, wikipedia_id, wikipedia_id_img, 
                    food_group, food_subgroup, food_type, 
                    category, ncbi_taxonomy_id
                FROM foods
            ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [15]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM foods_copy
                LIMIT 5
            ;''').fetchall()

#Check that columns were populated from old table
#Doesn't work before commit. Works after commit.
#pd.read_sql_query('''SELECT * 
#                        FROM foods_copy
#                    ;''', engine)

[(1,
  'Angelica',
  'Angelica keiskei',
  'Angelica is a genus of about 60 species of tall biennial and perennial herbs in the family Apiaceae, native to temperate and subarctic regions of the Northern Hemisphere, reaching as far north as Iceland and Lapland. They grow to 1äóñ3 m tall, with large bipinnate leaves and large compound umbels of white or greenish-white flowers. Some species can be found in purple moor and rush pastures.',
  'Angelica',
  '1.jpg',
  'Herbs and Spices',
  'Herbs',
  'Type 1',
  'specific',
  357850),
 (2,
  'Savoy cabbage',
  'Brassica oleracea var. sabauda',
  'Savoy cabbage (Brassica oleracea convar. capitata var. sabauda L. ) is a variety of the cabbage, a cultivar of the plant species Brassica oleracea. Savoy cabbage is a winter vegetable. A variety of the savoy cabbage is the January King Cabbage. Savoy cabbage can be used in a variety of recipes. It pairs well with red wine, apples, spices, horseradish and meat. It can be used for roulades, in stews a

In [56]:
foods_copy_Nones = foods_copy['ncbi_taxonomy_id'] == ''

In [57]:
foods_copy_Nones.value_counts()

False    631
True     276
Name: ncbi_taxonomy_id, dtype: int64

Null values are not reading as Null but there are empty values.

In [16]:
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(foods_copy)
;''',).fetchall()

#Check data types of new table
#Only works after commit
#New table dropped 90kb from memory compared to old table
#foods_copy = pd.read_sql_query('''SELECT * 
#                        FROM foods_copy
#                    ;''', engine)
#foods_copy.info()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'name_scientific', 'TEXT', 0, None, 0),
 (3, 'description', 'TEXT', 0, None, 0),
 (4, 'wikipedia_id', 'TEXT', 0, None, 0),
 (5, 'wikipedia_id_img', 'TEXT', 0, None, 0),
 (6, 'food_group', 'TEXT', 0, None, 0),
 (7, 'food_subgroup', 'TEXT', 0, None, 0),
 (8, 'food_type', 'TEXT', 0, None, 0),
 (9, 'category', 'TEXT', 0, None, 0),
 (10, 'ncbi_taxonomy_id', 'INTEGER', 0, None, 0)]

In [None]:
#Check columns of new table
cur.execute('''PRAGMA table_info(foods_copy)
;''',).fetchall()

In [17]:
#Delete the old table
cur.execute('''DROP TABLE foods;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [18]:
#Rename new table to  foods
cur.execute('''ALTER TABLE foods_copy
                RENAME TO foods
                ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [19]:
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compound_alternate_parents',),
 ('compound_external_descriptors',),
 ('compound_substituents',),
 ('compound_synonyms',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('compounds',),
 ('enzymes',),
 ('flavors',),
 ('food_taxonomies',),
 ('foodcomex_compound_providers',),
 ('foodcomex_compounds',),
 ('health_effects',),
 ('nutrients',),
 ('pathways',),
 ('references',),
 ('foods',)]

In [20]:
#Check columns of new table
cur.execute('''PRAGMA table_info(foods)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'name_scientific', 'TEXT', 0, None, 0),
 (3, 'description', 'TEXT', 0, None, 0),
 (4, 'wikipedia_id', 'TEXT', 0, None, 0),
 (5, 'wikipedia_id_img', 'TEXT', 0, None, 0),
 (6, 'food_group', 'TEXT', 0, None, 0),
 (7, 'food_subgroup', 'TEXT', 0, None, 0),
 (8, 'food_type', 'TEXT', 0, None, 0),
 (9, 'category', 'TEXT', 0, None, 0),
 (10, 'ncbi_taxonomy_id', 'INTEGER', 0, None, 0)]

In [21]:
# Save (commit) the changes
connection.commit()

In [59]:
# We can also close the connection if we are done with it.
# Just be sure any changes have been committed or they will be lost.
connection.close()

[(Back to top)](#top)

<a id = 'dropcol_enzymes'></a>

## Dropping columns from enzymes table

In [25]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(enzymes)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'name', '', 0, None, 0),
 (2, 'gene_name', '', 0, None, 0),
 (3, 'description', '', 0, None, 0),
 (4, 'go_classification', '', 0, None, 0),
 (5, 'general_function', '', 0, None, 0),
 (6, 'specific_function', '', 0, None, 0),
 (7, 'pathway', '', 0, None, 0),
 (8, 'reaction', '', 0, None, 0),
 (9, 'cellular_location', '', 0, None, 0),
 (10, 'signals', '', 0, None, 0),
 (11, 'transmembrane_regions', '', 0, None, 0),
 (12, 'molecular_weight', '', 0, None, 0),
 (13, 'theoretical_pi', '', 0, None, 0),
 (14, 'locus', '', 0, None, 0),
 (15, 'chromosome', '', 0, None, 0),
 (16, 'uniprot_name', '', 0, None, 0),
 (17, 'uniprot_id', '', 0, None, 0),
 (18, 'pdb_id', '', 0, None, 0),
 (19, 'genbank_protein_id', '', 0, None, 0),
 (20, 'genbank_gene_id', '', 0, None, 0),
 (21, 'genecard_id', '', 0, None, 0),
 (22, 'genatlas_id', '', 0, None, 0),
 (23, 'hgnc_id', '', 0, None, 0),
 (24, 'hprd_id', '', 0, None, 0),
 (25, 'organism', '', 0, None, 0),
 (26, 'general_citatio

In [26]:
#Create a new enzymes table
cur.execute('''CREATE TABLE enzymes_copy
                (id INTEGER PRIMARY KEY, name TEXT, gene_name TEXT, 
                uniprot_id TEXT)
            ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [29]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(enzymes_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'gene_name', 'TEXT', 0, None, 0),
 (3, 'uniprot_id', 'TEXT', 0, None, 0)]

In [31]:
#Insert values into columns of new table
cur.execute('''INSERT INTO enzymes_copy
                    (id, name, gene_name, uniprot_id)
                SELECT id, name, gene_name, uniprot_id
                FROM enzymes
            ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [32]:
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(enzymes_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'gene_name', 'TEXT', 0, None, 0),
 (3, 'uniprot_id', 'TEXT', 0, None, 0)]

In [34]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM enzymes_copy
                LIMIT 5
            ;''').fetchall()

[(1, 'UDP-glucuronosyltransferase 2B28', 'UGT2B28', 'Q9BY64'),
 (2, 'Estrogen receptor beta', 'ESR2', 'Q92731'),
 (3, 'UDP-glucuronosyltransferase 2B4', 'UGT2B4', 'P06133'),
 (4, 'UDP-glucuronosyltransferase 1-4', 'UGT1A4', 'P22310'),
 (5, 'UDP-glucuronosyltransferase 2B10', 'UGT2B10', 'P36537')]

In [35]:
#Delete the old table
cur.execute('''DROP TABLE enzymes;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [36]:
#Rename new table to enzymes
cur.execute('''ALTER TABLE enzymes_copy
                RENAME TO enzymes
                ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [37]:
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compound_alternate_parents',),
 ('compound_external_descriptors',),
 ('compound_substituents',),
 ('compound_synonyms',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('compounds',),
 ('flavors',),
 ('food_taxonomies',),
 ('foodcomex_compound_providers',),
 ('foodcomex_compounds',),
 ('health_effects',),
 ('nutrients',),
 ('pathways',),
 ('references',),
 ('foods',),
 ('enzymes',)]

In [39]:
#Check columns of new table
cur.execute('''PRAGMA table_info(enzymes)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'gene_name', 'TEXT', 0, None, 0),
 (3, 'uniprot_id', 'TEXT', 0, None, 0)]

In [40]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)

<a id = 'dropcol_health_effects'></a>

## Dropping columns from health_effects table

In [41]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(health_effects)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'name', '', 0, None, 0),
 (2, 'description', '', 0, None, 0),
 (3, 'chebi_name', '', 0, None, 0),
 (4, 'chebi_id', '', 0, None, 0),
 (5, 'created_at', '', 0, None, 0),
 (6, 'updated_at', '', 0, None, 0),
 (7, 'creator_id', '', 0, None, 0),
 (8, 'updater_id', '', 0, None, 0),
 (9, 'chebi_definition', '', 0, None, 0)]

In [42]:
#Create a new enzymes table
cur.execute('''CREATE TABLE health_effects_copy
                (id INTEGER PRIMARY KEY, name TEXT, description TEXT, 
                chebi_name TEXT, chebi_id INTEGER, chebi_definition TEXT)
            ;''')

#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(health_effects_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'chebi_name', 'TEXT', 0, None, 0),
 (4, 'chebi_id', 'INTEGER', 0, None, 0),
 (5, 'chebi_definition', 'TEXT', 0, None, 0)]

In [43]:
#Insert values into columns of new table
cur.execute('''INSERT INTO health_effects_copy
                    (id, name, description,
                    chebi_name, chebi_id, chebi_definition)
                SELECT id, name, description,
                    chebi_name, chebi_id, chebi_definition
                FROM health_effects
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(health_effects_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'chebi_name', 'TEXT', 0, None, 0),
 (4, 'chebi_id', 'INTEGER', 0, None, 0),
 (5, 'chebi_definition', 'TEXT', 0, None, 0)]

In [44]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM health_effects_copy
                LIMIT 5
            ;''').fetchall()

[(1,
  '(+)-inotropic',
  'An agent that alters the force or energy of muscular contractions. Positively inotropic agents increase the strength of muscular contraction.',
  'NULL',
  'NULL',
  'NULL'),
 (2,
  '(-)-chronotropic',
  'An agent that may change theÂ\xa0heartÂ\xa0rate by affecting theÂ\xa0nervesÂ\xa0controlling the heart, or by changing theÂ\xa0rhythmÂ\xa0produced by theÂ\xa0sinoatrial node. Negative chronotropes decrease heart rate.',
  'NULL',
  'NULL',
  'NULL'),
 (3,
  '(-)-inotropic',
  'An agent that alters the force or energy of muscular contractions. NegativelyÂ\xa0inotropicÂ\xa0agents weaken the force of muscular contractions.',
  'NULL',
  'NULL',
  'NULL'),
 (4,
  '11beta-hydroxysteroid-dehydrogenase inhibitor',
  'NULL',
  'enzyme inhibitor',
  23924,
  'A compound or agent that combines with an enzyme in such a manner as to prevent the normal substrate-enzyme combination and the catalytic reaction.'),
 (5,
  '12-lipoxygenase inhibitor',
  'NULL',
  'enzyme inhib

In [45]:
#Delete the old table
cur.execute('''DROP TABLE health_effects;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE health_effects_copy
                RENAME TO health_effects
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compound_alternate_parents',),
 ('compound_external_descriptors',),
 ('compound_substituents',),
 ('compound_synonyms',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('compounds',),
 ('flavors',),
 ('food_taxonomies',),
 ('foodcomex_compound_providers',),
 ('foodcomex_compounds',),
 ('nutrients',),
 ('pathways',),
 ('references',),
 ('foods',),
 ('enzymes',),
 ('health_effects',)]

In [46]:
#Check columns of new table
cur.execute('''PRAGMA table_info(health_effects)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'chebi_name', 'TEXT', 0, None, 0),
 (4, 'chebi_id', 'INTEGER', 0, None, 0),
 (5, 'chebi_definition', 'TEXT', 0, None, 0)]

In [47]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)

<a id = 'dropcol_flavors'></a>

## Dropping columns from flavors table

In [48]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(flavors)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'name', '', 0, None, 0),
 (2, 'flavor_group', '', 0, None, 0),
 (3, 'category', '', 0, None, 0),
 (4, 'created_at', '', 0, None, 0),
 (5, 'updated_at', '', 0, None, 0),
 (6, 'creator_id', '', 0, None, 0),
 (7, 'updater_id', '', 0, None, 0)]

In [49]:
#Create a new enzymes table
cur.execute('''CREATE TABLE flavors_copy
                (id INTEGER PRIMARY KEY, name TEXT, flavor_group TEXT)
            ;''')

#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(flavors_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'flavor_group', 'TEXT', 0, None, 0)]

In [50]:
#Insert values into columns of new table
cur.execute('''INSERT INTO flavors_copy
                    (id, name, flavor_group)
                SELECT id, name, flavor_group
                FROM flavors
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(flavors_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'flavor_group', 'TEXT', 0, None, 0)]

In [51]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM flavors_copy
                LIMIT 5
            ;''').fetchall()

[(1, 'celery', 'vegetable'),
 (2, 'corn', 'vegetable'),
 (3, 'cucumber', 'vegetable'),
 (4, 'horseradish', 'vegetable'),
 (5, 'vegetable', 'vegetable')]

In [52]:
#Delete the old table
cur.execute('''DROP TABLE flavors;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE flavors_copy
                RENAME TO flavors
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compound_alternate_parents',),
 ('compound_external_descriptors',),
 ('compound_substituents',),
 ('compound_synonyms',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('compounds',),
 ('food_taxonomies',),
 ('foodcomex_compound_providers',),
 ('foodcomex_compounds',),
 ('nutrients',),
 ('pathways',),
 ('references',),
 ('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',)]

In [53]:
#Check columns of new table
cur.execute('''PRAGMA table_info(flavors)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'flavor_group', 'TEXT', 0, None, 0)]

In [54]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)

<a id = 'dropcol_pathways'></a>

## Dropping columns from pathways table

In [72]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(pathways)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'smpdb_id', '', 0, None, 0),
 (2, 'kegg_map_id', '', 0, None, 0),
 (3, 'name', '', 0, None, 0),
 (4, 'created_at', '', 0, None, 0),
 (5, 'updated_at', '', 0, None, 0)]

In [73]:
#Create a new  table
cur.execute('''CREATE TABLE pathways_copy
                (id INTEGER PRIMARY KEY, smpdb_id TEXT, kegg_map_id TEXT, 
                name TEXT)
            ;''')

#Insert values into columns of new table
cur.execute('''INSERT INTO pathways_copy
                    (id, smpdb_id, kegg_map_id, name)
                SELECT id, smpdb_id, kegg_map_id, name
                FROM pathways
            ;''')

#Delete the old table
cur.execute('''DROP TABLE pathways;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE pathways_copy
                RENAME TO pathways
                ;''')

<sqlite3.Cursor at 0x8cfd5e0>

In [74]:
#Check columns of new table
cur.execute('''PRAGMA table_info(pathways)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'smpdb_id', 'TEXT', 0, None, 0),
 (2, 'kegg_map_id', 'TEXT', 0, None, 0),
 (3, 'name', 'TEXT', 0, None, 0)]

In [75]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM pathways
                LIMIT 3
            ;''').fetchall()

[(1, 'SMP00006', 'map00350', 'Tyrosine Metabolism'),
 (2, 'SMP00068', 'map00150', 'Androgen and Estrogen Metabolism'),
 (3, 'SMP00011', 'map00562', 'Inositol Metabolism')]

In [76]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)

<a id = 'dropcol_food_tax'></a>

## Dropping columns from food_taxonomies table

In [237]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(food_taxonomies)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 0),
 (1, 'food_id', 'INTEGER', 0, None, 0),
 (2, 'ncbi_taxonomy_id', 'INTEGER', 0, None, 0),
 (3, 'classification_name', 'TEXT', 0, None, 0),
 (4, 'classification_order', 'INTEGER', 0, None, 0),
 (5, 'order_18', 'INTEGER', 0, None, 0),
 (6, 'order_17', 'INTEGER', 0, None, 0),
 (7, 'order_16', 'INTEGER', 0, None, 0),
 (8, 'order_15', 'INTEGER', 0, None, 0),
 (9, 'order_14', 'INTEGER', 0, None, 0),
 (10, 'order_13', 'INTEGER', 0, None, 0),
 (11, 'order_12', 'INTEGER', 0, None, 0),
 (12, 'order_11', 'INTEGER', 0, None, 0),
 (13, 'order_10', 'INTEGER', 0, None, 0),
 (14, 'order_09', 'INTEGER', 0, None, 0),
 (15, 'order_08', 'INTEGER', 0, None, 0),
 (16, 'order_07', 'INTEGER', 0, None, 0),
 (17, 'order_06', 'INTEGER', 0, None, 0),
 (18, 'order_05', 'INTEGER', 0, None, 0),
 (19, 'order_04', 'INTEGER', 0, None, 0),
 (20, 'order_03', 'INTEGER', 0, None, 0),
 (21, 'order_02', 'INTEGER', 0, None, 0),
 (22, 'order_01', 'INTEGER', 0, None, 0)]

In [238]:
#Create a new  table
cur.execute('''CREATE TABLE food_taxonomies_copy
                (id INTEGER PRIMARY KEY, food_id INTEGER, 
                ncbi_taxonomy_id INTEGER, classification_name TEXT, 
                classification_order INTEGER,
                order_18 TEXT,
                order_17 TEXT, order_16 TEXT, order_15 TEXT, order_14 TEXT, order_13 TEXT,
                order_12 TEXT, order_11 TEXT, order_10 TEXT, order_09 TEXT, order_08 TEXT,
                order_07 TEXT, order_06 TEXT, order_05 TEXT, order_04 TEXT, order_03 TEXT,
                order_02 TEXT, order_01 TEXT,
                FOREIGN KEY(food_id) REFERENCES foods(id))
            ;''')

<sqlite3.Cursor at 0x233df570>

In [239]:
#Insert values into columns of new table
cur.execute('''INSERT INTO food_taxonomies_copy
                    (id, food_id, ncbi_taxonomy_id, classification_name,
                    classification_order,
                order_18, order_17, order_16, order_15, order_14, order_13,
                order_12, order_11, order_10, order_09, order_08,
                order_07, order_06, order_05, order_04, order_03,
                order_02, order_01)
                SELECT id, food_id, ncbi_taxonomy_id, classification_name,
                    classification_order, 
                order_18, order_17, order_16, order_15, order_14, order_13,
                order_12, order_11, order_10, order_09, order_08,
                order_07, order_06, order_05, order_04, order_03,
                order_02, order_01
                FROM food_taxonomies
            ;''')

#Delete the old table
cur.execute('''DROP TABLE food_taxonomies;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE food_taxonomies_copy
                RENAME TO food_taxonomies
                ;''')

<sqlite3.Cursor at 0x233df570>

In [240]:
#Check columns of new table
cur.execute('''PRAGMA table_info(food_taxonomies)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'food_id', 'INTEGER', 0, None, 0),
 (2, 'ncbi_taxonomy_id', 'INTEGER', 0, None, 0),
 (3, 'classification_name', 'TEXT', 0, None, 0),
 (4, 'classification_order', 'INTEGER', 0, None, 0),
 (5, 'order_18', 'TEXT', 0, None, 0),
 (6, 'order_17', 'TEXT', 0, None, 0),
 (7, 'order_16', 'TEXT', 0, None, 0),
 (8, 'order_15', 'TEXT', 0, None, 0),
 (9, 'order_14', 'TEXT', 0, None, 0),
 (10, 'order_13', 'TEXT', 0, None, 0),
 (11, 'order_12', 'TEXT', 0, None, 0),
 (12, 'order_11', 'TEXT', 0, None, 0),
 (13, 'order_10', 'TEXT', 0, None, 0),
 (14, 'order_09', 'TEXT', 0, None, 0),
 (15, 'order_08', 'TEXT', 0, None, 0),
 (16, 'order_07', 'TEXT', 0, None, 0),
 (17, 'order_06', 'TEXT', 0, None, 0),
 (18, 'order_05', 'TEXT', 0, None, 0),
 (19, 'order_04', 'TEXT', 0, None, 0),
 (20, 'order_03', 'TEXT', 0, None, 0),
 (21, 'order_02', 'TEXT', 0, None, 0),
 (22, 'order_01', 'TEXT', 0, None, 0)]

In [241]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM food_taxonomies
                LIMIT 3
            ;''').fetchall()

[(1,
  1,
  357850,
  '\\Eukaryota\\',
  1,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  '\\Eukaryota\\'),
 (2,
  1,
  357850,
  '\\Viridiplantae\\',
  2,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  '\\Viridiplantae\\',
  None),
 (3,
  1,
  357850,
  '\\Streptophyta\\',
  3,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  '\\Streptophyta\\',
  None,
  None)]

In [242]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)

<a id = 'dropcol_compounds'></a>

## Dropping columns from compounds table

In [70]:
compound_columns_to_keep = []

In [None]:
#for sub in subgroups:
#    if any(generic in sub for generic in generics):
#        generic_subgroups.append(sub)

In [71]:
for column in compounds.columns:
    if not any(to_drop in column for to_drop in compound_columns_to_drop):
        compound_columns_to_keep.append(column)

In [100]:
compound_columns_to_keep

['id', 'name', 'description', 'moldb_formula', 'moldb_id']

In [101]:
compounds[compound_columns_to_keep]

Unnamed: 0,id,name,description,moldb_formula,moldb_id
0,1,Mulberrofuran P,Constit. of Morus alba (white mulberry) [CCD],C34H22O9,31941
1,4,Cyanidin 3-(6''-acetyl-galactoside),Constit. of the leaves of Nymphaea alba [CCD],C23H23O12,80753
2,13,Cyanidin 3-(6''-succinyl-glucoside),Constit. of Phragmites australis [CCD],C25H25O14,80754
3,14,Pelargonidin 3-(6''-succinyl-glucoside),,C25H25O13,80799
4,22,Cyanidin 3-O-(6''-acetyl-arabinoside),A polyphenol compound found in foods of plant ...,C22H21O11,80755
5,24,Petunidin 3-O-(6''-acetyl-galactoside),,C24H25O13,27777
6,25,Peonidin 3-(6''-acetyl-galactoside),,C24H25O12,27774
7,27,Malvidin 3-(6''-acetyl-galactoside),,C25H27O13,27770
8,31,Pelargonidin 3-arabinoside,,C20H19O9,80800
9,35,Peonidin 3-(6''-p-coumaroyl-glucoside),,C31H29O13,27776


In [None]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info()
;''',).fetchall()

In [107]:
#Create a new  table
cur.execute('''CREATE TABLE compounds_copy
                (id INTEGER PRIMARY KEY, name TEXT, description TEXT,
                moldb_formula TEXT, moldb_id INTEGER)
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(compounds_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'moldb_formula', 'TEXT', 0, None, 0),
 (4, 'moldb_id', 'INTEGER', 0, None, 0)]

In [108]:
#Insert values into columns of new table
cur.execute('''INSERT INTO compounds_copy
                    (id, name, description, moldb_formula, moldb_id)
                SELECT id, name, description, moldb_formula, moldb_id
                FROM compounds
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(compounds_copy)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'moldb_formula', 'TEXT', 0, None, 0),
 (4, 'moldb_id', 'INTEGER', 0, None, 0)]

In [None]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM _copy
                LIMIT 5
            ;''').fetchall()

In [109]:
#Delete the old table
cur.execute('''DROP TABLE compounds;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE compounds_copy
                RENAME TO compounds
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',),
 ('pathways',),
 ('food_taxonomies',),
 ('compounds',)]

In [111]:
#Check columns of new table
cur.execute('''PRAGMA table_info(compounds)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'description', 'TEXT', 0, None, 0),
 (3, 'moldb_formula', 'TEXT', 0, None, 0),
 (4, 'moldb_id', 'INTEGER', 0, None, 0)]

In [112]:
# Save (commit) the changes
connection.commit()

In [121]:
connection.close()

In [113]:
compounds = pd.read_sql_query('''SELECT * 
                                    FROM compounds''', engine)

2019-03-16 12:28:30,631 INFO sqlalchemy.engine.base.Engine SELECT * 
                                    FROM compounds


INFO:sqlalchemy.engine.base.Engine:SELECT * 
                                    FROM compounds


2019-03-16 12:28:30,631 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


In [114]:
compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28771 entries, 0 to 28770
Data columns (total 5 columns):
id               28771 non-null int64
name             28771 non-null object
description      28771 non-null object
moldb_formula    28771 non-null object
moldb_id         28771 non-null object
dtypes: int64(1), object(4)
memory usage: 1.1+ MB


In [120]:
compounds[compounds['moldb_id'] != ''].shape[0]

24399

[(Back to top)](#top)
## Dropping columns from compounds_enzymes table

In [139]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [243]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(compounds_enzymes)
;''',).fetchall()

[(0, 'id', 'INTEGER', 0, None, 1),
 (1, 'compound_id', 'INTEGER', 0, None, 0),
 (2, 'enzyme_id', 'INTEGER', 0, None, 0)]

In [244]:
#Create a new  table
cur.execute('''CREATE TABLE compounds_enzymes_copy
                (compound_id INTEGER, enzyme_id INTEGER,
                FOREIGN KEY(compound_id) REFERENCES compounds(id),
                FOREIGN KEY(enzyme_id) REFERENCES enzymes(id))
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(compounds_enzymes_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'enzyme_id', 'INTEGER', 0, None, 0)]

In [246]:
#Insert values into columns of new table
cur.execute('''INSERT INTO compounds_enzymes_copy
                    (compound_id, enzyme_id)
                SELECT compound_id, enzyme_id
                FROM compounds_enzymes
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(compounds_enzymes_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'enzyme_id', 'INTEGER', 0, None, 0)]

In [247]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM compounds_enzymes_copy
                LIMIT 5
            ;''').fetchall()

[(362, 1), (362, 2), (362, 3), (362, 4), (362, 5)]

In [248]:
#Delete the old table
cur.execute('''DROP TABLE compounds_enzymes;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE compounds_enzymes_copy
                RENAME TO compounds_enzymes
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',),
 ('pathways',),
 ('compounds',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',),
 ('food_taxonomies',),
 ('compounds_enzymes',)]

In [249]:
#Check columns of new table
cur.execute('''PRAGMA table_info(compounds_enzymes)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'enzyme_id', 'INTEGER', 0, None, 0)]

In [250]:
# Save (commit) the changes
connection.commit()

In [150]:
connection.close()

[(Back to top)](#top)
## Dropping columns from compounds_flavors table

In [184]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [186]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(compounds_flavors)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'compound_id', '', 0, None, 0),
 (2, 'flavor_id', '', 0, None, 0),
 (3, 'citations', '', 0, None, 0),
 (4, 'created_at', '', 0, None, 0),
 (5, 'updated_at', '', 0, None, 0),
 (6, 'creator_id', '', 0, None, 0),
 (7, 'updater_id', '', 0, None, 0),
 (8, 'source_id', '', 0, None, 0),
 (9, 'source_type', '', 0, None, 0)]

In [187]:
#Create a new  table
cur.execute('''CREATE TABLE compounds_flavors_copy
                (compound_id INTEGER, flavor_id INTEGER,
                FOREIGN KEY(compound_id) REFERENCES compounds(id),
                FOREIGN KEY(flavor_id) REFERENCES flavors(id))
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(compounds_flavors_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'flavor_id', 'INTEGER', 0, None, 0)]

In [188]:
#Insert values into columns of new table
cur.execute('''INSERT INTO compounds_flavors_copy
                    (compound_id, flavor_id)
                SELECT compound_id, flavor_id
                FROM compounds_flavors
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(compounds_flavors_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'flavor_id', 'INTEGER', 0, None, 0)]

In [190]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM compounds_flavors_copy
                LIMIT 5
            ;''').fetchall()

[(11947, 159), (8298, 213), (8298, 156), (11889, 231), (11889, 166)]

In [191]:
#Delete the old table
cur.execute('''DROP TABLE compounds_flavors;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE compounds_flavors_copy
                RENAME TO compounds_flavors
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compounds_health_effects',),
 ('compounds_pathways',),
 ('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',),
 ('pathways',),
 ('food_taxonomies',),
 ('compounds',),
 ('compounds_enzymes',),
 ('compounds_flavors',)]

In [192]:
#Check columns of new table
cur.execute('''PRAGMA table_info(compounds_flavors)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'flavor_id', 'INTEGER', 0, None, 0)]

In [193]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)
## Dropping columns from compounds_health_effects table

In [208]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [209]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(compounds_health_effects )
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'compound_id', '', 0, None, 0),
 (2, 'health_effect_id', '', 0, None, 0),
 (3, 'orig_health_effect_name', '', 0, None, 0),
 (4, 'orig_compound_name', '', 0, None, 0),
 (5, 'orig_citation', '', 0, None, 0),
 (6, 'citation', '', 0, None, 0),
 (7, 'citation_type', '', 0, None, 0),
 (8, 'created_at', '', 0, None, 0),
 (9, 'updated_at', '', 0, None, 0),
 (10, 'creator_id', '', 0, None, 0),
 (11, 'updater_id', '', 0, None, 0),
 (12, 'source_id', '', 0, None, 0),
 (13, 'source_type', '', 0, None, 0)]

In [210]:
#Create a new  table
cur.execute('''CREATE TABLE compounds_health_effects_copy
                (compound_id INTEGER, health_effect_id INTEGER,
                FOREIGN KEY(compound_id) REFERENCES compounds(id),
                FOREIGN KEY(health_effect_id) REFERENCES health_effects(id))
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(compounds_health_effects_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'health_effect_id', 'INTEGER', 0, None, 0)]

In [213]:
#Insert values into columns of new table
cur.execute('''INSERT INTO compounds_health_effects_copy
                    (compound_id, health_effect_id)
                SELECT compound_id, health_effect_id
                FROM compounds_health_effects 
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(compounds_health_effects_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'health_effect_id', 'INTEGER', 0, None, 0)]

In [215]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM compounds_health_effects_copy
                LIMIT 5
            ;''').fetchall()

[(453, 1), (2100, 1), (9030, 2), (13577, 2), (14619, 2)]

In [216]:
#Delete the old table
cur.execute('''DROP TABLE compounds_health_effects;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE compounds_health_effects_copy
                RENAME TO compounds_health_effects 
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('compounds_pathways',),
 ('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',),
 ('pathways',),
 ('food_taxonomies',),
 ('compounds',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',)]

In [217]:
#Check columns of new table
cur.execute('''PRAGMA table_info(compounds_health_effects )
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'health_effect_id', 'INTEGER', 0, None, 0)]

In [218]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)
## Dropping columns from compounds_pathways table

In [None]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [225]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info(compounds_pathways)
;''',).fetchall()

[(0, 'id', '', 0, None, 0),
 (1, 'compound_id', '', 0, None, 0),
 (2, 'pathway_id', '', 0, None, 0),
 (3, 'creator_id', '', 0, None, 0),
 (4, 'updater_id', '', 0, None, 0),
 (5, 'created_at', '', 0, None, 0),
 (6, 'updated_at', '', 0, None, 0)]

In [226]:
#Create a new  table
cur.execute('''CREATE TABLE compounds_pathways_copy
                (compound_id INTEGER, pathway_id INTEGER,
                FOREIGN KEY(compound_id) REFERENCES compounds(id),
                FOREIGN KEY(pathway_id) REFERENCES pathways(id))
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(compounds_pathways_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'pathway_id', 'INTEGER', 0, None, 0)]

In [228]:
#Insert values into columns of new table
cur.execute('''INSERT INTO compounds_pathways_copy
                    (compound_id, pathway_id)
                SELECT compound_id, pathway_id
                FROM compounds_pathways
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(compounds_pathways_copy)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'pathway_id', 'INTEGER', 0, None, 0)]

In [229]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM compounds_pathways_copy
                LIMIT 5
            ;''').fetchall()

[(316, 1), (362, 2), (374, 3), (374, 4), (433, 1)]

In [230]:
#Delete the old table
cur.execute('''DROP TABLE compounds_pathways;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE compounds_pathways_copy
                RENAME TO compounds_pathways
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

[('foods',),
 ('enzymes',),
 ('health_effects',),
 ('flavors',),
 ('pathways',),
 ('food_taxonomies',),
 ('compounds',),
 ('compounds_enzymes',),
 ('compounds_flavors',),
 ('compounds_health_effects',),
 ('compounds_pathways',)]

In [231]:
#Check columns of new table
cur.execute('''PRAGMA table_info(compounds_pathways)
;''',).fetchall()

[(0, 'compound_id', 'INTEGER', 0, None, 0),
 (1, 'pathway_id', 'INTEGER', 0, None, 0)]

In [232]:
# Save (commit) the changes
connection.commit()

[(Back to top)](#top)
## Dropping columns from [    ] table

In [None]:
# connect to db
connection = sqlite3.connect('foodb.db')
cur = connection.cursor()

In [None]:
#Look at columns in enzymes table
cur.execute('''PRAGMA table_info()
;''',).fetchall()

In [None]:
#Create a new  table
cur.execute('''CREATE TABLE e_copy
                (id INTEGER PRIMARY KEY, name TEXT, gene_name TEXT, 
                uniprot_id TEXT)
            ;''')

#Look at columns in  table
cur.execute('''PRAGMA table_info(_copy)
;''',).fetchall()

In [None]:
#Insert values into columns of new table
cur.execute('''INSERT INTO _copy
                    (id, name, gene_name, uniprot_id)
                SELECT id, name, gene_name, uniprot_id
                FROM 
            ;''')
#Check data types of new table
#Works before commit

cur.execute('''PRAGMA table_info(_copy)
;''',).fetchall()

In [None]:
#Check that columns were populated from old table
#This function works without committing 
cur.execute('''SELECT *
                FROM _copy
                LIMIT 5
            ;''').fetchall()

In [None]:
#Delete the old table
cur.execute('''DROP TABLE ;''')

#Rename new table to enzymes
cur.execute('''ALTER TABLE _copy
                RENAME TO 
                ;''')
#Check list of tables in database
cur.execute('''SELECT name
                FROM sqlite_master 
                WHERE type ='table' 
                    AND name NOT LIKE 'sqlite_%'
            ;''').fetchall()

In [None]:
#Check columns of new table
cur.execute('''PRAGMA table_info()
;''',).fetchall()

In [None]:
# Save (commit) the changes
connection.commit()

<a id = 'dropping_tables'></a>

[(Back to top)](#top)
# Dropping whole tables

In [122]:
cur.execute('''DROP TABLE nutrients;''')

OperationalError: no such table: nutrients

In [123]:
cur.execute('''DROP TABLE [references];''')

<sqlite3.Cursor at 0x8cfd5e0>

In [124]:
# Save (commit) the changes
connection.commit()

In [125]:
connection.close()

In [34]:
#Drop compound-related tables
cur.execute('''DROP TABLE foodcomex_compound_providers;''')
cur.execute('''DROP TABLE foodcomex_compounds;''')
cur.execute('''DROP TABLE compound_alternate_parents;''')
cur.execute('''DROP TABLE compound_external_descriptors;''')
cur.execute('''DROP TABLE compound_substituents;''')
cur.execute('''DROP TABLE compound_synonyms;''')
connection.commit()

In [35]:
connection.close()

In [None]:
###############################################################################################

In [None]:
#This encoding works with Windows.
foods = pd.read_csv('foods.csv', encoding = "ISO-8859-1")




In [None]:
foods.head()

In [None]:
compounds = pd.read_csv('compounds.csv', encoding = "ISO-8859-1")

In [None]:
compounds_flavors = pd.read_csv('compounds_flavors.csv', encoding = "ISO-8859-1")

In [None]:
foodcomex_compounds = pd.read_csv('foodcomex_compounds.csv', encoding = "ISO-8859-1")