# Radhika Mardikar, Xinxin Mo

## Step 1:
Looking at the KEGG pathway, we will select 4 enzymes from the glycolysis, TCA cycle and pentose phosphate cycle. 
Glycolysis: hexokinase 1, phosphoglucose isomerase, phosphofructokinase, fructose-bisphosphate aldolase (https://www.ebi.ac.uk/interpro/potm/2004_2/Page2.htm)
TCA: citrate synthase, aconitase, isocitrase dehydrogenase, alpha-ketoglurate (https://www.news-medical.net/life-sciences/Krebs-Cycle-Enzymes.aspx)
pentose phosphate: transketolase, transaldolase, lactonase, phosphopentose isomerase (https://mcb.berkeley.edu/labs/krantz/mcb102/lect_S2008/MCB102-SPRING2008-LECTURE5-PENTOSE.pdf)

In [3]:
from Bio import Entrez
from Bio import SeqIO
#from beautifultable import BeautifulTable
import sqlite3

In [4]:

Entrez.email = "rmardikar@berkeley.edu"
# first row is glycolysis enzyme, second is pentose phosphate cycle, third is TCA
enzymelist = ["2.7.1.1", "5.4.2.2", "3.1.3.11", "3.1.3.9",
              "5.3.1.9", "3.1.3.11", "2.7.1.11", "1.1.5.9",
             "1.2.7.1", "1.2.4.1", "1.1.1.27", "1.1.1.37"]
organismlist = ['Homo sapiens', 'Macaca mulatta', "Escherichia coli"]
idlist = []

for org in organismlist:
    for enzyme in enzymelist:
        handle = Entrez.esearch(db="gene",
                                term = org + '[ORGN] AND ' + enzyme, 
                                idtype = 'acc', 
                                sort='relevance',
                                retmax=1)
        print(org + '[ORGN]' + enzyme)
        record = Entrez.read(handle)
        idlist.append(record["IdList"])


Homo sapiens[ORGN]2.7.1.1
Homo sapiens[ORGN]5.4.2.2
Homo sapiens[ORGN]3.1.3.11
Homo sapiens[ORGN]3.1.3.9
Homo sapiens[ORGN]5.3.1.9
Homo sapiens[ORGN]3.1.3.11
Homo sapiens[ORGN]2.7.1.11
Homo sapiens[ORGN]1.1.5.9
Homo sapiens[ORGN]1.2.7.1
Homo sapiens[ORGN]1.2.4.1
Homo sapiens[ORGN]1.1.1.27
Homo sapiens[ORGN]1.1.1.37
Macaca mulatta[ORGN]2.7.1.1
Macaca mulatta[ORGN]5.4.2.2
Macaca mulatta[ORGN]3.1.3.11
Macaca mulatta[ORGN]3.1.3.9
Macaca mulatta[ORGN]5.3.1.9
Macaca mulatta[ORGN]3.1.3.11
Macaca mulatta[ORGN]2.7.1.11
Macaca mulatta[ORGN]1.1.5.9
Macaca mulatta[ORGN]1.2.7.1
Macaca mulatta[ORGN]1.2.4.1
Macaca mulatta[ORGN]1.1.1.27
Macaca mulatta[ORGN]1.1.1.37
Rattus norvegicus[ORGN]2.7.1.1
Rattus norvegicus[ORGN]5.4.2.2
Rattus norvegicus[ORGN]3.1.3.11
Rattus norvegicus[ORGN]3.1.3.9
Rattus norvegicus[ORGN]5.3.1.9
Rattus norvegicus[ORGN]3.1.3.11
Rattus norvegicus[ORGN]2.7.1.11
Rattus norvegicus[ORGN]1.1.5.9
Rattus norvegicus[ORGN]1.2.7.1
Rattus norvegicus[ORGN]1.2.4.1
Rattus norvegicus[ORGN]1.1.1.

In [6]:
conn = sqlite3.connect('my.db')
c = conn.cursor()
#c.execute("""CREATE TABLE genes (id INT, name TEXT, description TEXT, organism TEXT, chromosome TEXT, start INT, end INT, strand VARCHAR(1));""")
rows = []
for i in idlist:
    print(i)
    handle = Entrez.efetch(db="gene", id = i, rettype = 'gb', retmode = 'text', retnum=1)
    #print(handle.read())
    #print(handle.readlines())
    for line in handle.readlines():
        if 'Name' in line:
            name = line[line.find('Name')+len('Name:')+1:line.find('[')]
            organism = line[line.find('[')+1:line.find(']')]
            print(name)
            print(organism)
        if 'Annotation' in line:
            anno = line[line.find('Annotation')+len('Annotation:')+1:-1]
            #chromosome = line[line.find('Chromosome')+len('Chromosome')+1]
            annoList = line.split()
            chromosome = annoList[2]
            print(annoList)
            startEndSplit = annoList[4].split('..')
            print(startEndSplit)
            start = (startEndSplit[0])[1:-1]
            end = (startEndSplit[1])[0:-1]
            print(int(start), int(end))
        if 'ID' in line:
            iD = int(line.split(' ')[-1])
            print(iD)
    description = '-'
    strand = '-'
    rows.append((iD, name, description, organism, chromosome, start, end, strand))
    print("------------------------------------------")
    #records = Entrez.parse(handle)
    #print(records['Title'])
    #c.execute("""INSERT INTO genes (id, name, description, organism, chromosome, start, end, strand)
    #                            VALUES(?,?,?,?,?,?,?,?), rows;""")

    

['3098']
hexokinase 1 
Homo sapiens (human)
['Annotation:', 'Chromosome', '10', 'NC_000010.11', '(69269991..69401882)']
['(69269991', '69401882)']
(6926999, 69401882)
3098
------------------------------------------
['5236']
phosphoglucomutase 1 
Homo sapiens (human)
['Annotation:', 'Chromosome', '1', 'NC_000001.11', '(63593276..63660245)']
['(63593276', '63660245)']
(6359327, 63660245)
5236
------------------------------------------
['2203']
fructose-bisphosphatase 1 
Homo sapiens (human)
['Annotation:', 'Chromosome', '9', 'NC_000009.12', '(94603133..94640258,', 'complement)']
['(94603133', '94640258,']
(9460313, 94640258)
2203
------------------------------------------
['57818']
glucose-6-phosphatase catalytic subunit 2 
Homo sapiens (human)
['Annotation:', 'Chromosome', '2', 'NC_000002.12', '(168901223..168910000)']
['(168901223', '168910000)']
(16890122, 168910000)
57818
------------------------------------------
['2821']
glucose-6-phosphate isomerase 
Homo sapiens (human)
['Annotat

In [102]:
#for row in rows:
    #print(row)
    #c.execute('INSERT INTO genes values (?,?,?,?,?,?,?,?)', row)
print(len(rows))
c.executemany('INSERT INTO genes values (?,?,?,?,?,?,?,?)', rows)
c.execute("SELECT * FROM genes;")
#print(c.fetchall())
for tu in c.fetchall():
    print(tu)

36
(3098, 'hexokinase 1 ', '-', 'Homo sapiens (human)', '10', 6926999, 69401882, '-')
(5236, 'phosphoglucomutase 1 ', '-', 'Homo sapiens (human)', '1', 6359327, 63660245, '-')
(2203, 'fructose-bisphosphatase 1 ', '-', 'Homo sapiens (human)', '9', 9460313, 94640258, '-')
(57818, 'glucose-6-phosphatase catalytic subunit 2 ', '-', 'Homo sapiens (human)', '2', 16890122, 168910000, '-')
(2821, 'glucose-6-phosphate isomerase ', '-', 'Homo sapiens (human)', '19', 3435333, 34402413, '-')
(2203, 'fructose-bisphosphatase 1 ', '-', 'Homo sapiens (human)', '9', 9460313, 94640258, '-')
(5213, 'phosphofructokinase, muscle ', '-', 'Homo sapiens (human)', '12', 4810525, 48146404, '-')
(5213, 'phosphofructokinase, muscle ', '-', 'Homo sapiens (human)', '12', 4810525, 48146404, '-')
(5213, 'phosphofructokinase, muscle ', '-', 'Homo sapiens (human)', '12', 4810525, 48146404, '-')
(5160, 'pyruvate dehydrogenase E1 alpha 1 subunit ', '-', 'Homo sapiens (human)', 'X', 1934389, 19361707, '-')
(3939, 'lactate

In [11]:
from Bio import Entrez
Entrez.email = 'xinxinmo@berkeley.edu'
handle = Entrez.esearch(db='nucleotide',
                        term='Escherichia coli[ORGN]'+'5.4.2.2',
                        sort='relevance',
                        idtype='acc',
                        retmax=1)
for i in Entrez.read(handle)['IdList']:
    #handle = Entrez.efetch(db='nucleotide', id=i, rettype='fasta', retmode='text')
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='fasta', retmode='text')    
    print(handle.read())


>NZ_PDAR01000013.1 Escherichia coli strain 2012C-4704 NODE_13_length_126606_cov_36.0443_ID_21952, whole genome shotgun sequence
TTATTAAACCTGCCAAAAATATTATTATTTGGCAGGTTTAATTTCTTAACGCAAATATAAAACACAAAAT
TACAACATTTAAATAACAAAACAGCATCTAATATGGCGCTATTCATAATTAATGATTTATTATTTGGGGT
ATGATCGTTTTTTGTTGATCTTCTTCACAGATTATAGCCATTTCATGGATAGAATAACTCTACCTTCAAC
TGACACAGCAAGAGGTAAAGGTAAATGGAAAATAATAACCGCTTAATGCCTCATATAAGGCGGACAACCC
ATATCATGAAGTTTGCCCATCGTAATAGTTTTGACTTTCATTTCTTTAATGCCCGCTAGTCTTCTGACTA
AAGGGCACCCCAACGTACAGGTCTCCCTGACTTTAAGCATTACAGGTTAATACCTGTATTCCTCGGTGCT
CATATACTGCTAACCCTTTTAAACTCTAAATAATTCGAGTCGCAGCACTTGCAACTTGAGGTATGACGAG
TATAGCCAGTTACCGGGCTGGTCTGGGTTATTGCATCTGCAAAAAGCAAACTACTGATTTATTTATCAGC
GGTGGAGCTTTGCTTTTTTTCCGGCGTGATCGATTTCTCCTTTGAGAAATTGAGGACCTGCTATTACCTG
AAATAAAGAGATGAACAAAATGTCAGAATTAAAAATTGCCGTTAGTCGTTCTTGCCCGGATTGTTTTTCC
ACTCATCGTGCATGCGTGAATATAGACGAAAGTAATTATATTGACGTTGCCGCCATTATTTTATCAGTCA
GTGATGTTGAACGTGGAAAACTCGATGAAATAGACGCTACTGGCTATGACATTCCTGTTTTTATTGCAAC
GGAAAATGAAGAACGTGTAC

In [None]:
term = org + '[ORGN]' + enzyme+'NOT whole', 

In [61]:

conn = sqlite3.connect('my.db')
c = conn.cursor()
c.execute("""CREATE TABLE genes (id INT, name TEXT, description TEXT, organism TEXT, chromosome TEXT, start INT, end INT, strand VARCHAR(1));""")
c.execute("""INSERT INTO genes (id, name, description, organism, chromosome, start, end, strand)
                                VALUES(58341,"BRCA1", "Breast Cancer 1","chr17", 43033295, 43170245,'-');""")
c.execute("SELECT * FROM genes WHERE name = 'BRCA1';")
print(c.fetchone())

(58341, 'BRCA1', 'Breast Cancer 1', 'chr17', 43033295, 43170245, '-')
