# Data science project in Python

In [None]:
import pandas
from Bio import SeqIO
from Bio import Entrez
from Bio.SeqUtils import GC, molecular_weight

In [None]:
def get_gc_and_mw_from_gbid(id='NM_177676.6'):
    """Fetch one GenBank nucleotide record from NCBI and summarise its sequence.

    Parameters
    ----------
    id : str
        Identifier handed to ``Entrez.efetch`` for the ``nucleotide``
        database. The name shadows the builtin ``id``; it is kept so that
        existing keyword callers keep working.

    Returns
    -------
    tuple
        ``(GC(seq), molecular_weight(seq))`` computed on the sequence of the
        fetched record, both with the Biopython default arguments.

    Notes
    -----
    Performs a network request on every call. Errors raised by
    ``Entrez.efetch`` or ``SeqIO.read`` propagate to the caller.
    """
    Entrez.email = 'A.N.Other@example.com'  # Always tell NCBI who you are
    # retmode is spelled out so the GenBank flat-file (text) form is requested
    # explicitly rather than relying on the server-side default.
    handle = Entrez.efetch(db="nucleotide", id=id, rettype="gb", retmode="text")
    try:
        seq_record = SeqIO.read(handle, "gb")
    finally:
        # Release the network handle even when parsing the record fails.
        handle.close()
    return GC(seq_record.seq), molecular_weight(seq_record.seq)

In [None]:
# Try the single-record helper on one accession and show the returned pair.
gc_and_mw = get_gc_and_mw_from_gbid('NM_177676.6')
print(gc_and_mw)

In [None]:
def get_gc_and_mw_from_gbids(df_ids):
    """Run ``get_gc_and_mw_from_gbid`` on every identifier in ``df_ids``.

    Parameters
    ----------
    df_ids : iterable
        Identifiers, visited in iteration order; each one is passed to
        ``get_gc_and_mw_from_gbid``.

    Returns
    -------
    tuple of (list, list)
        ``(gcs, mws)``: the first and second values returned for each
        identifier, in input order. Both lists are empty for empty input.
    """
    # One (gc, mw) pair per identifier, fetched sequentially.
    pairs = [get_gc_and_mw_from_gbid(gbid) for gbid in df_ids]
    gcs = [gc for gc, _ in pairs]
    mws = [mw for _, mw in pairs]
    return gcs, mws

In [None]:
# Read the tab-separated mouse table and preview its first rows.
mouse = pandas.read_csv('data/GRCm38.gff3', delimiter='\t')
mouse.head(n=5)

In [None]:
# Remove rows containing missing values, modifying `mouse` in place.
mouse.dropna(inplace=True)
# Show the distinct values found in the `type` column.
feature_types = mouse['type'].unique()
print(feature_types)

In [None]:
# Keep only the rows whose `type` column equals 'exon'.
is_exon = mouse['type'] == 'exon'
exon_mouse = mouse[is_exon]

In [None]:
# Preview the first rows of the exon-only table.
exon_mouse.head(n=5)

In [None]:
# Take the first nine rows (positions 0-8) as a small working sample.
small_exon_mouse = exon_mouse.iloc[:9]

In [None]:
# Compute the GC / molecular-weight lists for the identifiers in the `gbid`
# column of the sample.
sample_gbids = small_exon_mouse['gbid']
gcs, mws = get_gc_and_mw_from_gbids(sample_gbids)

In [None]:
# Append the GC values as a new last column named 'gc'.
small_exon_mouse.insert(loc=small_exon_mouse.shape[1], column='gc', value=gcs)

In [None]:
# Append the molecular-weight values as a new last column named 'mw'.
small_exon_mouse.insert(loc=small_exon_mouse.shape[1], column='mw', value=mws)

In [None]:
# Preview the sample after the columns were added.
small_exon_mouse.head(n=5)

In [None]:
%matplotlib inline
small_exon_mouse.gc.plot.hist()

In [None]:
# Read the tab-separated human table and preview its first rows.
human = pandas.read_csv('data/GRCh38.gff3', delimiter='\t')
human.head(n=5)

In [None]:
# Read the tab-separated zebrafish table and preview its first rows.
zebrafish = pandas.read_csv('data/GRCz11.gff3', delimiter='\t')
zebrafish.head(n=5)

In [None]:
# Read the tab-separated panda table and preview its first rows.
panda = pandas.read_csv('data/AilMel.gff3', delimiter='\t')
panda.head(n=5)