In [None]:
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.SeqUtils import GC, molecular_weight
import matplotlib.pyplot as plt

In [None]:
def plot_gff_stats(fn, label, your_email, n_records = None):
    """Box-plot exon statistics for one .gff file.

    Loads the exon table via get_exon_data and hands it to
    plot_stats_from_df, which draws one box plot per statistic
    (lengths, GC% and molecular weight).

    Args:
        fn: file path of the gff.
        label: the title to be given to each boxplot.
        your_email: passed on so that NCBI knows who you are.
        n_records: for trial runs, the number of exons to plot;
            leave as None to use every exon.
    """
    # Fetch first, then plot; the dataframe is not needed afterwards.
    plot_stats_from_df(get_exon_data(fn, n_records, your_email), label)

def get_exon_data(fn, n_records, your_email):
    """From a gff file, obtain a dataframe filtered for exons only,
    excluding rows without GBIDs, with sequence statistics attached.

    The file is read as a tab-separated table whose header row must
    contain at least the columns 'type' and 'gbid'. For every remaining
    gbid the GenBank record is fetched from NCBI (one network request
    per row) and its sequence length, GC% and molecular weight are
    computed.

    Args:
        fn: file path of the gff.
        n_records: for trial runs, set this to the number of exons
            you wish to plot. A falsy value (None or 0) keeps every row.
        your_email: so that NCBI knows who you are.

    Returns:
        A new dataframe holding the selected exon rows plus the columns
        'lengths', 'GC' and 'molecular_weight'.
    """
    Entrez.email = your_email

    # `sep` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    gff = pd.read_csv(fn, sep='\t')

    # Keep exon rows only, and drop just those whose gbid is missing;
    # a gap in an unrelated column must not discard the row.
    is_exon = gff['type'] == 'exon'
    gff_exons = gff.loc[is_exon].dropna(subset=['gbid'])

    # shorten the table if desired
    if n_records:
        gff_exons = gff_exons.head(n_records)

    # obtain sequence records for each exon with a gbid
    records = []
    for gbid in gff_exons['gbid']:
        handle = Entrez.efetch('nucleotide', id=gbid, rettype='gb')
        try:
            records.append(SeqIO.read(handle, 'gb'))
        finally:
            # always release the connection, even if parsing fails
            handle.close()

    # calculate the stats, one value per fetched record
    lengths = [len(r.seq) for r in records]
    gcs = [GC(r.seq) for r in records]
    weights = [molecular_weight(r.seq) for r in records]

    # Attach the stats as new columns on a fresh frame; assigning into the
    # filtered slice in place would raise SettingWithCopyWarning.
    return gff_exons.assign(
        lengths=lengths,
        GC=gcs,
        molecular_weight=weights,
    )

def plot_stats_from_df(df, label):
    """Draw one box plot per statistic column of a dataframe.

    A separate figure is shown for each of the columns 'lengths',
    'GC' and 'molecular_weight'.

    Args:
        df: dataframe containing the three columns named above.
        label: the title to be given to each boxplot.
    """
    for col in ['lengths', 'GC', 'molecular_weight']:
        # Plot from the dataframe passed in, not from a notebook-level
        # variable that may be stale or undefined on a fresh kernel.
        df[col].plot(kind='box')
        plt.title(label)
        plt.show()

In [None]:
# Species label -> gff file name (looked up under the data/ directory).
animal_files = {
    'mouse': 'GRCm38.gff3',
    'human': 'GRCh38.gff3',
    'zebrafish': 'GRCz11.gff3',
    'panda': 'AilMel.gff3',
}

In [None]:
# Trial run: box-plot the first 5 exon records of every species.
for animal, file_name in animal_files.items():
    plot_gff_stats(
        fn='data/' + file_name,
        label=animal,
        your_email='jct61@cam.ac.uk',
        n_records=5,
    )