# Gene Finder
The purpose of this function is to return the gene annotations of a GenBank genome file as an organized pandas DataFrame.
Each row describes one gene: the chromosome label, gene name, locus tag, start position, stop position,
direction of transcription (strand), and the entire gene sequence. The sequence is sliced directly from the strand stored
in the file, so genes on the `(-)` strand are not reverse-complemented. Listing genomic data in this fashion improves
bioinformatics-related workflows where gene sequences are a focus.

In [28]:
from Bio import SeqIO
from Bio import Seq
import csv
import numpy as np
import pandas as pd

def gene_finder(file, chromosome = np.nan):
    """Return the genes of a single-record GenBank file as a pandas DataFrame.

    Parameters
    ----------
    file : str or file handle
        Passed unchanged to ``SeqIO.read(file, 'genbank')``.
    chromosome : optional
        Label written to the 'chromosome' column of every row
        (defaults to NaN).

    Returns
    -------
    pandas.DataFrame
        One row per distinct locus tag (the first 'gene' feature carrying a
        given tag wins), indexed 0..n-1, with the columns:

        * 'chromosome' -- the ``chromosome`` argument.
        * 'gene name'  -- ``str()`` of the feature's 'gene' qualifier, e.g.
          "['PAU8']", or the string 'None' when the qualifier is absent.
        * 'locus tag'  -- ``str()`` of the 'locus_tag' qualifier, same format.
        * 'start', 'stop' -- bounds of the feature location, as strings.
        * 'direction'  -- '(+)', '(-)', '(?)' or '' depending on the strand.
        * 'sequence'   -- ``record.seq[start:stop]`` as a plain string. It is
          NOT reverse-complemented for '(-)' genes.

        An empty frame with these columns is returned when the record has no
        'gene' features.

    Raises
    ------
    ValueError
        Raised by ``SeqIO.read`` when the file does not contain exactly one
        record.
    """
    columns = ['chromosome', 'gene name', 'locus tag', 'start', 'stop', 'direction', 'sequence']
    # Text shown in the 'direction' column for each Biopython strand value;
    # a strand of None (unstranded / mixed) maps to the empty string.
    strand_labels = {1: '(+)', -1: '(-)', 0: '(?)'}

    record = SeqIO.read(file, 'genbank')
    chromosome_seq = record.seq

    rows = []
    seen_loci = set()
    for feature in record.features:
        # Only 'gene' features describe a gene; CDS, mRNA, source, ... are skipped
        # instead of being appended and filtered out afterwards.
        if feature.type != 'gene':
            continue

        # Qualifier values are lists; they are stringified as-is to keep the
        # established "['YAL068C']" formatting of the output.
        locus = str(feature.qualifiers.get('locus_tag'))
        if locus in seen_loci:
            # Keep only the first gene seen for each locus tag.
            continue
        seen_loci.add(locus)

        # Read the coordinates from the location object rather than parsing
        # its string form, so fuzzy ('<', '>') and compound locations work.
        location = feature.location
        lower = int(location.start)
        upper = int(location.end)

        rows.append({
            'chromosome': chromosome,
            'gene name': str(feature.qualifiers.get('gene')),
            'locus tag': locus,
            'start': str(lower),
            'stop': str(upper),
            'direction': strand_labels.get(location.strand, ''),
            'sequence': str(chromosome_seq[lower:upper]),
        })

    # Building the frame once avoids the removed DataFrame.append and its
    # quadratic copying; passing `columns` fixes the column order and keeps
    # the header even when no genes were found.
    return pd.DataFrame(rows, columns=columns)
    

Here is an example of this function in action, using the data from the first chromosome of *Saccharomyces cerevisiae*.

In [29]:
# Parse the chromosome 1 GenBank file and label every row with chromosome number 1.
chr_1 = gene_finder(file='assets/chromosome 1.flat', chromosome=1)
# Leaving the bare name as the cell's last expression displays the resulting table.
chr_1

Unnamed: 0,chromosome,gene name,locus tag,start,stop,direction,sequence
0,1,['PAU8'],['YAL068C'],1806,2169,(-),CTAGTTTGCGATAGTGTAGATACCGTCCTTGGATAGAGCACTGGAG...
1,1,,['YAL067W-A'],2479,2707,(+),ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTG...
2,1,['SEO1'],['YAL067C'],7234,9016,(-),TTATTTTTCATCAGATACTGATAAGGTTTCAACGTCTTTTGACGTT...
3,1,,['YAL065C'],11564,11951,(-),TTACCATACGATTGCCAGCAATACGGTGGAAATAAAAACACTTATG...
4,1,,['YAL064W-B'],12045,12426,(+),ATGGCAGGTGAAGCAGTTTCGGAACACACACCAGATTCGCAGGAAG...
...,...,...,...,...,...,...,...
96,1,,['YAR061W'],218139,219145,(+),ATGCCTTATCACTATTTATTTTTGGCACTCTTCACCTACCTGGCCA...
97,1,,['YAR064W'],220197,220497,(+),ATGTTGATTGATTTTTGCTGTAGTTATATAGCAGGGACCCACGGAA...
98,1,,['YAR066W'],221048,221660,(+),ATGTTCAATCGTTTTAACAAATTCCAAGCTGCTGTCGCTTTGGCCC...
99,1,,['YAR068W'],222405,222891,(+),ATGCCACAAGTACAGTCGTGGTTTCCTGTTCAGAAACAACCGACGC...


This data can easily be exported as a `.csv` or Microsoft Excel file with pandas. 

In [30]:
# Save the gene table as comma-separated values, leaving out the row index.
chr_1.to_csv('assets/chr_1.csv', index=False)

# Save the same table as a Microsoft Excel workbook (.xlsx), also without the row index.
chr_1.to_excel('assets/chr_1.xlsx', index=False)