# Gene Finder
The purpose of this function is to turn GenBank genome data into an organized `.csv` file of relevant fields.
These fields are the chromosome, gene name, locus tag, start position, stop position,
direction of transcription, and the entire gene sequence. Listing genomic data in this fashion improves
bioinformatics-related workflows in which gene sequences are the focus.

In [None]:
from Bio import SeqIO
from Bio import Seq
import csv
import numpy as np
import pandas as pd

def _first_qualifier(feature, key):
    """Return the first value of qualifier *key* on *feature*, or None if absent."""
    values = feature.qualifiers.get(key)
    if not values:
        return None
    # Qualifier values are normally lists; tolerate a bare value as well.
    return values[0] if isinstance(values, (list, tuple)) else values


def gene_finder(file, chromosome=np.nan):
    """Tabulate every ``gene`` feature of a single-record GenBank file.

    Args:
        file: Path or handle of a GenBank file holding exactly one record
            (it is read with ``SeqIO.read``).
        chromosome: Label stored in the ``chromosome`` column of every row.
            Defaults to NaN when no chromosome label is supplied.

    Returns:
        A ``pandas.DataFrame`` with one row per ``gene`` feature and the
        columns ``chromosome``, ``gene names``, ``locus tag``, ``start``,
        ``stop``, ``direction`` and ``sequence``:

        * ``gene names`` / ``locus tag`` hold the first value of the ``gene``
          / ``locus_tag`` qualifier, or None when the qualifier is missing.
        * ``start`` / ``stop`` are the integer bounds of the feature location,
          i.e. exactly the indices used to slice the record sequence.
        * ``direction`` is ``"(+)"`` or ``"(-)"`` for strand 1 / -1 and an
          empty string otherwise.
        * ``sequence`` is ``record.seq[start:stop]`` as text.  The slice is
          taken from the record sequence as stored, so genes whose direction
          is ``"(-)"`` are not reverse-complemented.

        The frame is empty (but still has all columns) when the record has no
        ``gene`` features.
    """
    columns = ['chromosome', 'gene names', 'locus tag', 'start', 'stop',
               'direction', 'sequence']
    strand_labels = {1: "(+)", -1: "(-)"}

    record = SeqIO.read(file, "genbank")
    chromosome_sequence = record.seq

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for feature in record.features:
        # Only gene features describe a gene; other feature types (source,
        # CDS, mRNA, ...) must not produce rows of their own.
        if feature.type != "gene":
            continue

        # Read the coordinates from the location object rather than from its
        # string form, which also keeps compound ("join") and fuzzy ("<", ">")
        # locations working.
        location = feature.location
        lower = int(location.start)
        upper = int(location.end)

        rows.append({
            'chromosome': chromosome,
            'gene names': _first_qualifier(feature, "gene"),
            'locus tag': _first_qualifier(feature, "locus_tag"),
            'start': lower,
            'stop': upper,
            'direction': strand_labels.get(location.strand, ""),
            'sequence': str(chromosome_sequence[lower:upper]),
        })

    return pd.DataFrame(rows, columns=columns)
    

In [None]:
from Bio import SeqIO
from Bio import Seq
import csv
import numpy
import pandas as pd

# Export the gene features of chromosomes 1-15 to "chromosomes.csv".
#
# Each "chromosomes/chromosome <n>.flat" file is read as a single-record
# GenBank file and every gene feature becomes one row holding the chromosome
# number, gene name, locus tag, start, stop, direction and sequence.
columns = ['chromosome', 'gene names', 'locus tag', 'start', 'stop', 'direction', 'sequence']
strand_labels = {1: "(+)", -1: "(-)"}

# Rows are gathered as plain dicts and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []

for filenumber in range(1, 16):
    # Build the path from the number directly instead of rewriting digits in
    # the previous path.
    filename = "chromosomes/chromosome {}.flat".format(filenumber)
    record = SeqIO.read(filename, "genbank")
    currentChromosome = record.seq

    for feature in record.features:
        # Only gene features describe a gene; skip source, CDS, mRNA, ...
        if feature.type != "gene":
            continue

        # Qualifier values are lists; keep the first entry, or None when the
        # qualifier is missing so that dropna() below can act on it.
        gene_values = feature.qualifiers.get("gene")
        locus_values = feature.qualifiers.get("locus_tag")
        name = gene_values[0] if gene_values else None
        locus = locus_values[0] if locus_values else None

        # Take coordinates from the location object rather than its string
        # form, so compound ("join") and fuzzy ("<", ">") locations work too.
        location = feature.location
        lower = int(location.start)
        upper = int(location.end)

        rows.append({
            'chromosome': filenumber,
            'gene names': name,
            'locus tag': locus,
            'start': lower,
            'stop': upper,
            'direction': strand_labels.get(location.strand, ""),
            # Slice of the stored sequence; "(-)" genes are not
            # reverse-complemented.
            'sequence': str(currentChromosome[lower:upper]),
        })

df = pd.DataFrame(rows, columns=columns)

# Locus tags are meant to be unique: keep the first row for each tag and
# discard genes that have no locus tag at all.
df.drop_duplicates(subset="locus tag", keep="first", inplace=True)
df.dropna(subset=["locus tag"], inplace=True)

df.to_csv("chromosomes.csv", index=False)
print(df)