In [12]:
import collections
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

if os.getcwd().endswith('notebook'):
    os.chdir('..')
    
from Bio import SeqIO
from Bio.Seq import Seq
import gffutils

In [2]:
sns.set(palette='colorblind', font_scale=1.3)

In [3]:
test_path = os.path.join(os.getcwd(), 'data/dataset_balanced_test.csv')
train_path = os.path.join(os.getcwd(), 'data/dataset_balanced_train.csv')
ncbi_folder = os.path.join(os.getcwd(), 'data/ncbi/raw/')

In [4]:
data_df = pd.concat([pd.read_csv(train_path), pd.read_csv(test_path)])
data_df.shape

(30413, 13)

In [5]:
data_df.head()

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,strand,temperature,temperature_range,sequence,gc_content,ag_content,gt_content
0,Acetobacter aceti,NZ_CP014692.1,rnpB,888876,889266,390,-,26.0,mesophilic,CCAGACGGTCGGGCGATCGCTGTTGCCTTTCAGGTGATGGAGGAAA...,0.638462,0.566667,0.487179
1,Acetobacter aceti,NZ_CP014692.1,ssrA,1260173,1260502,329,+,26.0,mesophilic,GACCTTGCGGAAGGTGATGCATACCCCTATCTTCATGAGTGCAGGA...,0.580547,0.522796,0.504559
2,Acetobacter aceti,NZ_CP014692.1,ffs,2223427,2223522,95,-,26.0,mesophilic,AGAGGCCTGTGATGGACGGGCGCCTTGCCAACCCGGTCAGATCCGG...,0.610526,0.526316,0.536842
3,Acetobacter aceti,NZ_CP014692.1,rrf,2846207,2846323,116,-,26.0,mesophilic,CCTGGTGGCTATGGCGGGGAGAGATCCACCCGATCCCATCCCGAAC...,0.62931,0.508621,0.482759
4,Acetobacter orleanensis,NZ_BAMY01000192.1,murA,8234,9497,1263,+,28.0,mesophilic,ATGGACCGGTTTATTATTCGGGGCGGTCGCCCCCTGCATGGTGAGA...,0.605701,0.501188,0.513856


## Find Open Reading Frames (ORF)

In [21]:
data_df.iloc[0]['sequence']

'CCAGACGGTCGGGCGATCGCTGTTGCCTTTCAGGTGATGGAGGAAAGTCCGGGCTCCACGGGAGAACGGTGCCGGCTAACGGCCGGCGAGGGTGACCTCAGGGAAAGTGCCACAGAAAACAAACCACCTCCCGGGCCTGCTCGCAGGCTGGAGGCAAGGGTGAAACGGTGCGGTAAGAGCGCACCGCGCTTCCGGTAACGGCGGCGGCAGGGCAAACCCCACCGGGAGCAAGACCGAATAGGGACGGTAGGCAGCCTGACTGCCAAAGGTTCTCCCGCCTTCCGTCCGGGTTGGTCGCGCGAGGCGTGTTGCAAGACACGTCCCAGAGGAATGATCGTCACGGTCTCTTCGGAGAACGGACAGAACCCGGCTTACAGACCGTCTGGCACA'

In [27]:
def find_longest_orf(mrna_seq):
    """
    20.1.13  Identifying open reading frames from http://biopython.org/DIST/docs/tutorial/Tutorial.html
    """
    longest_protein = ''
    frame = None
    
    for strand, nuc in [(+1, mrna_seq)]:
        for frame in range(3):
            length = 3 * ((len(mrna_seq) - frame) // 3)  # Multiple of three
            for pro in nuc[frame:frame + length].translate().split("*"):
                if len(pro) > len(longest_protein):
                    longest_protein = pro
                    frame = frame
    return longest_protein, frame

In [29]:
prot, frame = find_longest_orf(Seq(data_df.iloc[0]['sequence']))
prot._data, frame

('ERTALPVTAAAGQTPPGARPNRDGRQPDCQRFSRLPSGLVARGVLQDTSQRNDRHGLFGERTEPGLQTVWH', 2)

## Load CDS

Most of the CDS annotations seem rubbish unfortunately.

In [7]:
def load_fasta_records(path):
    records = []
    with open(path, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            records.append(record)
    return records

In [12]:
def extract_cds(data_df, ncbi_folder):
    specie_names = data_df['specie_name'].unique()
    n_species = len(specie_names)
    
    gene_names = set(data_df['gene_name'].unique().tolist())
    
    columns = [
        'specie_name', 
        'seqid', 
        'gene_name', 
        'start_inclusive',
        'end_exclusive',
        'length',
        'protein_sequence',
        'valid_translation',
    ]
    
    output = []
    for specie_idx, specie_name in enumerate(specie_names):
        print(f'{specie_idx + 1} / {n_species}: {specie_name}')
        
        specie_name_formatted = specie_name.lower().replace(' ', '_')
        fasta_path = os.path.join(ncbi_folder, f'{specie_name_formatted}.fasta')
        gff_path = os.path.join(ncbi_folder, f'{specie_name_formatted}.gff')
        annotations = gffutils.create_db(gff_path, ':memory:', merge_strategy='replace')
        
        records = {
            r.name: r
            for r in load_fasta_records(fasta_path)
        }
        
        for i, f in enumerate(annotations.features_of_type('CDS')):
            if 'gene' not in f.attributes:
                continue

            gene_name = f.attributes['gene'][0]

            if gene_name in gene_names:
                strand = f.strand
                if strand == '+':
                    sequence = records[f.seqid].seq[f.start:f.end]
                elif strand == '-':
                    sequence = records[f.seqid].seq[f.start:f.end].reverse_complement()
                else:
                    raise ValueError(f'Unknown strand "{strand}"')
                    
                protein_seq = sequence.translate()
                
                valid_translation = (
                    len(sequence._data) % 3 == 0 and
                    len([i for i, v in enumerate(protein_seq) if v == '*']) == 0
                )
                
                output.append([
                    specie_name,
                    f.seqid,
                    gene_name,
                    f.start,
                    f.end,
                    len(protein_seq),
                    protein_seq._data,
                    valid_translation,
                ])
                
    return pd.DataFrame(output, columns=columns)

In [13]:
protein_df = extract_cds(data_df, ncbi_folder)
protein_df.head()

1 / 1190: Acetobacter aceti
2 / 1190: Acetobacter orleanensis
3 / 1190: Acetobacter malorum
4 / 1190: Acetobacter syzygii
5 / 1190: Acetobacter orientalis
6 / 1190: Asaia platycodi
7 / 1190: Asaia prunellae
8 / 1190: Belnapia moabensis
9 / 1190: Acidisphaera rubrifaciens
10 / 1190: Roseomonas stagni
11 / 1190: Granulibacter bethesdensis
12 / 1190: Komagataeibacter hansenii
13 / 1190: Komagataeibacter oboediens
14 / 1190: Acholeplasma laidlawii
15 / 1190: Succiniclasticum ruminis
16 / 1190: Acidobacterium capsulatum
17 / 1190: Terriglobus roseus
18 / 1190: Granulicella pectinivorans
19 / 1190: Actinomyces israelii
20 / 1190: Schaalia odontolytica
21 / 1190: Actinomyces viscosus
22 / 1190: Actinomyces gerencseriae
23 / 1190: Schaalia turicensis
24 / 1190: Schaalia cardiffensis
25 / 1190: Schaalia vaccimaxillae
26 / 1190: Actinomyces dentalis
27 / 1190: Actinomyces timonensis
28 / 1190: Mobiluncus mulieris
29 / 1190: Mobiluncus curtisii
30 / 1190: Varibaculum cambriense
31 / 1190: Trueper

233 / 1190: Paraburkholderia phytofirmans
234 / 1190: Paraburkholderia phenoliruptrix
235 / 1190: Paraburkholderia ginsengisoli
236 / 1190: Paraburkholderia sediminicola
237 / 1190: Burkholderia latens
238 / 1190: Burkholderia contaminans
239 / 1190: Paraburkholderia oxyphila
240 / 1190: Cupriavidus basilensis
241 / 1190: Cupriavidus oxalaticus
242 / 1190: Cupriavidus gilardii
243 / 1190: Cupriavidus pinatubonensis
244 / 1190: Pandoraea thiooxydans
245 / 1190: Polynucleobacter sinensis
246 / 1190: Polynucleobacter duraquae
247 / 1190: Ralstonia mannitolilytica
248 / 1190: Arcobacter cryaerophilus
249 / 1190: Arcobacter nitrofigilis
250 / 1190: Arcobacter butzleri
251 / 1190: Arcobacter skirrowii
252 / 1190: Arcobacter thereius
253 / 1190: Campylobacter ureolyticus
254 / 1190: Campylobacter jejuni
255 / 1190: Campylobacter upsaliensis
256 / 1190: Campylobacter hyointestinalis
257 / 1190: Campylobacter hominis
258 / 1190: Campylobacter volucris
259 / 1190: Sulfurospirillum arsenophilum
2

456 / 1190: Alkalilimnicola ehrlichii
457 / 1190: Ectothiorhodospira marina
458 / 1190: Halorhodospira halophila
459 / 1190: Thioalkalivibrio nitratireducens
460 / 1190: Acidiferrobacter thiooxydans
461 / 1190: Buttiauxella agrestis
462 / 1190: Buttiauxella ferragutiae
463 / 1190: Buttiauxella noackiae
464 / 1190: Citrobacter amalonaticus
465 / 1190: Citrobacter koseri
466 / 1190: Citrobacter werkmanii
467 / 1190: Citrobacter sedlakii
468 / 1190: Dickeya dadantii
469 / 1190: Lelliottia amnigena
470 / 1190: Enterobacter asburiae
471 / 1190: Enterobacter cancerogenus
472 / 1190: Lelliottia nimipressuralis
473 / 1190: Franconibacter pulveris
474 / 1190: Erwinia tracheiphila
475 / 1190: Erwinia pyrifoliae
476 / 1190: Erwinia billingiae
477 / 1190: Erwinia toletana
478 / 1190: Erwinia piriflorinigrans
479 / 1190: Erwinia typographi
480 / 1190: Escherichia coli
481 / 1190: Escherichia albertii
482 / 1190: Hafnia alvei
483 / 1190: Hafnia paralvei
484 / 1190: Klebsiella variicola
485 / 1190: K

689 / 1190: Paenibacillus macquariensis
690 / 1190: Paenibacillus wynnii
691 / 1190: Paenibacillus xylanexedens
692 / 1190: Paenibacillus glacialis
693 / 1190: Paenibacillus macerans
694 / 1190: Thermobacillus composti
695 / 1190: Pasteurella skyensis
696 / 1190: Thermincola ferriacetica
697 / 1190: Desulfallas geothermicus
698 / 1190: Desulfofundulus thermocisternus
699 / 1190: Desulfofundulus australicus
700 / 1190: Desulfohalotomaculum alkaliphilum
701 / 1190: Desulfallas arcticus
702 / 1190: Desulfofundulus thermosubterraneus
703 / 1190: Desulfotomaculum hydrothermale
704 / 1190: Tepidibacter formicigenes
705 / 1190: Picrophilus torridus
706 / 1190: Thiomicrorhabdus arctica
707 / 1190: Zavarzinella formosa
708 / 1190: Planococcus halocryophilus
709 / 1190: Sporosarcina globispora
710 / 1190: Sporosarcina psychrophila
711 / 1190: Pseudoalteromonas denitrificans
712 / 1190: Pseudoalteromonas undina
713 / 1190: Pseudoalteromonas tetraodonis
714 / 1190: Pseudoalteromonas prydzensis
715

909 / 1190: Lutibacter profundi
910 / 1190: Pseudoalteromonas gelatinilytica
911 / 1190: Acidobacterium ailaaui
912 / 1190: Sanguibacter gelidistatuariae
913 / 1190: Vibrio cidicii
914 / 1190: Colwellia mytili
915 / 1190: Streptococcus himalayensis
916 / 1190: Pseudomonas canadensis
917 / 1190: Cyclobacterium amurskyense
918 / 1190: Streptomyces himastatinicus
919 / 1190: Helicobacter pullorum
920 / 1190: Shewanella baltica
921 / 1190: Rhizobium rhizogenes
922 / 1190: Oxalobacter formigenes
923 / 1190: Chromobacterium violaceum
924 / 1190: Paraglaciecola chathamensis
925 / 1190: Paraglaciecola psychrophila
926 / 1190: Helicobacter mustelae
927 / 1190: Psychroflexus torquis
928 / 1190: Caldimonas manganoxidans
929 / 1190: Leifsonia rubra
930 / 1190: Shewanella violacea
931 / 1190: Acidothermus cellulolyticus
932 / 1190: Methyloprofundus sedimenti
933 / 1190: Photobacterium piscicola
934 / 1190: Phormidium formosum
935 / 1190: Spirulina subsalsa
936 / 1190: Crinalium epipsammum
937 / 119

1133 / 1190: Planomicrobium glaciei
1134 / 1190: Pseudoalteromonas tunicata
1135 / 1190: Leisingera aquimarina
1136 / 1190: Marinovum algicola
1137 / 1190: Roseisalinus antarcticus
1138 / 1190: Planktotalea frisia
1139 / 1190: Thalassospira profundimaris
1140 / 1190: Elstera litoralis
1141 / 1190: Shewanella algae
1142 / 1190: Shewanella frigidimarina
1143 / 1190: Tuberibacillus calidus
1144 / 1190: Thermus filiformis
1145 / 1190: Tepidanaerobacter syntrophicus
1146 / 1190: Moorella glycerini
1147 / 1190: Thermosulfurimonas dismutans
1148 / 1190: Coprothermobacter proteolyticus
1149 / 1190: Thermomonospora curvata
1150 / 1190: Marinitoga piezophila
1151 / 1190: Pseudothermotoga elfii
1152 / 1190: Defluviitoga tunisiensis
1153 / 1190: Tsukamurella pseudospumae
1154 / 1190: Sporolituus thermophilus
1155 / 1190: Photobacterium kishitanii
1156 / 1190: Aliivibrio salmonicida
1157 / 1190: Vibrio gigantis
1158 / 1190: Luteibacter rhizovicinus
1159 / 1190: Stenotrophomonas maltophilia
1160 / 1

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,protein_sequence,valid_translation
0,Acetobacter orleanensis,NZ_BAMY01000192.1,murA,8235,9497,420,WTGLLFGAVAPCMVRSPLVARKTPG*NCWLQGSSLPSGWC*PTCRI...,False
1,Acetobacter orleanensis,NZ_BAMY01000040.1,pyrF,10954,11658,234,MTRKIGLIAALDTQNAATAQTWVQAVAPSAGAVKLGLEFAYAAGFQ...,False
2,Acetobacter orleanensis,NZ_BAMY01000036.1,groL,7368,9008,546,MAAKDVKFGGDARQRMLRGVDILADAVKVTLGPKGRNVVLDKSFGA...,False
3,Acetobacter orleanensis,NZ_BAMY01000036.1,efp,14606,15172,188,MKQQANLIRAGQVIEHDGRRWTVLKQQIITPGKGGAFIQVEMRDLK...,False
4,Acetobacter orleanensis,NZ_BAMY01000031.1,fusA,156,2243,695,VSAKSDLSKIRNIGITAHIDAGKTTTTERILYYTGVSHKIGEVHDG...,False


In [14]:
protein_df.head()

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,protein_sequence,valid_translation
0,Acetobacter orleanensis,NZ_BAMY01000192.1,murA,8235,9497,420,WTGLLFGAVAPCMVRSPLVARKTPG*NCWLQGSSLPSGWC*PTCRI...,False
1,Acetobacter orleanensis,NZ_BAMY01000040.1,pyrF,10954,11658,234,MTRKIGLIAALDTQNAATAQTWVQAVAPSAGAVKLGLEFAYAAGFQ...,False
2,Acetobacter orleanensis,NZ_BAMY01000036.1,groL,7368,9008,546,MAAKDVKFGGDARQRMLRGVDILADAVKVTLGPKGRNVVLDKSFGA...,False
3,Acetobacter orleanensis,NZ_BAMY01000036.1,efp,14606,15172,188,MKQQANLIRAGQVIEHDGRRWTVLKQQIITPGKGGAFIQVEMRDLK...,False
4,Acetobacter orleanensis,NZ_BAMY01000031.1,fusA,156,2243,695,VSAKSDLSKIRNIGITAHIDAGKTTTTERILYYTGVSHKIGEVHDG...,False


In [18]:
n_valid = len(protein_df[protein_df['valid_translation'] == True])
n_invalid = len(protein_df[protein_df['valid_translation'] == False])
100 * n_valid / (n_valid + n_invalid)

0.12843478554770676

In [19]:
protein_df[protein_df['valid_translation'] == True]

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,protein_sequence,valid_translation
203,Acidisphaera rubrifaciens,NZ_BANB01000272.1,carB,1,1465,488,MPKRTDIRSILIIGAGPIVIGQACEFDYSGAQACKALRAEGYRVIL...,True
655,Actinomyces dentalis,NZ_AUBL01000016.1,lepB,49317,50340,341,MGHRLARGGPARRGRVRGVGGRWTGLAGRRTRAGDTGDRRGGRGAR...,True
993,Aeromonas molluscorum,NZ_AQGQ01000033.1,tuf,1,625,208,MSKEKFERNKPHVNVGTIGHVDHGKTTLTAAITNVLAKHYGGKAFA...,True
996,Aeromonas molluscorum,NZ_AQGQ01000099.1,nusA,1,1420,473,MNKEILLVVDAVSHEKAVPREKIFEALETALATATKKKYEGEIEVR...,True
1044,Aeromonas bivalvium,NZ_CDBT01000035.1,tuf,2072,2210,46,VSKEKFERSKPHVNVGTIGHVDHGKTTLTAAITNVLAKHYGGKAFA,True
1639,Paraglaciecola polaris,NZ_BAER01000001.1,tuf,1,289,96,MAKAKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGSASA...,True
10619,Deinococcus frigens,NZ_JNIW01000055.1,tuf,1,388,129,MAKGTFSRSKPHVNIGTIGHVDHGKTTLTAAITFTAAAMDPTIETM...,True
10625,Deinococcus frigens,NZ_JNIW01000092.1,tuf,1,388,129,MAKGTFSRSKPHVNIGTIGHVDHGKTTLTAAITFTAAAMDPTIETM...,True
12554,Mixta calida,NZ_MLFO01000014.1,tuf,1,211,70,MAKEQFQRNKPHVNVGTIGHVDHGKTTLTAAITTVLSKTYGGQARA...,True
12563,Mixta calida,NZ_MLFO01000042.1,purE,1,256,85,MSLNAAPTRIAIVMGSKSDWATMQFAAEILTALAVPFHVEIVSAHR...,True


In [20]:
protein_df[protein_df['valid_translation'] == False].head()

Unnamed: 0,specie_name,seqid,gene_name,start_inclusive,end_exclusive,length,protein_sequence,valid_translation
0,Acetobacter orleanensis,NZ_BAMY01000192.1,murA,8235,9497,420,WTGLLFGAVAPCMVRSPLVARKTPG*NCWLQGSSLPSGWC*PTCRI...,False
1,Acetobacter orleanensis,NZ_BAMY01000040.1,pyrF,10954,11658,234,MTRKIGLIAALDTQNAATAQTWVQAVAPSAGAVKLGLEFAYAAGFQ...,False
2,Acetobacter orleanensis,NZ_BAMY01000036.1,groL,7368,9008,546,MAAKDVKFGGDARQRMLRGVDILADAVKVTLGPKGRNVVLDKSFGA...,False
3,Acetobacter orleanensis,NZ_BAMY01000036.1,efp,14606,15172,188,MKQQANLIRAGQVIEHDGRRWTVLKQQIITPGKGGAFIQVEMRDLK...,False
4,Acetobacter orleanensis,NZ_BAMY01000031.1,fusA,156,2243,695,VSAKSDLSKIRNIGITAHIDAGKTTTTERILYYTGVSHKIGEVHDG...,False


In [21]:
protein_df[protein_df['valid_translation'] == False].head().values[0]

array(['Acetobacter orleanensis', 'NZ_BAMY01000192.1', 'murA', 8235, 9497,
       420,
       'WTGLLFGAVAPCMVRSPLVARKTPG*NCWLQGSSLPSGWC*PTCRILPILPPWSPCCASTA*RWKPWGGIRQSCPLGVKLPARKRLTISSPKCGPPFWCSARCWPAAGRRVSPYPAVAPLAPARLTST*KGWKRLGPKSRWKTATSTPAHPTA*QGTELCCRLPPWGPPKT**WPPHWRMAARKLSTQRVSQKWWIWQTA*TPWARALPAQAPAILLLMA*KLCTGLNIASCPTVLNAAPMPVPQPLPAVTSALLAATWTIWALWSELLRKPAWKSRRKVT*CASGVPARCVALIL*RNLIRVSRPICRHSSWRCSLSPKARP*LRKPFLKTVSCMCRNLIGWVRALTSTAHPPLFAVCIACLAPRLWRPTCGPLSP*FWQGSPPMGKPSSAVSIIWTGAMKR*NASLPPVVPRLSA*KT',
       False], dtype=object)