In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
kaggle_input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import math
import time
import datetime
from collections import Counter
# Show every column when a DataFrame is displayed. The fully qualified key is used because
# the bare 'max_columns' pattern also matches 'styler.render.max_columns' on pandas >= 1.4,
# where it raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
from matplotlib import pyplot as plt

## Version history
* V1. Started
* V2. Removing downloaded files
* V3. No change
* V4. Fixed a discrepancy in filename for Streptococcus pneumoniae

# Introduction

The so-called fractional base content (FBC) spectrum for each species (genome) is the basis for simulating measurements by drawing samples (BOC reads) from the distribution. Therefore, it should be useful for either EDA or modeling to have a database of such spectra on hand for all the bacteria of interest.

The following code is largely verbatim from the authors' implementation that accompanies their [paper](https://www.frontiersin.org/articles/10.3389/fmicb.2020.00257/full):

[https://github.com/rlwphd/DNAFingerprints](https://github.com/rlwphd/DNAFingerprints)

I only adjusted the code slightly by collecting the spectra into two dataframes `df_train` and `df_test` at the end. The designations "train" and "test" refer to the original paper; I do not know if the same species/strains are used in the train and test sets in the current competition.

In [None]:
# Download a zip snapshot of the master branch of the authors' repository.
!wget https://github.com/rlwphd/DNAFingerprints/archive/refs/heads/master.zip
import zipfile
# Unpack the archive into the current working directory; the later cells read the
# sequence files from ./DNAFingerprints-master/Bacteria/...
with zipfile.ZipFile('master.zip', 'r') as zip_ref:
    zip_ref.extractall('.')    

In [None]:
# Fixing a discrepancy in the original file name: the repository ships the file as
# 'Streptococcus_pneumonia.txt'. The file stem is later used as the 'Resistance' label
# of the record, so it is renamed to 'Streptococcus_pneumoniae.txt' before processing.
!mv ./DNAFingerprints-master/Bacteria/Training/Streptococcus_pneumonia.txt ./DNAFingerprints-master/Bacteria/Training/Streptococcus_pneumoniae.txt

In [None]:
def str_count(str_part):
    """Count the bases of a DNA fragment and of its complementary strand.

    Parameters
    ----------
    str_part : str
        Fragment to analyse. Only the uppercase characters 'A', 'T', 'G'
        and 'C' are counted; anything else is ignored.

    Returns
    -------
    tuple
        Two 4-tuples, both ordered (A, T, G, C): the counts of the fragment
        itself, followed by the same counts with A<->T and G<->C swapped
        (i.e. the composition of the complementary strand).
    """
    n_a, n_t, n_g, n_c = (str_part.count(base) for base in 'ATGC')
    forward = (n_a, n_t, n_g, n_c)
    complement = (n_t, n_a, n_c, n_g)
    return forward, complement

def kmer_fingerprints(whole_str,dna_length,kmer_range):
    """Tally the base compositions of all sliding windows of a sequence.

    A window of ``dna_length`` characters is moved over ``whole_str`` one
    position at a time. For every window, both compositions returned by
    ``str_count`` (the window itself and its complement) are tallied.

    Parameters
    ----------
    whole_str : str
        The sequence to scan.
    dna_length : int
        Window width in characters.
    kmer_range : iterable of tuple
        The (A, T, G, C) compositions to report, in output order.

    Returns
    -------
    list of int
        One tally per entry of ``kmer_range``; a composition that never
        occurred is reported as 0.
    """
    n_windows = len(whole_str) - dna_length + 1
    tally = Counter()
    for offset in range(n_windows):
        # str_count yields two compositions per window; both are counted.
        tally.update(str_count(whole_str[offset:offset + dna_length]))

    return [tally[composition] for composition in kmer_range]

def title_extraction(header, resistance):
    """Parse a FASTA-style header line into the metadata fields of a record.

    Parameters
    ----------
    header : str
        The header line, e.g. ``'>ID Genus species ..., complete genome\\n'``.
        Words are separated by single spaces; a trailing line terminator is
        ignored while parsing.
    resistance : str
        Label stored verbatim in the second field of the result.

    Returns
    -------
    list
        ``[record id without its first character, resistance, name, genus,
        dna_type, header]`` where ``name`` is ``'Genus species'`` (or just the
        genus when no third word exists), ``dna_type`` is ``'Plasmid'``,
        ``'Genome'``, ``'Sequence'`` or ``''`` depending on the first word
        containing one of those keywords, and ``header`` is the line exactly
        as passed in.
    """
    # Tokenise without the line terminator so that the last word (for example the
    # species of a three-word header) does not carry a trailing newline.
    heading = header.rstrip('\r\n').split(' ')
    seq_record_id = heading[0]
    # A header consisting of the record id only has no genus; use an empty string
    # instead of failing with IndexError.
    genus = heading[1] if len(heading) >= 2 else ''
    if len(heading) >= 3:
        species = heading[2]
        name = genus + ' ' + species
    else:
        name = genus

    # The first word that contains a keyword decides the type; within one word the
    # precedence is plasmid > genome > sequence.
    dna_type = ""
    for word in heading:
        if 'plasmid' in word:
            dna_type = 'Plasmid'
            break
        elif 'genome' in word:
            dna_type = 'Genome'
            break
        elif 'sequence' in word:
            dna_type = 'Sequence'
            break

    # seq_record_id[1:] drops the leading character of the first word (the '>' marker).
    data_add = [seq_record_id[1:], resistance, name, genus, dna_type, header]

    return data_add

def multiple_str_check(file):
    """Locate the header lines (lines starting with '>') of a sequence file.

    Parameters
    ----------
    file : str
        Path of the text file to scan.

    Returns
    -------
    tuple of (list of str, list of int)
        ``header`` holds every line that starts with '>' (line terminator
        included). ``start`` holds, for each header, the 0-based index of the
        line following it, plus one final entry equal to the total number of
        lines in the file. An empty file yields ``([], [0])``.
    """
    header = []
    start = []
    # Total number of lines seen so far; initialised so that an empty file does not
    # leave it undefined.
    end = 0
    with open(file, 'r') as f:
        for ii, line in enumerate(f):
            if line.startswith('>'):
                header.append(line)
                start.append(ii + 1)
            end = ii + 1
    start.append(end)
    return header, start

def str_extraction(file,start,end,dna_length):
    """Read a range of lines from a file and return them as one wrapped string.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    start : int
        0-based index of the first line to include.
    end : int
        0-based index of the first line to exclude.
    dna_length : int
        Window width used downstream; the first ``dna_length - 1`` characters
        of the joined text are appended to its end.

    Returns
    -------
    str
        The selected lines, each stripped of surrounding whitespace and
        concatenated, followed by a copy of the leading ``dna_length - 1``
        characters (so windows can wrap around the end of the sequence).
    """
    pieces = []
    with open(file,'r') as f:
        for line_no, line in enumerate(f):
            if line_no >= end:
                # Nothing at or beyond `end` is wanted; stop reading.
                break
            if line_no >= start:
                pieces.append(line.strip())

    section = ''.join(pieces)
    return section + section[0:dna_length-1]

def kmer_main(file,dna_length,kmer_range,data_index,df_Genome, resistance):
    """Compute the k-mer composition counts of one file and add them to a frame.

    Parameters
    ----------
    file : str
        Path of the sequence file.
    dna_length : int
        Window width passed on to the extraction and counting helpers.
    kmer_range : list of tuple
        The (A, T, G, C) compositions to count; one output column each.
    data_index : list of str
        Column labels: the metadata labels followed by one label per entry of
        ``kmer_range``.
    df_Genome : pandas.DataFrame
        Frame collecting the records processed so far (columns ``data_index``).
    resistance : str
        Label stored in the second metadata field of the record.

    Returns
    -------
    pandas.DataFrame
        ``df_Genome`` with one row added when the file holds exactly one
        header and that record is not classified as a plasmid; otherwise no
        row is added. Metadata columns are cast to ``object`` and all count
        columns to ``int32``.
    """
    header,start = multiple_str_check(file)
    # Files with zero or several headers are skipped without adding a row.
    if len(header) == 1:
        # NOTE(review): start[1] is the total line count, so `start[1]-1` as the
        # exclusive end leaves out the file's last line. Presumably the files end
        # with a blank line, which would make this harmless -- confirm.
        whole_str = str_extraction(file,start[0],start[1]-1,dna_length)
        species_data = title_extraction(header[0], resistance)
        kmer_results = kmer_fingerprints(whole_str,dna_length,kmer_range)
        species_data.extend(kmer_results)
        # species_data[4] is the DNA type reported by title_extraction.
        if species_data[4] != 'Plasmid':
            df_kmer = pd.DataFrame([species_data], columns=data_index)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
            df_Genome = pd.concat([df_Genome, df_kmer], ignore_index=True)

    # Everything in data_index that is not a k-mer column is metadata. Deriving the
    # split from the inputs (instead of a fixed position) makes sure that every count
    # column, including the first ones, is stored as int32.
    n_meta = len(data_index) - len(kmer_range)
    type_change = {
        name: ('object' if ii < n_meta else 'int32')
        for ii, name in enumerate(data_index)
    }
    df_Genome = df_Genome.astype(type_change)

    return df_Genome


In [None]:
# Folders of the extracted repository that are searched for '.txt' sequence files.
# Files under train_dir end up in df_train; files under test_dir and extra_dir are
# both collected into df_test.
train_dir = './DNAFingerprints-master/Bacteria/Training'
test_dir = './DNAFingerprints-master/Bacteria/Testing'
extra_dir = './DNAFingerprints-master/Bacteria/Extras'

In [None]:
# width of the sliding window (number of bases per k-mer)
dna_length = 10
# every (A, T, G, C) count combination that adds up to dna_length; the C count is
# fixed once the other three are chosen, so it is derived instead of enumerated
kmer_range = [
    (aa, tt, gg, dna_length - aa - tt - gg)
    for aa in range(dna_length + 1)
    for tt in range(dna_length + 1)
    for gg in range(dna_length + 1)
    if aa + tt + gg <= dna_length
]
# one column label per combination, in the same order as kmer_range
data_categories = ["A%sT%sG%sC%s" % combo for combo in kmer_range]
# labels of the non-numerical columns, followed by the k-mer labels, give the full
# column list of the pandas dataframes
data_index = ['Seq Record ID', 'Resistance', 'Name', 'Genus', 'DNA Type', 'Notes'] + data_categories

In [None]:
%%time

# (full path, parent folder name, file name) for every '.txt' sequence file below train_dir
file_list = [
    (os.path.join(root,name),root[3:].split('/')[-1],name)
    for root, dirs, files in os.walk(train_dir)
    for name in files
    if name.endswith(".txt")
]

# empty dataframe that collects one row per processed file
df_train = pd.DataFrame(columns=data_index)

# compute the 10-mer composition counts of each file; the file name without its
# '.txt' extension is used as the record's label
for seq_path, _folder, seq_name in file_list:
    resistance = seq_name[:-4]
    df_train = kmer_main(seq_path,dna_length,kmer_range, data_index, df_train, resistance)
    print('Completed %s' % resistance)

In [None]:
%%time

# (full path, parent folder name, file name) for every '.txt' sequence file below
# test_dir, followed by those below extra_dir
file_list = [
    (os.path.join(root,name),root[3:].split('/')[-1],name)
    for source_dir in (test_dir, extra_dir)
    for root, dirs, files in os.walk(source_dir)
    for name in files
    if name.endswith(".txt")
]

# empty dataframe that collects one row per processed file
df_test = pd.DataFrame(columns=data_index)

# compute the 10-mer composition counts of each file; the file name without its
# '.txt' extension is used as the record's label
for seq_path, _folder, seq_name in file_list:
    resistance = seq_name[:-4]
    df_test = kmer_main(seq_path,dna_length,kmer_range, data_index, df_test, resistance)
    print('Completed %s' % resistance)

In [None]:
# Turn the raw k-mer counts into fractional spectra: every row is divided by its own
# total, so the spectrum columns of each row sum to 1.
# The spectrum columns are addressed by name (data_categories) instead of a hard-coded
# trailing-column count, so the cell stays correct when dna_length changes. Assigning
# whole columns replaces them, which avoids writing floats in place into integer/object
# columns.
for spectra_frame in (df_train, df_test):
    kmer_counts = spectra_frame[data_categories].astype('float64')
    spectra_frame[data_categories] = kmer_counts.div(kmer_counts.sum(axis=1), axis=0)

In [None]:
# Order the rows alphabetically by their 'Resistance' label and renumber them from 0
df_train = df_train.sort_values('Resistance', ignore_index=True)
df_test = df_test.sort_values('Resistance', ignore_index=True)

# The DataFrames

This is how the dataframes look at the end.

In [None]:
# Display the reference spectra built from the Training folder.
df_train

In [None]:
# Display the reference spectra built from the Testing and Extras folders.
df_test

In [None]:
# Write both spectra tables to the working directory, without the row index.
for spectra_table, csv_name in (
    (df_train, 'train_ref_fbc_spec.csv'),
    (df_test, 'test_ref_fbc_spec.csv'),
):
    spectra_table.to_csv(csv_name, index=False)

In [None]:
# Clean up: delete the downloaded archive and the extracted repository folder.
!rm ./master.zip
!rm -r ./DNAFingerprints-master

# "Training" FBC spectra at a glance

In [None]:
# One bar chart per row of df_train, stacked vertically in a single figure.
n_spectra = len(df_train)
fig = plt.figure(figsize=(20,n_spectra*3//2))
# Nine evenly spaced ticks along the 286 bins, labelled with the bin's composition.
tick_positions = [int(i*285/8) for i in range(9)]
tick_labels = [data_categories[pos] for pos in tick_positions]
bin_positions = np.arange(286)
for row in range(n_spectra):
    ax = fig.add_subplot(n_spectra,1,row+1)
    ax.bar(bin_positions,df_train.iloc[row,-286:])
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)
    ax.set_title(df_train.iloc[row]['Name'])
plt.tight_layout()
plt.show()

# "Testing" FBC spectra at a glance

In [None]:
# One bar chart per row of df_test, stacked vertically in a single figure.
n_spectra = len(df_test)
fig = plt.figure(figsize=(20,n_spectra*3//2))
# Nine evenly spaced ticks along the 286 bins, labelled with the bin's composition.
tick_positions = [int(i*285/8) for i in range(9)]
tick_labels = [data_categories[pos] for pos in tick_positions]
bin_positions = np.arange(286)
for row in range(n_spectra):
    ax = fig.add_subplot(n_spectra,1,row+1)
    ax.bar(bin_positions,df_test.iloc[row,-286:])
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)
    ax.set_title(df_test.iloc[row]['Name'])
plt.tight_layout()
plt.show()