In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import torch

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/taxonomy-dataset/16S_sequences.fasta


In [2]:
fasta_path = "/kaggle/input/taxonomy-dataset/16S_sequences.fasta"

sequences = []
seq_records = set()


def read_from_file(fasta_path, min_length, remove_duplicates):
    seq = ""
    header = ""
    with open(fasta_path, "r") as f:
        for line in f:
            if line.startswith(">"):
                header = line[1:].strip()
                if seq:
                    seq = seq.upper()
                    seq = seq.replace("N", "")
                    if len(seq) >= min_length:
                        if remove_duplicates:
                            if seq not in seq_records:
                                seq_records.add(seq)
                                specie_id = header[3:].split()[0]
                                specie_id = specie_id[:-2]
                                species_name = " ".join(header.split()[1:3])
                                sequences.append((specie_id, species_name, seq))
                            else:
                                sequences.append((specie_id, species_name, seq))
                        seq = ""
                    header = ""
            else:
                seq += line.strip()
    return sequences

data = read_from_file(fasta_path, 100, True)

In [3]:
df = pd.DataFrame(data, columns=['label_id', 'label', 'sequence'])
df.head()

Unnamed: 0,label_id,label,sequence
0,118899,Actinomyces bovis,GGTCTATACCGGATATAACAACTCATGGCATGGTTGGTAGTGGAAA...
1,74334,Archaeoglobus fulgidus,GGGTGAGTAACACGTGAGTAACCTGCCCCACTTCTGGATAACCGCT...
2,118890,Actinokineospora fastidiosa,ATTCTGGTTGATCCTGCCAGAGGCCGCTGCTATCCGGCTGGGACTA...
3,44838,Atopobium minutum,TACTTTGGGATAAGCCTGGGAAACTGGGTCTATACCGGATATGACA...
4,118908,Amycolatopsis methanolica,TTGAACGGAGAGTTCGACCTGGCTCAGGATGAACGCTGGCGGCGCG...


In [4]:
from sklearn.model_selection import train_test_split

X = df["sequence"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
)

In [5]:
def tokenize_kmers(k, df):
    sequences = df['sequence']
    
    kmers = [" ".join([seq[i:i+k] for i in range(len(seq) - k + 1)]) for seq in sequences]
    
    return kmers

tokenize_kmers(3, df)[:1]

['GGT GTC TCT CTA TAT ATA TAC ACC CCG CGG GGA GAT ATA TAT ATA TAA AAC ACA CAA AAC ACT CTC TCA CAT ATG TGG GGC GCA CAT ATG TGG GGT GTT TTG TGG GGT GTA TAG AGT GTG TGG GGA GAA AAA AAG AGC GCT CTC TCC CCG CGG GGC GCG CGG GGT GTA TAC ACG CGG GGG GGA GAT ATG TGA GAG AGC GCC CCC CCG CGC GCG CGG GGC GCC CCT CTA TAT ATC TCA CAG AGC GCT CTT TTG TGT GTT TTG TGG GGT GTG TGG GGG GGG GGT GTA TAA AAT ATG TGG GGC GCC CCT CTA TAC ACC CCA CAA AAG AGG GGC GCG CGA GAC ACG CGA GAC ACG CGG GGG GGT GTA TAG AGC GCC CCG CGG GGC GCC CCT CTG TGA GAG AGA GAG AGG GGG GGT GTG TGA GAC ACC CCG CGG GGC GCA CAC ACA CAC ACT CTG TGG GGG GGA GAC ACT CTG TGA GAG AGA GAC ACA CAC ACG CGG GGC GCC CCA CAG AGA GAC ACT CTC TCC CCT CTA TAC ACG CGG GGA GAG AGG GGA GAG AGC GCA CAG AGT GTG TGG GGG GGG GGA GAA AAT ATA TAT ATT TTG TGC GCC CCA CAA AAT ATG TGG GGG GGC GCG CGA GAA AAA AAG AGC GCC CCT CTA TAT ATG TGC GCA CAG AGC GCG CGA GAC ACG CGC GCC CCG CGC GCG CGT GTG TGA GAG AGG GGG GGA GAT ATG TGA GAC ACG CGG GGC GCT CTT TTC TCG CG