This notebook is an example run of the models implemented for this project.

### Imports

In [1]:
import numpy as np
import shutil


from Bio import SeqIO
from Bio.Seq import Seq

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

# custom packages
from packages.metagenomics import sampling2, encoding2
from packages.linear_model.MulticlassLogisticRegression import MulticlassLogisticRegression
from packages.generative_model.naive_bayes import run_naive_bayes

## Sampling and Encoding the fragment dataset
Metagenomics data must be standardized for use in classification.

### Fragment Generation

In [2]:
# Parameters for fragment generation. `output_dir` and `seed` are also read by
# later cells (fragment loading and the train/test split).
output_dir = 'data/example/2000-lengths-dataset'
seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
sample_length = 200  # presumably the fragment length — TODO confirm against sampling2
coverage = 400       # NOTE(review): units/meaning are defined by sampling2 — confirm
seed = 42

# Delete the output directory if a previous run left one behind. A missing
# directory is expected on a first run, so that case only prints a notice.
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    print('Existing directory was not found. Process will generate a directory.')

# Build fragments from the sequence and taxid files into output_dir.
print('Building fragments...')
sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

Building fragments...


### Fragment Encoding

In [3]:
# parameters
pattern = 'fragments*.npy'
k = 6
max_split_attempts = 1000  # upper bound on re-splits before giving up

# Encode the fragments read from output_dir and map their labels to integers.
fragments = sampling2.read_fragments(output_dir, pattern)
X_enc, y = encoding2.encode_fragment_dataset(fragments, k)
le = preprocessing.LabelEncoder()
y_enc = le.fit_transform(y)

print('Encoded fragments...')
print(X_enc.shape)

# Split into training and test sets, re-splitting until both contain every
# class present in the data.
n_classes = len(np.unique(y_enc))
n_classes_train = 0
n_classes_test = 0
X_train, X_test, y_train, y_test = None, None, None, None
count = 0
for attempt in range(max_split_attempts):
    if attempt > 0:
        print('Encoding failed')

    # A constant random_state reproduces the identical split on every call, so
    # retrying with the same seed can never succeed. Offsetting the seed by the
    # attempt number keeps the first attempt equal to the original split
    # (random_state=seed) while making each retry a genuinely different,
    # still-reproducible split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_enc, y_enc, test_size=0.33, random_state=seed + attempt)
    n_classes_train = len(np.unique(y_train))
    n_classes_test = len(np.unique(y_test))
    count = attempt + 1  # number of splits tried so far

    if n_classes_train >= n_classes and n_classes_test >= n_classes:
        break
else:
    # Every attempt left at least one class out of one of the two sets.
    raise ValueError(
        'Not possible for both training and test sets to contain all classes.'
        ' (n_classes, training set length, test set length): '
        f'({n_classes}, {len(y_train)}, {len(y_test)})'
    )

print('Encoding succeeded.')

Encoded fragments...
(16282, 88349)
Encoding succeeded.


## Logistic Regression implementation

In [4]:
# Hyperparameters for the multiclass logistic regression model.
eta = 0.1
epsilon = 0.01
penalty = None
l2_lambda = 0
max_iter = 200

# Gather the constructor arguments in one place, then build and fit the model
# on the training split.
mlr_kwargs = dict(
    eta=eta,
    epsilon=epsilon,
    penalty=penalty,
    l2_lambda=l2_lambda,
    max_iter=max_iter,
    verbose=True,
)
mlr = MulticlassLogisticRegression(**mlr_kwargs)
mlr.fit(X_train, y_train)

# Predict on the held-out test split and report the weighted recall.
y_pred = mlr.predict(X_test)
score = recall_score(y_test, y_pred, average='weighted')
print(score)

n_classifiers 5
training classifier 0
training classifier 1
training classifier 2
training classifier 3
training classifier 4
0.8987718645329363


## Naïve Bayes implementation

In [5]:
# Run the project's naive Bayes helper on the training and test splits and
# print the score it returns.
# NOTE(review): the metric behind this score is defined inside run_naive_bayes;
# confirm it is comparable to the weighted recall reported for the logistic
# regression model.
score = run_naive_bayes(X_train, X_test, y_train, y_test)
print(score)

0.8639746929661333
