In [None]:
from phages2050.features.io.fasta import FastaReader
from phages2050.features.transformers.kmers import KMersTransformer, GenomeAvgTransformer
from phages2050.embeddings.nucleotides.word2vec import Word2VecModelManager, Word2VecEmbedding

In [None]:
# Fetch the newest nucleotide Word2Vec embedding model; the download step
# creates a local word2vec_model directory and returns its location.
# NOTE(review): the file name is pinned to the 21.07.2020 release while the
# manager is asked for the "newest" model -- confirm the two still match.
MODEL_PKL_NAME = 'word2vec-embedding-21.07.2020.pkl'
model_dir = Word2VecModelManager().download_model()
model_pkl_file = f'{model_dir}/{MODEL_PKL_NAME}'

In [None]:
# Example input sample: path to a FASTA (or multi-FASTA) file.
# The path is relative, so the file has to be present in the directory
# the notebook is run from.
fasta_file = 'NC_001604.fasta'

In [None]:
# Open the FASTA file with the project's reader and convert its content
# into a pandas DataFrame (`sample`), which the following cells transform.
fasta_reader = FastaReader(fasta_file)
sample = fasta_reader.to_df()

In [None]:
# Transform each genome sequence in `sample` into a sequence of k-mers.
# The transformer is built with no arguments, so its default settings apply
# (the k-mer length is whatever KMersTransformer defaults to -- not visible here).
kmt = KMersTransformer()
X_kmt = kmt.transform(sample)

In [None]:
# Load the Word2Vec embedding model from the pickle file located above.
# NOTE(review): presumably this unpickles the downloaded file -- only point
# it at model files obtained from a trusted source.
w2v = Word2VecEmbedding(model_pkl_file=model_pkl_file)

In [None]:
# Transform the bacteriophage genome (as k-mers) into an averaged vector
# space, using the gensim model carried by the loaded Word2Vec embedding.
genome_avg = GenomeAvgTransformer(gensim_model=w2v.model)

# Backward-compatible alias: this variable used to be misspelled `genone_avg`;
# keep the old name bound so any cell still referring to it keeps working.
genone_avg = genome_avg

# Return pandas DataFrame with fixed-length numeric vector
# ready for 3D plot exploration or Machine Learning classification
genome_avg.transform(X_kmt)