In [1]:
import fasttext

import pandas as pd

import joblib

from tools.sarscov2vec import (
    FastaReader,
    KMersTransformer
)

### Requirements

This notebook requires a Python 3.8+ virtual environment with the packages from the `requirements.txt` file installed.

Furthermore, before executing the cells below, you need to download the NLP model, the SVM classification model, and the label encoder:
* `fasttext_unsupervised_kmer7_25k_samples.28.02.2022.bin`
* `svm_supervised_36k_samples.28.02.2022.joblib`
* `label_encoder_36k_samples.28.02.2022.joblib`

#### Load FASTA and extract nucleotide sequence (step 1)

In [2]:
# Path of the FASTA file that holds the genome sample used in this notebook.
fasta_path = 'ON114122.fasta'
virus_sample = FastaReader(fasta_file_path=fasta_path)

In [3]:
# Pull the nucleotide sequence out of the loaded FASTA sample.
virus_sequence = virus_sample.get_sequence()

In [4]:
# Genome length: number of characters in the extracted sequence.
len(virus_sequence)

29724

In [5]:
# Distinct characters present in the genome sequence; anything other than
# A/C/G/T shows up here.
set(virus_sequence)

{'A', 'C', 'G', 'N', 'T'}

In [6]:
# Number of "N" characters in the genome sequence.
virus_sequence.count('N')

7

#### Transform genome sequence to sentence with tokens (step 2)

In [7]:
# Tokenisation settings: 7-character k-mers with a sliding-window value of 1.
kmer_size = 7
kmer_sliding_window = 1
kmer_trans = KMersTransformer(size=kmer_size, sliding_window=kmer_sliding_window)

In [8]:
# Wrap the genome in a one-row DataFrame with a single 'sequence' column.
df = pd.DataFrame({'sequence': [virus_sequence]})
df.head()

Unnamed: 0,sequence
0,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...


In [9]:
# Turn the genome sequence into a "sentence" of k-mer tokens.
virus_sentence = kmer_trans.transform(df)

In [10]:
# Display the transformed result to inspect the generated k-mer tokens.
virus_sentence

0    TTGTAGA TGTAGAT GTAGATC TAGATCT AGATCTG GATCTG...
Name: sequence, dtype: object

#### Execute NLP model to extract 200-dim numerical vector (step 3)

In [11]:
# Pre-trained fastText model file; it must be downloaded before running this cell.
fasttext_model_path = 'fasttext_unsupervised_kmer7_25k_samples.28.02.2022.bin'
nlp_model = fasttext.load_model(fasttext_model_path)



In [12]:
# Take the first (and only) sentence from the transformer output and embed it
# as a single vector with the fastText model.
genome_sentence = virus_sentence.iloc[0]
virus_vector = nlp_model.get_sentence_vector(genome_sentence)
virus_vector.shape

(200,)

In [13]:
# Vector reshaping: turn the flat vector into a single-row 2D array
# (one sample, all features), the form passed to the classifier below.
virus_vector = virus_vector.reshape(1, -1)
virus_vector.shape

(1, 200)

In [14]:
# Display the embedding values for a visual sanity check.
virus_vector

array([[ 0.00629183,  0.03516009, -0.0051704 , -0.01445506,  0.05645438,
         0.06074398,  0.02379811, -0.01511069,  0.02883584,  0.01477559,
        -0.05239385,  0.02924786, -0.00778445,  0.03246634, -0.00512506,
         0.00505514,  0.00276198, -0.02956945,  0.05233557, -0.04806691,
        -0.06677174,  0.00399255, -0.0475532 , -0.08454312,  0.02418624,
         0.04419548,  0.01253094,  0.02593768, -0.007507  , -0.00216588,
         0.0005841 , -0.03582927,  0.10901446,  0.00524319, -0.01559883,
         0.00057139, -0.01283794,  0.03267699, -0.00717529,  0.03690799,
        -0.05053783,  0.00597926, -0.0233377 , -0.01120885, -0.00446461,
         0.08259936, -0.06927563, -0.0570327 , -0.00740768,  0.01789206,
        -0.00453163,  0.01745039, -0.02976232, -0.00581325, -0.00492681,
        -0.00397132, -0.03027195, -0.04675391, -0.04120129, -0.04190159,
         0.05046935,  0.01874417, -0.01484625, -0.00105026, -0.0735895 ,
        -0.02080752, -0.01627279,  0.02954617, -0.0

#### Load and execute the Machine Learning classification model to get the prediction results (step 4)

In [15]:
# Load the trained classification model from disk.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load model files obtained from a trusted source.
ml_model = joblib.load('svm_supervised_36k_samples.28.02.2022.joblib')

In [16]:
# Load the label encoder used to map the classifier's output back to variant names.
# NOTE(review): joblib.load is pickle-based; only load files from a trusted source.
# NOTE(review): `lable_encoder` is a misspelling of `label_encoder`; the name is
# kept as-is because other cells reference it.
lable_encoder = joblib.load('label_encoder_36k_samples.28.02.2022.joblib')

In [17]:
# Predict the encoded class for the single sample, then decode it into the
# human-readable variant name.
encoded_prediction = ml_model.predict(virus_vector)
decoded_prediction = lable_encoder.inverse_transform(encoded_prediction)
predicted_sarscov2_variant = decoded_prediction[0]
predicted_sarscov2_variant

'BA.2'

In [18]:
# Compute class probabilities for the input and keep the first row, which
# belongs to our single genome.
probability_rows = ml_model.predict_proba(virus_vector)
prediction_probabilities = probability_rows[0]
prediction_probabilities

array([0.00000000e+00, 0.00000000e+00, 2.55102041e-05, 1.28794868e-03,
       9.98686541e-01, 0.00000000e+00])

In [19]:
# Report the probability of the class that was actually predicted instead of
# the maximum of the probability vector. `predict` and `predict_proba` are not
# guaranteed to agree on the winning class (scikit-learn documents this for
# SVC probability estimates), so pairing `max(...)` with the predicted label
# could print a percentage that belongs to a different variant.
# The columns of `predict_proba` follow `ml_model.classes_`; decoding those
# classes yields the variant name for each column.
variant_names = list(lable_encoder.inverse_transform(ml_model.classes_))
predicted_variant_position = variant_names.index(predicted_sarscov2_variant)
best_probability = round(prediction_probabilities[predicted_variant_position] * 100, 2)
f"{best_probability}% for {predicted_sarscov2_variant} (sub)variant"

'99.87% for BA.2 (sub)variant'