# Data Preprocessing

The data preprocessing stage consists of the following steps:

1. creating numerical encodings for the influenza protein sequences,
2. splitting the data into training, validation, and test sets.

We consider two encoding schemes: one-hot and $k$-mer. For each one, a different dataset is generated.

The partitions into training, validation, and test sets are created beforehand so that no algorithm benefits from any special "tweaking" that could result from partitioning the datasets differently.

In [38]:
import os

# SADR: import all auxiliary functions.
from utils import *

from sklearn.utils import shuffle
import pickle

## Loading Data

We begin by parsing the `*.fasta` files containing the sequences into `DataFrame`s and extracting the most relevant metadata. The label `-1` is assigned to the animal sequences and `+1` to the human sequences.

In [40]:
# SADR: FASTA files holding the animal and the human sequences.
animal_sequences = os.path.join("data_animal", "sequences_20250325_2832148.fasta")
human_sequences = os.path.join("data_human", "sequences_20250325_9135018.fasta")

# SADR: parse each file into a metadata dataframe plus its sequences.
# The second argument is the label passed for that file (-1 animal, +1 human).
df_animal, seqs_animal = parse_fasta(animal_sequences, -1)
df_human, seqs_human = parse_fasta(human_sequences, 1)

# SADR: stack the metadata (animal rows first, fresh 0..n-1 index) and attach the
# sequences in the same animal-then-human order.
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it is
# re-exported by `from utils import *` -- confirm.
df_sequences = pd.concat([df_animal, df_human], ignore_index=True).assign(
    sequence=seqs_animal + seqs_human
)

In [41]:
# SADR: preview the first rows of the combined dataframe.
df_sequences.head()

Unnamed: 0,seq_id,virus_info,location,host,label,is_partial,sequence
0,ABV48048.1,"hemagglutinin, partial [Influenza A virus (A/c...",China,Alectoris chukar,-1,1,DKICIGYQSTNSTETVDTLTETNVPVTQAKELLHTEHNGMLCATNL...
1,QZF77349.1,hemagglutinin [Influenza A virus],Uganda,Gallus gallus,-1,0,MEAIPLLTLLLVVTTSSADKICVGHQSTNSTETVDTLTEANVPVTQ...
2,AWV94998.1,hemagglutinin [Influenza A virus],USA,Arenaria interpres,-1,0,MEAKLFVLFCTFTVLKADTICVGYHANNSTDTVDTVLEKNVTVTHS...
3,ACM17970.1,hemagglutinin [Influenza A virus (A/chicken/Ne...,USA,Gallus gallus,-1,0,MNIQILAFIACVLTGAKGDKICLGHHAVANGTKVNTLTEKGIEVVN...
4,ACR27188.1,"hemagglutinin, partial [Influenza A virus (A/t...",France,Meleagris gallopavo,-1,1,ETAYVSVVASHYNRRFTPEIARRPKIRDQEGRINYYWTLLEPSDTI...


In [42]:
# SADR: preview the last rows of the combined dataframe.
df_sequences.tail()

Unnamed: 0,seq_id,virus_info,location,host,label,is_partial,sequence
3995,UWV21073.1,hemagglutinin [Influenza A virus],USA,Homo sapiens,1,0,MKTIIALSNILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTIT...
3996,UUV82183.1,hemagglutinin [Influenza A virus],USA,Homo sapiens,1,0,MKTIIALSNILCLVFAQKIPGNDNSTATLCLGHHAVPNGTVVKTIT...
3997,WOP34589.1,hemagglutinin [Influenza A virus],USA,Homo sapiens,1,0,MKAILVVMLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHS...
3998,WBR18717.1,hemagglutinin [Influenza A virus],Germany,Homo sapiens,1,0,MKTIIALSNILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTIT...
3999,ABI21112.1,hemagglutinin [Influenza A virus (A/Wellington...,New Zealand,Homo sapiens,1,0,MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTIT...


## Encoding

### One-hot

In [43]:
# SADR: store the one-hot encoding of every sequence in a new "one_hot" column.
df_sequences["one_hot"] = df_sequences["sequence"].apply(one_hot_encoding)
df_sequences.head()

Unnamed: 0,seq_id,virus_info,location,host,label,is_partial,sequence,one_hot
0,ABV48048.1,"hemagglutinin, partial [Influenza A virus (A/c...",China,Alectoris chukar,-1,1,DKICIGYQSTNSTETVDTLTETNVPVTQAKELLHTEHNGMLCATNL...,"[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,QZF77349.1,hemagglutinin [Influenza A virus],Uganda,Gallus gallus,-1,0,MEAIPLLTLLLVVTTSSADKICVGHQSTNSTETVDTLTEANVPVTQ...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,AWV94998.1,hemagglutinin [Influenza A virus],USA,Arenaria interpres,-1,0,MEAKLFVLFCTFTVLKADTICVGYHANNSTDTVDTVLEKNVTVTHS...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,ACM17970.1,hemagglutinin [Influenza A virus (A/chicken/Ne...,USA,Gallus gallus,-1,0,MNIQILAFIACVLTGAKGDKICLGHHAVANGTKVNTLTEKGIEVVN...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,ACR27188.1,"hemagglutinin, partial [Influenza A virus (A/t...",France,Meleagris gallopavo,-1,1,ETAYVSVVASHYNRRFTPEIARRPKIRDQEGRINYYWTLLEPSDTI...,"[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [44]:
# SADR: flattening the one-hot matrix of each sequence into a vector.
# The vectors of all sequences are stacked into a single matrix (one row per sequence).
# NOTE(review): a regular 2-D matrix is only obtained if every one-hot matrix has the
# same shape (i.e. one_hot_encoding pads/truncates to a fixed length) -- confirm.
X = np.array([seq.flatten() for seq in df_sequences["one_hot"]])

# SADR: vector with labels.
y = df_sequences["label"].values

# SADR: shuffling features and labels together; the fixed seed makes the partition reproducible.
X_shuffled, y_shuffled = shuffle(X, y, random_state=42)

# SADR: splitting into training and testing (80/20 proportion).
split_idx = int(0.8 * len(X_shuffled))
X_train_full, y_train_full = X_shuffled[:split_idx], y_shuffled[:split_idx]
X_test, y_test = X_shuffled[split_idx:], y_shuffled[split_idx:]

# SADR: splitting the training set into training and validation (80/20 proportion),
# i.e. roughly 64% / 16% / 20% of all sequences for training / validation / testing.
split_idx = int(0.8 * len(X_train_full))
X_train, y_train = X_train_full[:split_idx], y_train_full[:split_idx]
X_val, y_val = X_train_full[split_idx:], y_train_full[split_idx:]

# SADR: saving all three partitions to disk in a single pickle file.
dataset_one_hot = {"X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val, "X_test": X_test, "y_test": y_test}
_save_path = os.path.join("preprocessed_datasets", "dataset_one_hot.pkl")
# The output folder may not exist yet (e.g. on a fresh checkout); open() would fail without it.
os.makedirs(os.path.dirname(_save_path), exist_ok=True)
with open(_save_path, 'wb') as f:
    pickle.dump(dataset_one_hot, f)

### $k$-mer

In [46]:
# SADR: getting the k-mer encoding of the sequences.
# The second return value is bound to `k_mer_dict` rather than `dict`, so the Python
# builtin `dict` is not shadowed for the rest of the notebook.
# NOTE(review): the literal 2 is presumably the k-mer length k, and the second return
# value presumably maps k-mers to feature positions -- confirm against k_mer_encoding.
encoding, k_mer_dict = k_mer_encoding(df_sequences["sequence"].values, 2)

# SADR: separating features and labels.
X = encoding
y = df_sequences["label"].values

# SADR: shuffling features and labels together; the fixed seed makes the partition reproducible.
X_shuffled, y_shuffled = shuffle(X, y, random_state=42)

# SADR: splitting into training and testing (80/20 proportion).
split_idx = int(0.8 * len(X_shuffled))
X_train_full, y_train_full = X_shuffled[:split_idx], y_shuffled[:split_idx]
X_test, y_test = X_shuffled[split_idx:], y_shuffled[split_idx:]

# SADR: splitting the training set into training and validation (80/20 proportion),
# i.e. roughly 64% / 16% / 20% of all sequences for training / validation / testing.
split_idx = int(0.8 * len(X_train_full))
X_train, y_train = X_train_full[:split_idx], y_train_full[:split_idx]
X_val, y_val = X_train_full[split_idx:], y_train_full[split_idx:]

# SADR: saving all three partitions to disk in a single pickle file.
dataset_k_mer = {"X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val, "X_test": X_test, "y_test": y_test}
_save_path = os.path.join("preprocessed_datasets", "dataset_k_mer.pkl")
# The output folder may not exist yet (e.g. on a fresh checkout); open() would fail without it.
os.makedirs(os.path.dirname(_save_path), exist_ok=True)
with open(_save_path, 'wb') as f:
    pickle.dump(dataset_k_mer, f)