# Classify a text document as True/False based on whether it contains gene sequence or not.

## Import libraries

In [1]:
import os
from typing import Union, Tuple, Optional, Set

In [2]:
from pypdf import PdfReader
from docx import Document

In [3]:
import torch

In [4]:
import pandas as pd
import numpy as np

In [5]:
import string
import random

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [38]:
random.seed = 42

In [11]:
import pickle as pkl

In [125]:
import joblib

## Predefined functions

In [37]:
def split_train_test(df):
    # Compute split index - train and test
    split_idx = int(len(df) * 0.80)  # first 80%
    
    # First 80% and remaining 20%
    train = df[:split_idx]
    test = df[split_idx:]
    return train,test

In [13]:
def shuffle_lists(list_of_lists):
    out = []
    for lst in list_of_lists:
        lst_copy = lst[:]            # avoid modifying original
        random.shuffle(lst_copy)     # shuffle in place
        out.append(lst_copy)
    return out

In [39]:
# generate random strings of text

CHARS = string.ascii_letters + string.digits + " .,!?$%&*#@/\n"

def random_garbage(n):
    return ''.join(random.choices(CHARS, k=n))

def generate_fake_content():
    prefix_len = random.randint(0, 5000)
    suffix_len = random.randint(0, 5000)

    prefix = random_garbage(prefix_len)
    suffix = random_garbage(suffix_len)

    return f"{prefix}{suffix}"

In [15]:
# A set of common bioinformatics/genomic file extensions.
# To identify files matching these extensions are classified as 'genomic data'.
GENOMIC_EXTENSION = [
    '.fastq', '.fq', '.fasta', '.fa', '.fna', '.gb', '.gff', '.gff3', '.gtf',
    '.sam', '.bam', '.cram', '.vcf', '.bcf', '.wig', '.bed', '.bigwig', '.tbi',
    '.tabix', '.h5', '.hdf5', # HDF5 often used for single-cell data (e.g., Anndata)
    ]

# A set of common document extensions for text extraction.
DOCUMENT_EXTENSIONS = ['.txt', '.pdf', '.doc', '.docx']

def analyze_file(file_path):
    """
    Analyzes a file path to determine if it is genomic data or a document,
    and returns the classification or the document content.

    Args:
        file_path: The full path to the file.

    Returns:
        A tuple (is_genomic_extension, file_extension, content_or_none_or_emptymessage).
    """
    if not os.path.exists(file_path):
        return (False, None, None)

    # Check for empty file before reading
    if os.path.getsize(file_path) == 0:
        return (False, None, "EMPTY FILE")

    # Get the file name and extension
    file_name = os.path.basename(file_path)
    _, ext = os.path.splitext(file_name)
    ext = ext.lower()

    # 1. Check for Genomic Data
    if ext in GENOMIC_EXTENSIONS:
        return (True, ext, None)

    # 2. Check for Document Data and Extract Content
    elif ext in DOCUMENT_EXTENSIONS:
        if ext == '.txt':
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                if not content.strip():
                    return (False, ext, "Empty or non-text content")
                return (False, ext, content.strip())
            except Exception as e:
                return (False, ext, f"Could not read text file: {e}")

        elif ext == '.pdf':
            reader = PdfReader(file_path)
            content = "".join([page.extract_text() for page in reader.pages])
            if not content.strip():
                    return (False, ext, "Empty or non-text content")
            return (False, ext, content)

        elif ext in ['.doc', '.docx']:
            document = Document(file_path)
            content = "\n".join([paragraph.text for paragraph in document.paragraphs])
            if not content.strip():
                    return (False, ext, "Empty or non-text content")
            return (False, ext, content)

    # 3. Handle Unknown/Other Files
    return (False, ext, None)


In [37]:
# is_genomic, file_type, content = analyze_file("xyz.txt")

In [87]:
adversarial_attempt_type = {1:"Sequence (continuous) with English Text", 
                            2:"Sequence (continuous) with Random Text (All positive samples)", 
                            3:"Sequence with alternate random characters, bounded by English Text",
                            4:"Sequence compressed and obfuscated with English Text",
                            5:"Sequence compressed and replaced by other characters",
                            6:"Sequence replaced by other characters",
                            7:"Fake DNA sequences (All negative samples)",
                            8:"Sequence replaced with multi-character mapping(length:1-5) and bounded with English Text",
                            9:"Sequence broken with random spaces, bounded by English Text",
                            10:"Sequence broken by random strings of text (variable length)",
                            11:"Randomly generated characters (All negative samples)"}

## Load data, prepare train and test data

In [45]:
pos_train=[]
neg_train=[]

pos_test=[]
pos_test_type=[] # adversarial attack type

neg_test=[]
neg_test_type=[] # adversarial attack type

In [46]:
p1=[]
p2=[]
n1=[]

path = "english_and_seq"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['text_with_dna'].values[0])
    n1.append(df['text_without_dna'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [1]*len(p1_test + p2_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [1]*len(n1_test)


In [53]:
p1=[]
p2=[]

path = "text_and_seq"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['generated'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [2]*len(p1_test + p2_test)



In [55]:
p1=[]
p2=[]
n1=[]

path = "text_and_seq_alternate_random_characters"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['text_with_dna'].values[0])
    n1.append(df['text_without_dna'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [3]*len(p1_test + p2_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [3]*len(n1_test)

In [56]:
p1=[]
p2=[]
p3=[]
n1=[]

path = "text_and_seq_compressed"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['compressed'].values[0])
    p3.append(df['text_with_dna'].values[0])
    n1.append(df['text_without_dna'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
p3_train, p3_test = split_train_test(p3)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train + p3_train
pos_test = pos_test + p1_test + p2_test + p3_test
pos_test_type = pos_test_type + [4]*len(p1_test + p2_test + p3_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [4]*len(n1_test)


In [57]:
p1=[]
p2=[]
p3=[]
n1=[]

path = "text_and_seq_compressed_and_replaced"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['compressed'].values[0])
    p3.append(df['text_with_dna'].values[0])
    n1.append(df['text_without_dna'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
p3_train, p3_test = split_train_test(p3)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train + p3_train
pos_test = pos_test + p1_test + p2_test + p3_test
pos_test_type = pos_test_type + [5]*len(p1_test + p2_test + p3_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [5]*len(n1_test)


In [58]:
p1=[]
p2=[]
n1=[]

path = "text_and_seq_replaced_with_other_characters"
for file_name in os.listdir(path):
    file_path = path + "/" + file_name
    if (not (file_name.endswith(".csv") | file_name.endswith(".txt"))):
        continue
    df = pd.read_csv(file_path)
    p1.append(df['sequence'].values[0])
    p2.append(df['text_with_dna'].values[0])
    n1.append(df['text_without_dna'].values[0])

p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [6]*len(p1_test + p2_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [6]*len(n1_test)

In [60]:
n1 = []
for file in os.listdir('single_dna'):
    try:
        with open('single_dna/'+file, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except:
        continue
    n1.append(text)
n1 = [s for s in n1 if len(s) <= 1000]
n1 = random.sample(n1, 100)
n1_train, n1_test = split_train_test(n1)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [7]*len(n1_test)

In [None]:
8) multicharacter mapping -> pos (actual), pos (multicharacter + english), neg (english)
                          200           200                             200                

9) sequence with random space and english text -> pos(actual), pos(actual with random space), pos (actual with random space + english), neg (english)
                                                200         200                            200                                       200


10) sequence with with text breaks -> pos(actual), pos(actual with english text breaks), neg(english)
                                  200          200                                   200

11) randomly generated text -> neg
                               4100  (calculated based on above imbalance of pos and neg)


In [66]:
df = pd.read_pickle('multicharacter_mapping.pkl') #sequence, text_with_dna, text_without_dna
p1 = list(df['sequence'].values)
p2 = list(df['text_with_dna'].values)
n1 = list(df['text_without_dna'].values)
p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [8]*len(p1_test + p2_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [8]*len(n1_test)

In [68]:
df = pd.read_pickle('sequence_with_random_space_and_english_text.pkl') #sequence, text_with_dna, text_without_dna
p1 = list(df['sequence'].values)
p2 = list(df['text_with_dna'].values)
p3 = list(df['sequence_with_random_space'].values)
n1 = list(df['text_without_dna'].values)
p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
p3_train, p3_test = split_train_test(p3)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train + p3_train
pos_test = pos_test + p1_test + p2_test + p3_test
pos_test_type = pos_test_type + [9]*len(p1_test + p2_test + p3_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [9]*len(n1_test)

In [69]:
df = pd.read_pickle('sequence_with_text_breaks.pkl') #sequence, text_with_dna, text_without_dna
p1 = list(df['sequence'].values)
p2 = list(df['text_with_dna'].values)
n1 = list(df['text_without_dna'].values)
p1_train, p1_test = split_train_test(p1)
p2_train, p2_test = split_train_test(p2)
n1_train, n1_test = split_train_test(n1)
# add to global lists
pos_train = pos_train + p1_train + p2_train
pos_test = pos_test + p1_test + p2_test
pos_test_type = pos_test_type + [10]*len(p1_test + p2_test)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [10]*len(n1_test)

In [70]:
n1 = [] 

for i in range(4100):
    n1.append(generate_fake_content())

n1_train, n1_test = split_train_test(n1)
neg_train = neg_train + n1_train
neg_test = neg_test + n1_test
neg_test_type = neg_test_type + [11]*len(n1_test)

In [71]:
len(pos_train), len(pos_test), len(pos_test_type), len(neg_train), len(neg_test), len(neg_test_type)

(4640, 1160, 1160, 4640, 1160, 1160)

### Build train and test set from lists

In [74]:
len(pos_train), len(pos_test), len(neg_train), len(neg_test)

(4640, 1160, 4640, 1160)

In [75]:
train_pos = pos_train
test_pos = pos_test
train_neg = neg_train
test_neg = neg_test

In [92]:
# TRAIN SET

# create pos dataframe
df_train_pos = pd.DataFrame(train_pos, columns=["data"])
df_train_pos["label"] = 1

# create neg dataframe
df_train_neg = pd.DataFrame(train_neg, columns=["data"])
df_train_neg["label"] = 0

# combine
df_combined = pd.concat([df_train_pos, df_train_neg], ignore_index=True)

#shuffle
df_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_shuffled.to_pickle('df_train_final.pkl')

X = df_shuffled["data"].values
y = df_shuffled["label"].values

X_train, X_val, y_train, y_val = train_test_split(X,y,
                     test_size=0.1,
                     random_state=42,
                     stratify=y)

In [93]:
# TEST SET

# create pos dataframe
df_test_pos = pd.DataFrame({"data": test_pos, "type": pos_test_type})
df_test_pos["label"] = 1

# create neg dataframe
df_test_neg = pd.DataFrame({"data": test_neg, "type": neg_test_type})
df_test_neg["label"] = 0

# combine
df_combined = pd.concat([df_test_pos, df_test_neg], ignore_index=True)

df_combined.to_pickle('df_test_final.pkl')

X_test = df_combined["data"].values
y_test = df_combined["label"].values
type_test = df_combined["type"].values

## Training and Predictions - 1. Preprocessing + ML (Logistic Regression)

In [78]:

###############################################
# Preprocessing
###############################################

def preprocess_text(text):
    """
    Very minimal preprocessing:
    - Lowercase
    - Remove whitespace newlines
    We DO NOT filter characters, because DNA may be obfuscated.
    """
    return text.lower().replace("\n", " ").replace("\t", " ")


###############################################
# K-mer Extraction
###############################################

def get_kmers(text, k=3):
    """Extract k-mers without filtering characters."""
    kmers = []
    for i in range(len(text) - k + 1):
        kmers.append(text[i:i+k])
    return kmers


###############################################
# Entropy Calculation
###############################################

def shannon_entropy(text):
    """
    Compute Shannon entropy over all characters.
    """
    if len(text) == 0:
        return 0.0

    from math import log2
    freq = {}
    for c in text:
        freq[c] = freq.get(c, 0) + 1

    entropy = 0
    for c in freq:
        p = freq[c] / len(text)
        entropy -= p * log2(p)

    return entropy


###############################################
# Markov Transition Features
###############################################

def markov_features(text, alphabet):
    """
    Build a Markov transition probability vector from:
    P(next_char | current_char)

    The output is flattened row-major into a single vector.
    """
    if len(text) < 2:
        return np.zeros(len(alphabet) * len(alphabet))

    index = {c: i for i, c in enumerate(alphabet)}

    # Transition counts
    trans = np.zeros((len(alphabet), len(alphabet)))

    for a, b in zip(text[:-1], text[1:]):
        if a in index and b in index:
            trans[index[a], index[b]] += 1

    row_sums = trans.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1  
    trans = trans / row_sums

    return trans.flatten()


###############################################
# Single-document Feature Extraction
###############################################

def extract_features_single(
    text,
    k,
    tfidf_vectorizer,
    alphabet,
    fit_tfidf=False
):
    """
    Extract TF-IDF + entropy + Markov vector.
    All vectors are guaranteed fixed length if:
    - TF-IDF was fit on the full training corpus
    - alphabet is global
    """
    pre = preprocess_text(text)
    kmers = get_kmers(pre, k)

    entropy = shannon_entropy(pre)
    markov_vec = markov_features(pre, alphabet)

    kmers_str = " ".join(kmers)

    if fit_tfidf:
        tfidf_vec = tfidf_vectorizer.fit_transform([kmers_str]).toarray()[0]
    else:
        tfidf_vec = tfidf_vectorizer.transform([kmers_str]).toarray()[0]

    return np.concatenate([tfidf_vec, [entropy], markov_vec])


###############################################
# TRAINING PIPELINE
###############################################

def train_pipeline_whole_docs(texts, labels, k=3):
    """
    Train using full documents (no windowing).
    Ensures fixed feature vector sizes for all samples.
    """
    processed_docs = [preprocess_text(t) for t in texts]

    # Build global alphabet across all training docs
    global_alphabet = sorted(list(set("".join(processed_docs))))

    # Build kmers for TF-IDF fitting
    kmers_docs = [" ".join(get_kmers(doc, k)) for doc in processed_docs]

    # Fit a single TF-IDF over ALL training documents
    tfidf_vec = TfidfVectorizer(analyzer="word")
    tfidf_vec.fit(kmers_docs)

    # Extract features
    feature_list = []
    for doc in processed_docs:
        f = extract_features_single(
            doc,
            k,
            tfidf_vec,
            global_alphabet,
            fit_tfidf=False
        )
        feature_list.append(f)

    X = np.vstack(feature_list)
    y = np.array(labels)

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train classifier
    clf = LogisticRegression(
        penalty="l2",
        C=1.0,
        solver="liblinear",
        max_iter=1000
    )
    clf.fit(X_scaled, y)

    return clf, scaler, tfidf_vec, global_alphabet


###############################################
# TESTING – WITH WINDOWING IF INPUT TOO BIG
###############################################

def classify_document(
    text,
    clf,
    scaler,
    tfidf_vec,
    alphabet,
    k=3,
    max_window_chars=20000
):
    """
    If document is short -> classify whole.
    If too long -> split into windows.
    If ANY window predicts 1 -> output 1 (DNA found).
    """
    if len(text) <= max_window_chars:
        # No windowing
        feats = extract_features_single(
            text, k, tfidf_vec, alphabet, fit_tfidf=False
        )
        feats = scaler.transform([feats])
        pred = clf.predict(feats)[0]
        return pred

    # WINDOWING
    windows = [
        text[i:i + max_window_chars]
        for i in range(0, len(text), max_window_chars)
    ]

    for w in windows:
        feats = extract_features_single(
            w, k, tfidf_vec, alphabet, fit_tfidf=False
        )
        feats = scaler.transform([feats])
        pred = clf.predict(feats)[0]
        if pred == 1:
            return 1

    return 0  # none of the windows detected DNA




In [79]:
print(len(X_train), len(X_val), len(X_test))

print("Start training...")

clf, scaler, tfidf_vec, alphabet = train_pipeline_whole_docs(X_train, y_train, k=3)

print("Training done.")

preds=[]

for test_doc in list(X_test):
    pred = classify_document(
    test_doc,
    clf,
    scaler,
    tfidf_vec,
    alphabet,
    k=3)
    preds.append(pred)

preds = np.array(preds, dtype=np.int64)
print("Predictions stored.")


8352 928 2320
Start training...
Training done.
Predictions stored.


In [127]:
# store trained model

NLP_model = {
    "clf": clf,
    "tfidf": tfidf_vec,
    "scaler": scaler,
    "alphabet": alphabet
}
joblib.dump(NLP_model, "NLP_model.pkl")

['NLP_model.pkl']

In [128]:
# to read
model = joblib.load("models/NLP_model.pkl")
clf = model["clf"]
tfidf_vec = model["tfidf"]
scaler = model["scaler"]
alphabet = model["alphabet"]

### Test performance metrics

In [80]:
# With updated dataset (final - all adversarial cases)

y_pred = preds
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

Accuracy : 0.9629310344827586
Precision: 0.9890710382513661
Recall   : 0.9362068965517242
F1 Score : 0.9619131975199291


In [91]:
for i in list(set(type_test)):
    print("Adversarial Attempt Type:", adversarial_attempt_type[i])
    y_pred_temp = y_pred[type_test==i]
    y_test_temp = y_test[type_test==i]
    # if (len(set(y_test_temp))<2):
    #     continue
    acc  = accuracy_score(y_test_temp, y_pred_temp)
    prec = precision_score(y_test_temp, y_pred_temp)
    rec  = recall_score(y_test_temp, y_pred_temp)
    f1   = f1_score(y_test_temp, y_pred_temp)
    print("Accuracy :", acc)
    print("Precision:", prec)
    print("Recall   :", rec)
    print("F1 Score :", f1)

Adversarial Attempt Type: Sequence (continuous) with English Text
Accuracy : 0.9833333333333333
Precision: 1.0
Recall   : 0.975
F1 Score : 0.9873417721518988
Adversarial Attempt Type: Sequence (continuous) with Random Text (All positive samples)
Accuracy : 0.9425
Precision: 1.0
Recall   : 0.9425
F1 Score : 0.9703989703989704
Adversarial Attempt Type: Sequence with alternate random characters, bounded by English Text
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
Adversarial Attempt Type: Sequence compressed and obfuscated with English Text
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
Adversarial Attempt Type: Sequence compressed and replaced by other characters
Accuracy : 0.84375
Precision: 1.0
Recall   : 0.7916666666666666
F1 Score : 0.8837209302325582
Adversarial Attempt Type: Sequence replaced by other characters
Accuracy : 0.9166666666666666
Precision: 1.0
Recall   : 0.875
F1 Score : 0.9333333333333333
Adversarial Attempt Type: Fake DNA sequences (All ne

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [96]:
# With updated dataset (more adversarial cases) (test_new, train_new)

y_pred = preds
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

Accuracy : 0.9651515151515152
Precision: 0.9904153354632588
Recall   : 0.9393939393939394
F1 Score : 0.9642301710730948


In [100]:
# With final dataset (test_final, train_final)
y_pred = preds
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

Accuracy : 0.9629310344827586
Precision: 0.9890710382513661
Recall   : 0.9362068965517242
F1 Score : 0.9619131975199291


## Use simple heuristics for flagging

In [95]:
from heuristics.simple_check import *

In [96]:
preds_heuristics=[]

for test_doc in list(X_test):
    pred = check(test_doc)
    preds_heuristics.append(pred)

preds_heuristics = np.array(preds_heuristics, dtype=np.int64)
print("Predictions stored.")

Predictions stored.


In [101]:
# With final dataset

y_pred = preds_heuristics
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

Accuracy : 0.8685344827586207
Precision: 0.9776536312849162
Recall   : 0.7543103448275862
F1 Score : 0.851581508515815


In [102]:
for i in list(set(type_test)):
    print("Adversarial Attempt Type:", adversarial_attempt_type[i])
    y_pred_temp = y_pred[type_test==i]
    y_test_temp = y_test[type_test==i]
    # if (len(set(y_test_temp))<2):
    #     continue
    acc  = accuracy_score(y_test_temp, y_pred_temp)
    prec = precision_score(y_test_temp, y_pred_temp)
    rec  = recall_score(y_test_temp, y_pred_temp)
    f1   = f1_score(y_test_temp, y_pred_temp)
    print("Accuracy :", acc)
    print("Precision:", prec)
    print("Recall   :", rec)
    print("F1 Score :", f1)

Adversarial Attempt Type: Sequence (continuous) with English Text
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
Adversarial Attempt Type: Sequence (continuous) with Random Text (All positive samples)
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
Adversarial Attempt Type: Sequence with alternate random characters, bounded by English Text
Accuracy : 0.6666666666666666
Precision: 1.0
Recall   : 0.5
F1 Score : 0.6666666666666666
Adversarial Attempt Type: Sequence compressed and obfuscated with English Text
Accuracy : 0.50625
Precision: 1.0
Recall   : 0.3416666666666667
F1 Score : 0.5093167701863354
Adversarial Attempt Type: Sequence compressed and replaced by other characters
Accuracy : 0.5125
Precision: 1.0
Recall   : 0.35
F1 Score : 0.5185185185185185
Adversarial Attempt Type: Sequence replaced by other characters
Accuracy : 0.675
Precision: 1.0
Recall   : 0.5125
F1 Score : 0.6776859504132231
Adversarial Attempt Type: Fake DNA sequences (All negative samples)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [104]:
adv = pd.read_pickle('final_test_adversarial.pkl')

In [120]:
def preprocess_text(text):
    """
    Very minimal preprocessing:
    - Lowercase
    - Remove whitespace newlines
    We DO NOT filter characters, because DNA may be obfuscated.
    """
    return text.lower().replace("\n", " ").replace("\t", " ")

sequences = list(adv['Adversarial sequence'].values)
sequences = [preprocess_text(s) for s in sequences]

In [121]:
types = list(set(adv["type"].values))

In [123]:
for t in types:
    print(t)
    df = adv[adv['type']==t]
    seq = list(df['Adversarial sequence'].values)
    seq = [preprocess_text(s) for s in seq]

    preds=[]

    for test_doc in seq:
        pred = classify_document(
        test_doc,
        clf,
        scaler,
        tfidf_vec,
        alphabet,
        k=3)
        preds.append(pred)

    preds = np.array(preds, dtype=np.int64)
    print("Accuracy:",(preds.sum()/len(preds)))

Columnar Split
Accuracy: 0.0
Unicode Homoglyph Substitution
Accuracy: 0.0
Log/JSON Format
Accuracy: 1.0
Zero width character Injection
Accuracy: 0.0
Compressed Sequence
Accuracy: 1.0
Alternate Random Character
Accuracy: 0.2
Linguistic Camouflage
Accuracy: 0.1
Base64 Encoding
Accuracy: 0.1
Multicharacter Mapping
Accuracy: 0.7
Reverse Complement
Accuracy: 1.0


In [124]:
for t in types:
    print(t)
    df = adv[adv['type']==t]
    seq = list(df['Adversarial sequence'].values)
    seq = [preprocess_text(s) for s in seq]

    preds=[]

    for test_doc in seq:
        pred = check(test_doc)
        preds.append(pred)

    preds = np.array(preds, dtype=np.int64)
    print("Accuracy:",(preds.sum()/len(preds)))

Columnar Split
Accuracy: 1.0
Unicode Homoglyph Substitution
Accuracy: 0.0
Log/JSON Format
Accuracy: 0.0
Zero width character Injection
Accuracy: 0.0
Compressed Sequence
Accuracy: 0.0
Alternate Random Character
Accuracy: 0.4
Linguistic Camouflage
Accuracy: 0.1
Base64 Encoding
Accuracy: 0.0
Multicharacter Mapping
Accuracy: 0.0
Reverse Complement
Accuracy: 1.0


In [None]:
for i in list(set(type_test)):
    print("Adversarial Attempt Type:", adversarial_attempt_type[i])
    y_pred_temp = y_pred[type_test==i]
    y_test_temp = y_test[type_test==i]

In [107]:
list_adv = list(adv['Adversarial sequence'].values)

In [108]:
preds=[]

for test_doc in list_adv:
    pred = classify_document(
    test_doc,
    clf,
    scaler,
    tfidf_vec,
    alphabet,
    k=3)
    preds.append(pred)

preds = np.array(preds, dtype=np.int64)
print("Predictions stored.")


Predictions stored.


In [110]:
y_test = [1]*len(preds)
y_pred = preds
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)

print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

Accuracy : 0.41
Precision: 1.0
Recall   : 0.41
F1 Score : 0.5815602836879432


## Training and Predictions - 2. 1-dimensional CNN

Trained in the following Google Colab Notebook: https://colab.research.google.com/drive/1k7wWlBhRVAuD_biPt6sI9bulKVAwaKhm?usp=sharing

In [None]:
df_train = pd.read_pickle('df_train_final.pkl')
df_test = pd.read_pickle('df_test_final.pkl')

X = df_train["data"].values
y = df_train["label"].values

X_train, X_val, y_train, y_val = train_test_split(X,y,
                     test_size=0.1,
                     random_state=42,
                     stratify=y)

X_test = df_test["data"].values
y_test = df_test["label"].values
type_test = df_test["type"].values

#######################################
#Preprocessing
#######################################
def unicode_dna_preprocessor(text: str) -> str:
    """
    Normalizes text by stripping invisible characters and transliterating 
    Unicode homoglyphs to ASCII to ensure clean genomic data.
    """
    # Standard Normalize (Fixes wide text 'Ａ' -> 'A')
    text = unicodedata.normalize('NFKC', text)
    
    # Remove Invisible Characters (Category Cf = Format, Cc = Control)
    text = "".join(ch for ch in text if unicodedata.category(ch) not in ["Cf", "Cc"])
    
    # Maps unicode to nearest ASCII
    text = unidecode(text)
    text = text.upper()

    return text


def preprocess_text(text):
    """
    Very minimal preprocessing:
    - Lowercase
    - Remove whitespace newlines
    We DO NOT filter characters, because DNA may be obfuscated.
    - Unicode detection processing
    """
    text = text.lower().replace("\n", " ").replace("\t", " ")
    text = unicode_dna_preprocessor(text)
    return text

#######################################
# Shannon entropy
#######################################
def shannon_entropy(text):
    if len(text) == 0:
        return 0.0
    from math import log2
    freq = {}
    for c in text:
        freq[c] = freq.get(c, 0) + 1
    entropy = 0
    for c in freq:
        p = freq[c] / len(text)
        entropy -= p * log2(p)
    return entropy

#######################################
# Character Encoder (no LabelEncoder)
#######################################
class CharEncoder:
    """
    Maps characters to integer IDs.
    0 is reserved for padding.
    Unknown characters map to 0 at test time.
    """
    def __init__(self):
        self.char2id = {}
        self.id2char = {}
        self.fitted = False
        self.vocab_size = 0

    def fit(self, texts):
        chars = sorted(list(set("".join(texts))))
        # index 0 = padding
        self.char2id = {c:i+1 for i,c in enumerate(chars)}
        self.id2char = {i+1:c for i,c in enumerate(chars)}
        self.vocab_size = len(self.char2id) + 1  # +1 for padding
        self.fitted = True

    def transform(self, text):
        if not self.fitted:
            raise ValueError("CharEncoder not fitted.")
        # unknown characters mapped to 0
        return np.array([self.char2id.get(c, 0) for c in text], dtype=np.int64)

    def fit_transform(self, texts):
        self.fit(texts)
        return [self.transform(t) for t in texts]

#######################################
# CNN model
#######################################
class DNASequenceCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=32, num_classes=2,
                 kernel_sizes=[3,5,7], num_filters=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim,
                      out_channels=num_filters,
                      kernel_size=k)
            for k in kernel_sizes
        ])

        self.fc_entropy = nn.Linear(1, 16)
        self.fc = nn.Linear(len(kernel_sizes)*num_filters + 16, num_classes)

    def forward(self, x, entropy):
        """
        x: [batch, seq_len] integer-encoded text
        entropy: [batch, 1] global numeric feature
        """
        emb = self.embedding(x)           # [B, L, E]
        emb = emb.transpose(1,2)          # [B, E, L]

        conv_outs = [F.relu(conv(emb)) for conv in self.convs]
        pooled = [F.adaptive_max_pool1d(c,1).squeeze(-1) for c in conv_outs]

        cnn_feat = torch.cat(pooled, dim=1)
        entropy_feat = F.relu(self.fc_entropy(entropy))
        feat = torch.cat([cnn_feat, entropy_feat], dim=1)

        return self.fc(feat)



import torch
import torch.nn.functional as F

import torch
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------------
# TRAINING
# -------------------------------

# -------------------------------
# PREPROCESSING
# -------------------------------
texts_proc = [preprocess_text(t) for t in X_train]

# Encode characters
char_encoder = CharEncoder()
sequences = char_encoder.fit_transform(texts_proc)

# Pad sequences
max_len = max(len(seq) for seq in sequences)
sequences_pad = [np.pad(seq, (0, max_len - len(seq)), constant_values=0)
                 for seq in sequences]

# Compute entropy features
entropies_list = [[shannon_entropy(t)] for t in texts_proc]

# Labels
labels_list = y_train

# -------------------------------
# DATASET & DATALOADER
# -------------------------------
class DNADataset(Dataset):
    def __init__(self, sequences, entropies, labels):
        self.sequences = sequences
        self.entropies = entropies
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.sequences[idx], dtype=torch.long),
            torch.tensor(self.entropies[idx], dtype=torch.float),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

dataset = DNADataset(sequences_pad, entropies_list, labels_list)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# -------------------------------
# MODEL, LOSS, OPTIMIZER
# -------------------------------
model = DNASequenceCNN(vocab_size=char_encoder.vocab_size,
                       embed_dim=32,
                       num_classes=2).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Mixed precision scaler
scaler = torch.cuda.amp.GradScaler()

# -------------------------------
# TRAINING LOOP WITH AMP
# -------------------------------
for epoch in range(30):
    model.train()
    running_loss = 0.0

    for X_batch, entropy_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        entropy_batch = entropy_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Mixed precision context
        with torch.cuda.amp.autocast():
            logits = model(X_batch, entropy_batch)
            loss = criterion(logits, y_batch)

        # Scaled backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * X_batch.size(0)

    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/10, Loss: {epoch_loss:.4f}")

print("Training complete.")


# -------------------------------
# Prediction
# -------------------------------
preds_cnn = []
probs_cnn = []

for test_doc in list(X_test):
    with torch.no_grad():
        test_doc_proc = preprocess_text(test_doc)

        # Transform + pad
        test_seq = char_encoder.transform(test_doc_proc)
        test_seq = np.pad(test_seq, (0, max(0, max_len - len(test_seq))),
                          constant_values=0)
        test_seq = torch.tensor([test_seq], dtype=torch.long).to(device)

        test_entropy = torch.tensor([[shannon_entropy(test_doc_proc)]],
                                    dtype=torch.float).to(device)

        pred = model(test_seq, test_entropy)
        probs = F.softmax(pred, dim=1)

    preds_cnn.append(torch.argmax(pred, dim=1).item())
    probs_cnn.append(probs.cpu())  # move back to CPU if needed


model

# save trained model 
torch.save(model.state_dict(), "CNN_model.pth")



tp = 0
tn = 0
fn = 0
fp = 0
for i in range(len(preds_cnn)):
  if(preds_cnn[i] == 1 and y_test[i] == 1):
    tp +=1
  elif(preds_cnn[i] == 1 and y_test[i] == 0):
    fp +=1
  elif(preds_cnn[i] == 0 and y_test[i] == 1):
    fn +=1
  elif(preds_cnn[i] == 0 and y_test[i] == 0):
    tn +=1

(tp )/(tp + fn )

# this is with level 2 -> df_train_new, df_test_new

preds_np = preds_cnn
labels_np = y_test

# Compute metrics
accuracy = accuracy_score(labels_np, preds_np)
precision = precision_score(labels_np, preds_np)
recall = recall_score(labels_np, preds_np)
f1 = f1_score(labels_np, preds_np)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# this is with final dataframe

preds_np = preds_cnn
labels_np = y_test

# Compute metrics
accuracy = accuracy_score(labels_np, preds_np)
precision = precision_score(labels_np, preds_np)
recall = recall_score(labels_np, preds_np)
f1 = f1_score(labels_np, preds_np)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

adversarial_attempt_type = {1:"Sequence (continuous) with English Text",
                            2:"Sequence (continuous) with Random Text (All positive samples)",
                            3:"Sequence with alternate random characters, bounded by English Text",
                            4:"Sequence compressed and obfuscated with English Text",
                            5:"Sequence compressed and replaced by other characters",
                            6:"Sequence replaced by other characters",
                            7:"Fake DNA sequences (All negative samples)",
                            8:"Sequence replaced with multi-character mapping(length:1-5) and bounded with English Text",
                            9:"Sequence broken with random spaces, bounded by English Text",
                            10:"Sequence broken by random strings of text (variable length)",
                            11:"Randomly generated characters (All negative samples)"}

y_pred = preds_cnn
y_pred = np.array(y_pred)
for i in list(set(type_test)):
    print("Adversarial Attempt Type:", adversarial_attempt_type[i])
    y_pred_temp = y_pred[type_test==i]
    y_test_temp = y_test[type_test==i]
    # if (len(set(y_test_temp))<2):
    #     continue
    acc  = accuracy_score(y_test_temp, y_pred_temp)
    prec = precision_score(y_test_temp, y_pred_temp)
    rec  = recall_score(y_test_temp, y_pred_temp)
    f1   = f1_score(y_test_temp, y_pred_temp)
    print("Accuracy :", acc)
    print("Precision:", prec)
    print("Recall   :", rec)
    print("F1 Score :", f1)



def preprocess_text_for_test(X_test, y_test):
  texts_proc_test = [preprocess_text(t) for t in X_test]
  sequences_test = [char_encoder.transform(t) for t in texts_proc_test]
  sequences_pad_test = [np.pad(seq, (0, max_len - len(seq)), constant_values=0) for seq in sequences_test]
  entropies_list_test = [[shannon_entropy(t)] for t in texts_proc_test]
  labels_list_test = y_test

  return DNADataset(sequences_pad_test, entropies_list_test, labels_list_test)