In [25]:
import os
# Set before pandas/numpy are imported below — presumably so the OpenBLAS thread
# cap is already in the environment when the library is first loaded (TODO confirm).
os.environ['OPENBLAS_NUM_THREADS'] = '4'
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# Load the training fingerprints and the fingerprints of the test run
df_train = pd.read_csv('../data/FM179-FM181_fingerprints.csv')
df_test = pd.read_csv('../data/FM165/no_sample_id/20241120_1928_MN43023_FAY46018_68f388f5/fastq_pass/fingerprints.csv')

# Barcode number (1-5) -> RT name used for that barcode in the test run
df_test_bp_to_rt = dict(enumerate(
    ["INDURO", "ProtoScript", "Marathon", "GoScript", "EpiScript"],
    start=1,
))

# Barcode numbers missing from the mapping become NaN in the 'RT' column
df_test = df_test.assign(RT=df_test['barcode_num'].map(df_test_bp_to_rt))

In [51]:
# Keep only the isoform 'E' rows and tag them with the experiment name and a
# placeholder cap label. `.assign` returns a new frame, so no column is written
# into a boolean-filtered slice (the pattern that triggers pandas'
# SettingWithCopyWarning), and re-running this cell yields the same frame.
df_test = df_test[df_test['isoform'] == 'E'].assign(experiment='FM165', cap='Unknown')
df_test

Unnamed: 0,barcode,isoform,num_reads,num_A,num_C,num_G,num_T,num_DEL,num_INS,barcode_num,...,T%_INSDEL,INS%_INSDEL,DEL%_INSDEL,A%,C%,G%,T%,RT,cap,experiment
5,barcode01,E,16622,5745,1170,2887,847,5145,828,1,...,0.050957,0.049814,0.30953,0.539487,0.109869,0.271105,0.079538,INDURO,Unknown,FM165
25,barcode02,E,1706,707,132,110,107,545,105,2,...,0.06272,0.061547,0.319461,0.669508,0.125,0.104167,0.101326,ProtoScript,Unknown,FM165
54,barcode03,E,143171,106860,4696,10749,6417,6549,7900,3,...,0.044821,0.055179,0.045743,0.830161,0.036482,0.083506,0.049852,Marathon,Unknown,FM165
81,barcode04,E,40523,13149,2081,2968,2416,16660,3249,4,...,0.05962,0.080177,0.411125,0.637867,0.100951,0.14398,0.117202,GoScript,Unknown,FM165
112,barcode05,E,155480,95708,5202,10119,6696,16758,20997,5,...,0.043067,0.135046,0.107782,0.812979,0.044188,0.085955,0.056878,EpiScript,Unknown,FM165


In [54]:
# Distinct RT names and cap labels, in order of first appearance in the training set
all_rt_names = pd.unique(df_train['RT'])
all_caps = pd.unique(df_train['cap'])

# Function to create feature vector for each sample
def create_features(df, rt_names=None, base_cols=('A%', 'C%', 'G%', 'T%')):
    """Build one feature vector per (cap, experiment) group of ``df``.

    Each vector consists of consecutive blocks of ``len(base_cols)`` values, one
    block per entry of ``rt_names`` (in that order). A block holds the
    ``base_cols`` values of the group's row for that RT and stays zero when the
    group has no row for it. If a group contains several rows with the same RT,
    the last one wins.

    The block width is taken from ``base_cols``. It was previously derived from
    the number of distinct cap labels, which only matched the four base columns
    by coincidence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'cap', 'experiment', 'RT' and every column named in
        ``base_cols``.
    rt_names : sequence, optional
        RT names defining the block order. Defaults to the module-level
        ``all_rt_names``.
    base_cols : sequence of str, optional
        Columns copied into each RT block.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``features`` with shape ``(n_groups, len(rt_names) * len(base_cols))``
        and ``labels`` holding the cap label of each group, in the same order.
        With no groups, ``features`` has shape ``(0, n_features)``.

    Raises
    ------
    IndexError
        If a row's 'RT' value does not occur in ``rt_names``.
    """
    if rt_names is None:
        rt_names = all_rt_names
    rt_names = list(rt_names)
    base_cols = list(base_cols)
    block_width = len(base_cols)
    n_features = len(rt_names) * block_width

    # RT name -> index of its block; the first occurrence wins if a name repeats
    rt_position = {}
    for position, name in enumerate(rt_names):
        rt_position.setdefault(name, position)

    features = []
    labels = []

    # Group by cap type and experiment
    for (cap_type, _experiment), group in df.groupby(['cap', 'experiment']):
        # Initialize feature vector with zeros
        feature_vec = np.zeros(n_features)

        # For each RT in this group, copy its base-column values into its block
        for _, row in group.iterrows():
            rt_idx = rt_position.get(row['RT'])
            if rt_idx is None:
                # Same exception type as the former bare `[0][0]` lookup, but
                # with a message that names the offending value.
                raise IndexError(
                    f"RT {row['RT']!r} is not one of the known RT names {rt_names!r}"
                )
            start = rt_idx * block_width
            feature_vec[start:start + block_width] = row[base_cols].to_numpy(dtype=float)

        features.append(feature_vec)
        labels.append(cap_type)

    if not features:
        # Keep the feature matrix two-dimensional even when there are no groups
        return np.empty((0, n_features)), np.array(labels)
    return np.array(features), np.array(labels)

# Build the feature matrix and cap labels for the training frame, then the test frame
(X_train, y_train), (X_test, y_test) = (
    create_features(frame) for frame in (df_train, df_test)
)

# Show the shape of both feature matrices
print('X_train.shape', X_train.shape)
print('X_test.shape', X_test.shape)

# Predict using k-NN (k = 8 by default) with cosine similarity, comparing only the features that are non-zero in the test sample
def predict(X_test_sample, X_train, y_train, n_neighbors=8):
    """Return the nearest training samples for one test feature vector.

    Only the positions where ``X_test_sample`` is non-zero are compared: the same
    columns are selected from ``X_train`` and from the test vector before the
    neighbour search, so features absent from the test sample do not
    contribute to the cosine distance.

    Parameters
    ----------
    X_test_sample : array-like, shape (n_features,)
        Feature vector of a single test sample.
    X_train : array-like, shape (n_train, n_features)
        Training feature matrix.
    y_train : array-like, shape (n_train,)
        Label of each training row.
    n_neighbors : int, optional
        Number of neighbours to return. It is capped at the number of training
        rows so a small training set does not make the query fail.

    Returns
    -------
    list of (label, similarity)
        One pair per neighbour, nearest first, where ``similarity`` is
        ``1 - cosine distance``.

    Raises
    ------
    ValueError
        If ``X_test_sample`` has no non-zero entry (nothing to compare on).
    """
    X_test_sample = np.asarray(X_test_sample)
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    mask = X_test_sample != 0
    if not mask.any():
        raise ValueError("X_test_sample has no non-zero features to compare on")

    # Column selection alone restricts the comparison; no need to copy and
    # zero out the unselected columns first.
    k = min(n_neighbors, len(X_train))
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn.fit(X_train[:, mask], y_train)

    # Get distances and indices of the k nearest neighbors
    distances, indices = knn.kneighbors(X_test_sample[mask].reshape(1, -1))

    # Convert distances to similarities (1 - distance)
    similarities = 1 - distances[0]

    # Get the corresponding labels
    neighbor_labels = y_train[indices[0]]

    return list(zip(neighbor_labels, similarities))

# Make predictions: one list of (neighbor label, similarity) pairs per test feature vector
test_predictions = [predict(x, X_train, y_train) for x in X_test]

# Print predictions with similarities
# ({pred:8} pads each label to at least 8 characters; longer labels are not truncated)
print("\nTest predictions with similarities:")
for true, preds in zip(y_test, test_predictions):
    # `true` is the cap label create_features returned for this test group
    print(f"\nTrue cap: {true}")
    for pred, sim in preds:
        print(f"Predicted: {pred:8} Similarity: {sim:.3f}")

X_train.shape (8, 20)
X_test.shape (1, 20)

Test predictions with similarities:

True cap: Unknown
Predicted: NAD-U1   Similarity: 0.974
Predicted: NAD-U1   Similarity: 0.968
Predicted: TMG-U1   Similarity: 0.962
Predicted: TMG-U1   Similarity: 0.959
Predicted: m⁷Gp₃A-U1 Similarity: 0.919
Predicted: m⁷Gp₃A-U1 Similarity: 0.838
Predicted: Ap₄A-U1  Similarity: 0.791
Predicted: Ap₄A-U1  Similarity: 0.725
