### Task 1
#### Map the IDs listed in random_int_1.txt to their actual sequences

In [12]:
import pandas as pd

def parse_fasta(filename, strip_char=None):
    """Parse a FASTA file into a ``{record_id: sequence}`` dictionary.

    The record ID is the first whitespace-delimited token after ``>`` on a
    header line. All following non-header lines (each stripped of surrounding
    whitespace) are concatenated to form the sequence. Lines that appear
    before the first header are ignored. If an ID occurs more than once, the
    last record wins.

    Args:
        filename: Path of the FASTA file to read.
        strip_char: Optional string of characters to remove from the END of
            every sequence (passed to ``str.rstrip``). Falsy means "leave the
            sequence untouched".

    Returns:
        dict mapping each record ID to its full sequence string.

    Raises:
        ValueError: If a header line contains no ID (a bare ``>``).
        OSError: If the file cannot be opened.
    """
    sequences = {}
    current_id = None
    current_seq = []

    def _flush():
        """Store the record collected so far (no-op before the first header)."""
        if current_id is None:
            return
        full_seq = ''.join(current_seq)
        if strip_char:
            full_seq = full_seq.rstrip(strip_char)
        sequences[current_id] = full_seq

    with open(filename, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record.
                _flush()
                header_fields = line[1:].split()
                if not header_fields:
                    # Previously this surfaced as an opaque IndexError.
                    raise ValueError(
                        f"{filename}: FASTA header without an ID at line {line_no}"
                    )
                current_id = header_fields[0]
                current_seq = []
            else:
                current_seq.append(line)

    # The last record has no following header to trigger a flush.
    _flush()

    return sequences

# Load the pairs file: two tab-separated columns without a header row
# (dengue ID, human ID), both read as pandas strings.
try:
    pairs_df = pd.read_csv('random_int_1.txt', sep='\t', header=None,
                           names=['dengue_id', 'human_id'], dtype='string')
    print("Successfully loaded pairs:")
    print(pairs_df.head())
except Exception as e:
    print(f"Error loading pairs file: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception stops this cell (and "Run All") with a
    # traceback and keeps the session usable.
    raise

# Parse sequence files into {id: sequence} dictionaries
print("\nParsing dengue sequences...")
dengue_sequences = parse_fasta('dengue_ni_sequences.txt')
print(f"Found {len(dengue_sequences)} dengue sequences")

print("\nParsing human sequences...")
# Trailing '*' characters are removed from the human sequences only.
human_sequences = parse_fasta('identical_sequences.txt', strip_char='*')
print(f"Found {len(human_sequences)} human sequences")

# Match pairs with sequences
matched_data = []
missing_dengue = 0
missing_human = 0

for dengue_id, human_id in zip(pairs_df['dengue_id'], pairs_df['human_id']):
    # Check both sequences exist
    has_dengue = dengue_id in dengue_sequences
    has_human = human_id in human_sequences

    # The two counters are independent: a pair lacking BOTH sequences
    # increments both of them.
    if not has_dengue:
        missing_dengue += 1
    if not has_human:
        missing_human += 1

    if has_dengue and has_human:
        matched_data.append({
            'dengue_id': dengue_id,
            'human_id': human_id,
            'dengue_sequence': dengue_sequences[dengue_id],
            'human_sequence': human_sequences[human_id]
        })

# Create DataFrame and save (nothing is written when no pair matched)
if matched_data:
    output_df = pd.DataFrame(matched_data)
    output_df.to_csv('task1_sequences.csv', index=False)
    print(f"\nSuccessfully saved {len(output_df)} matched pairs")
    print("Sample output:")
    print(output_df.head())
else:
    print("\nNo matching pairs found!")

# Print summary stats
total_pairs = len(pairs_df)
# Count unmatched PAIRS directly. Summing the two "missing" counters would
# count a pair twice when both of its sequences are absent.
unmatched_pairs = total_pairs - len(matched_data)
# Guard against an empty pairs file (avoids ZeroDivisionError).
unmatched_rate = unmatched_pairs / total_pairs * 100 if total_pairs else 0.0

print("\nProcessing Summary:")
print(f"Total input pairs: {total_pairs}")
print(f"Matched pairs: {len(matched_data)}")
print(f"Pairs missing dengue: {missing_dengue}")
print(f"Pairs missing human: {missing_human}")
print(f"Unmatched rate: {unmatched_rate:.1f}%")


Successfully loaded pairs:
                   dengue_id human_id
0  YP_001531165.2_capsid_DV3    HOXB7
1  YP_001531165.2_capsid_DV3   SPACA3
2  YP_001531165.2_capsid_DV3   RNF168
3  YP_001531165.2_capsid_DV3     SCO2
4  YP_001531165.2_capsid_DV3  HMGCLL1

Parsing dengue sequences...
Found 60 dengue sequences

Parsing human sequences...
Found 17806 human sequences

Successfully saved 1980 matched pairs
Sample output:
                   dengue_id human_id  \
0  YP_001531165.2_capsid_DV3    HOXB7   
1  YP_001531165.2_capsid_DV3   SPACA3   
2  YP_001531165.2_capsid_DV3   RNF168   
3  YP_001531165.2_capsid_DV3     SCO2   
4  YP_001531165.2_capsid_DV3  HMGCLL1   

                                     dengue_sequence  \
0  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
1  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
2  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
3  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
4  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL... 

In [13]:
# Read back the CSV written by the previous cell and print its first rows
# as a quick check of what was saved.
df = pd.read_csv('task1_sequences.csv')
print(df.head())

                   dengue_id human_id  \
0  YP_001531165.2_capsid_DV3    HOXB7   
1  YP_001531165.2_capsid_DV3   SPACA3   
2  YP_001531165.2_capsid_DV3   RNF168   
3  YP_001531165.2_capsid_DV3     SCO2   
4  YP_001531165.2_capsid_DV3  HMGCLL1   

                                     dengue_sequence  \
0  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
1  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
2  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
3  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   
4  MNNQRKKTGKPSINMLKRVRNRVSTGSQLAKRFSKGLLNGQGPMKL...   

                                      human_sequence  
0  MSSLYYANTLFSKYPASSSVFATGAFPEQTSCAFASNPQRPGYGAG...  
1  MVSALRGAPLIRVCLAYFTSGFNAAALDYEADGSTNNGIFQINSRR...  
2  MALPKDAIPSLSECQCGICMEILVEPVTLPCNHTLCKPCFQSTVEK...  
3  MLLLTRSPTAWHRLSQLKPRVLPGTLGGQALHLRSWLLSRQGPAET...  
4  MGNVPSAVKHCLSYQQLLREHLWIGDSVAGALDPAQETSQLSGLPE...  


### Task 2
#### Take the Task 1 output file, select the top 100 k-mers of each sequence, and map them to their corresponding k-mer embeddings

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import defaultdict

# Configuration
KMER_LENGTH = 4  # Intended k-mer length; equals the k=4 default of get_top_kmer_features
TOP_KMERS = 100  # Rows per sequence in the model input (second axis of the reshape in process_sequences)
EMBEDDING_DIM = 128  # Dimension from k-mer embedding file
LATENT_DIM = 128     # Final embedding dimension

def load_kmer_embeddings(filename, embedding_dim=None):
    """Load k-mer embeddings from a tab-separated text file.

    Each useful line has the form ``<kmer>\\t<v1>\\t...\\t<vN>`` with exactly
    ``embedding_dim`` float values. Blank lines are ignored. Lines with the
    wrong number of fields or with non-numeric values are skipped, and the
    number of skipped lines is reported so that a damaged file does not go
    unnoticed.

    Args:
        filename: Path of the embedding file.
        embedding_dim: Expected number of values per k-mer. Defaults to the
            module-level ``EMBEDDING_DIM``.

    Returns:
        dict mapping each k-mer string to a list of ``embedding_dim`` floats.
        If a k-mer occurs on several valid lines, the last one wins.
    """
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM

    kmer_emb = {}
    skipped = 0
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            parts = stripped.split('\t')
            if len(parts) != embedding_dim + 1:  # 1 k-mer + embedding values
                skipped += 1
                continue
            try:
                emb = [float(value) for value in parts[1:]]
            except ValueError:
                # e.g. a header row that happens to have the right field count
                skipped += 1
                continue
            kmer_emb[parts[0]] = emb

    print(f"Loaded {len(kmer_emb)} k-mers with {embedding_dim}D embeddings")
    if skipped:
        print(f"Warning: skipped {skipped} malformed line(s) in {filename}")
    return kmer_emb

def generate_kmers(sequence, k):
    """Return every length-``k`` window of ``sequence``, in order of position.

    Windows are taken at each start offset from 0 to ``len(sequence) - k``;
    a window is kept only when its length is exactly ``k``. A sequence
    shorter than ``k`` therefore yields an empty list.
    """
    kmers = []
    last_start = len(sequence) - k
    for start in range(last_start + 1):
        window = sequence[start:start + k]
        if len(window) == k:
            kmers.append(window)
    return kmers

def get_top_kmer_features(sequence, kmer_emb, k=4, top_n=100):
    """Build a ``(top_n, EMBEDDING_DIM)`` matrix from a sequence's most frequent k-mers.

    Steps:
      1. Count every k-mer produced by ``generate_kmers(sequence, k)``.
      2. Rank the distinct k-mers by descending count, ties broken
         alphabetically, and keep the first ``2 * top_n`` as candidates.
      3. From those candidates keep, in rank order, the first ``top_n`` that
         have an entry in ``kmer_emb``.
      4. Stack their embeddings and append all-zero rows until there are
         ``top_n`` rows.

    NOTE(review): because of the ``2 * top_n`` candidate cap, k-mers ranked
    below it are never considered even when higher-ranked ones have no
    embedding; the matrix is zero-padded instead. Confirm this is intended.

    Returns:
        numpy array built from the selected embeddings plus zero padding.
    """
    counts = {}
    for kmer in generate_kmers(sequence, k):
        counts[kmer] = counts.get(kmer, 0) + 1

    # Most frequent first; alphabetical order makes the ranking deterministic.
    ranked = sorted(counts, key=lambda kmer: (-counts[kmer], kmer))
    candidates = ranked[:top_n * 2]

    # Distinct by construction (dictionary keys), so no duplicate check needed.
    chosen = [kmer for kmer in candidates if kmer in kmer_emb][:top_n]

    rows = [kmer_emb[kmer] for kmer in chosen]
    rows.extend([0.0] * EMBEDDING_DIM for _ in range(top_n - len(rows)))

    return np.array(rows)

def process_sequences(df, model, kmer_emb, id_col, sequence_col):
    """Embed every sequence in ``df`` with ``model`` and return one row per input row.

    For each row, every ``*`` is removed from the sequence, the sequence is
    turned into a ``(TOP_KMERS, EMBEDDING_DIM)`` k-mer feature matrix, the
    matrix is passed through ``model.predict`` as a batch of one, and the
    LAST timestep of the model output is kept as the embedding.

    Rows whose cleaned sequences are identical reuse the embedding computed
    the first time, so the model runs once per distinct sequence instead of
    once per row. The returned frame is unchanged by this: it still has one
    row per input row, in input order.

    NOTE(review): the notebook output reports the loaded models' output shape
    as (None, 100, 128), the same as their input shape, so the vector taken
    here is the last timestep of the model's output sequence. Confirm that
    this is the representation the downstream classifier expects.

    Args:
        df: DataFrame containing ``id_col`` and ``sequence_col``.
        model: Object with a Keras-style ``predict(x, verbose=0)`` method.
        kmer_emb: Mapping of k-mer -> embedding, as used by
            ``get_top_kmer_features``.
        id_col: Name of the ID column; copied into the result.
        sequence_col: Name of the column holding sequence strings.

    Returns:
        DataFrame with ``id_col`` followed by ``emb_1`` ... ``emb_N`` columns.
    """
    results = []
    embedding_cache = {}  # cleaned sequence -> embedding (list of floats)

    for seq_id, raw_sequence in zip(df[id_col], df[sequence_col]):
        sequence = raw_sequence.replace('*', '')

        latent_embedding = embedding_cache.get(sequence)
        if latent_embedding is None:
            # Use the configured constants so that the feature matrix always
            # agrees with the reshape below.
            kmer_features = get_top_kmer_features(
                sequence, kmer_emb, k=KMER_LENGTH, top_n=TOP_KMERS
            )

            # Reshape for model input: (batch_size, timesteps, features)
            input_data = kmer_features.reshape(1, TOP_KMERS, EMBEDDING_DIM)

            embedding = model.predict(input_data, verbose=0)

            # Keep the last timestep of the single batch element.
            latent_embedding = embedding[0, -1, :].tolist()
            embedding_cache[sequence] = latent_embedding

        # One column per embedding dimension, numbered from 1.
        emb_dict = {f"emb_{i+1}": val for i, val in enumerate(latent_embedding)}
        results.append({id_col: seq_id, **emb_dict})

    return pd.DataFrame(results)

# Load resources
print("Loading k-mer embeddings...")
kmer_embeddings = load_kmer_embeddings('kmers_embedding_human_dengue.txt')

# Two separately saved Keras models: one applied to the human sequences and
# one applied to the dengue sequences below.
print("Loading models...")
human_ae = tf.keras.models.load_model('drop_Human_bilstm_autoencoder_model.keras')
dengue_ae = tf.keras.models.load_model('drop_Dengue_bilstm_autoencoder_model.keras')

# Verify model output shapes
# (printed only; nothing is asserted, so a mismatch with TOP_KMERS /
# EMBEDDING_DIM would first surface inside process_sequences)
print("\nModel Architecture Verification:")
print("Human model input shape:", human_ae.input_shape)
print("Human model output shape:", human_ae.output_shape)
print("Dengue model input shape:", dengue_ae.input_shape)
print("Dengue model output shape:", dengue_ae.output_shape)

# Load sequences
task1_df = pd.read_csv('task1_sequences.csv')

# Process human sequences
# process_sequences returns one row per row of task1_df, so an ID that appears
# in several pairs is written to the CSV several times.
print("\nProcessing human sequences...")
human_emb_df = process_sequences(task1_df, human_ae, kmer_embeddings, 
                                'human_id', 'human_sequence')
human_emb_df.to_csv('human_embeddings_128d.csv', index=False)
print(f"Saved {len(human_emb_df)} human embeddings")

# Process dengue sequences
# Same frame, but using the dengue columns and the dengue model.
print("\nProcessing dengue sequences...")
dengue_emb_df = process_sequences(task1_df, dengue_ae, kmer_embeddings,
                                 'dengue_id', 'dengue_sequence')
dengue_emb_df.to_csv('dengue_embeddings_128d.csv', index=False)
print(f"Saved {len(dengue_emb_df)} dengue embeddings")


Loading k-mer embeddings...
Loaded 157689 k-mers with 128D embeddings
Loading models...

Model Architecture Verification:
Human model input shape: (None, 100, 128)
Human model output shape: (None, 100, 128)
Dengue model input shape: (None, 100, 128)
Dengue model output shape: (None, 100, 128)

Processing human sequences...
Saved 1980 human embeddings

Processing dengue sequences...
Saved 1980 dengue embeddings


In [31]:
# Read back the saved human embedding CSV and print its first rows.
df = pd.read_csv('human_embeddings_128d.csv')
print(df.head())

  human_id     emb_1     emb_2     emb_3     emb_4     emb_5     emb_6  \
0    HOXB7  0.000000  0.111181  0.000888  0.000000  0.649469  0.150634   
1   SPACA3  0.000000  0.148559  0.000000  0.000000  0.132296  0.000000   
2   RNF168  0.098054  0.000000  0.000000  0.730736  0.286459  0.000000   
3     SCO2  0.495526  0.000000  0.000000  0.149431  0.699470  0.000000   
4  HMGCLL1  0.000000  0.000000  0.149232  0.000000  0.000000  0.244725   

      emb_7     emb_8  emb_9  ...   emb_119  emb_120   emb_121   emb_122  \
0  0.407491  0.442823    0.0  ...  0.056331      0.0  0.198479  0.000000   
1  0.000000  0.771057    0.0  ...  0.000000      0.0  0.000000  0.000000   
2  0.080425  0.000000    0.0  ...  0.000000      0.0  0.000000  0.007437   
3  0.099769  0.000000    0.0  ...  0.000000      0.0  0.000000  0.000000   
4  0.020758  0.000000    0.0  ...  0.000000      0.0  0.000000  0.239860   

    emb_123   emb_124   emb_125  emb_126   emb_127  emb_128  
0  0.000000  0.000000  0.000000     

In [32]:
# Read back the saved dengue embedding CSV and print its first rows.
df = pd.read_csv('dengue_embeddings_128d.csv')
print(df.head())

                   dengue_id     emb_1     emb_2  emb_3  emb_4  emb_5  \
0  YP_001531165.2_capsid_DV3  0.065721  0.105939    0.0    0.0    0.0   
1  YP_001531165.2_capsid_DV3  0.065721  0.105939    0.0    0.0    0.0   
2  YP_001531165.2_capsid_DV3  0.065721  0.105939    0.0    0.0    0.0   
3  YP_001531165.2_capsid_DV3  0.065721  0.105939    0.0    0.0    0.0   
4  YP_001531165.2_capsid_DV3  0.065721  0.105939    0.0    0.0    0.0   

      emb_6  emb_7    emb_8  emb_9  ...   emb_119  emb_120  emb_121  emb_122  \
0  0.259708    0.0  0.12811    0.0  ...  0.045609      0.0      0.0      0.0   
1  0.259708    0.0  0.12811    0.0  ...  0.045609      0.0      0.0      0.0   
2  0.259708    0.0  0.12811    0.0  ...  0.045609      0.0      0.0      0.0   
3  0.259708    0.0  0.12811    0.0  ...  0.045609      0.0      0.0      0.0   
4  0.259708    0.0  0.12811    0.0  ...  0.045609      0.0      0.0      0.0   

   emb_123  emb_124   emb_125  emb_126   emb_127  emb_128  
0      0.0      0.0 

### Task 3
#### Predict the interaction label for each dengue–human pair in the Task 1 file

In [33]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Configuration
# Embedding CSVs with one ID column plus emb_* columns (read by load_embeddings).
HUMAN_EMBED_FILE = 'human_embeddings_128d.csv'
DENGUE_EMBED_FILE = 'dengue_embeddings_128d.csv'
# CSV of pairs; only its dengue_id / human_id columns are used in this cell.
PAIRS_FILE = 'task1_sequences.csv'
# Saved Keras model used to score each pair.
MODEL_PATH = 'fc_final_model.keras'
# Destination CSV for the per-pair predictions.
OUTPUT_FILE = 'final_predictions.csv'

def load_embeddings(file_path, id_col='human_id', expected_dim=128):
    """Load per-ID embedding vectors from a wide-format CSV.

    The CSV must contain ``id_col`` and exactly ``expected_dim`` columns whose
    names start with ``emb_``. Their order in the file defines the order of
    the vector components.

    The embedding columns are converted to float32 in a single vectorised
    step instead of one Python-level lookup per cell, and IDs are taken
    straight from the ID column so they keep that column's dtype.

    Args:
        file_path: Path of the CSV file.
        id_col: Name of the column holding the IDs.
        expected_dim: Required number of ``emb_*`` columns (default 128).

    Returns:
        dict mapping each ID to a float32 numpy array of shape
        ``(expected_dim,)``. If an ID appears on several rows, the last row
        wins, so the dict can be smaller than the CSV.

    Raises:
        ValueError: If the number of ``emb_*`` columns is not ``expected_dim``.
        KeyError: If ``id_col`` is not a column of the CSV.
    """
    df = pd.read_csv(file_path)

    # Extract embedding columns
    emb_cols = [col for col in df.columns if col.startswith('emb_')]

    # Validate embedding dimensions
    if len(emb_cols) != expected_dim:
        raise ValueError(f"Invalid embedding dimensions in {file_path}: "
                         f"Expected {expected_dim} columns, found {len(emb_cols)}")

    if id_col not in df.columns:
        raise KeyError(f"ID column {id_col!r} not found in {file_path}")

    # One (n_rows, expected_dim) float32 matrix; each dict value is one row of
    # it, so every vector has shape (expected_dim,) by construction.
    matrix = df[emb_cols].to_numpy(dtype=np.float32)
    embeddings = dict(zip(df[id_col], matrix))

    print(f"Loaded {len(embeddings)} embeddings from {file_path}")
    return embeddings

# Load pretrained model
print("Loading prediction model...")
model = tf.keras.models.load_model(MODEL_PATH)
# Second entry of the model's input shape; the prediction step below compares
# it with the length of each concatenated human+dengue vector.
expected_input_dim = model.input_shape[1]
print(f"Model expects input dimension: {expected_input_dim}")

# Load embeddings
# Each call returns a dict keyed by ID, so the printed count is the number of
# distinct IDs and can be lower than the number of CSV rows.
print("\nLoading human embeddings...")
human_embeddings = load_embeddings(HUMAN_EMBED_FILE, 'human_id')
print("Loading dengue embeddings...")
dengue_embeddings = load_embeddings(DENGUE_EMBED_FILE, 'dengue_id')

# Load interaction pairs
pairs_df = pd.read_csv(PAIRS_FILE)

# Process pairs and make predictions
predictions = []
missing_count = 0

print(f"\nProcessing {len(pairs_df)} pairs...")

# Step 1: build one feature vector per pair whose two embeddings are available.
pair_ids = []
pair_features = []
for dengue_id, human_id in zip(pairs_df['dengue_id'], pairs_df['human_id']):
    # Get embeddings
    human_emb = human_embeddings.get(human_id)
    dengue_emb = dengue_embeddings.get(dengue_id)

    if human_emb is None or dengue_emb is None:
        missing_count += 1
        continue

    # Combine embeddings: human first, then dengue.
    # NOTE(review): this order must match the one the classifier was trained
    # with; the training code is not visible here, so please confirm.
    combined = np.concatenate([human_emb, dengue_emb])
    if combined.shape != (expected_input_dim,):
        raise ValueError(f"Dimension mismatch: Expected {expected_input_dim}, "
                         f"got {combined.shape[0]}")

    pair_ids.append((dengue_id, human_id))
    pair_features.append(combined)

# Step 2: score all pairs in a single batched predict() call rather than one
# call per pair; predict() is designed for batch input and has a noticeable
# fixed cost per call.
if pair_features:
    pred_probs = model.predict(np.stack(pair_features), verbose=0)[:, 0]
    for (dengue_id, human_id), pred_prob in zip(pair_ids, pred_probs):
        predictions.append({
            'dengue_id': dengue_id,
            'human_id': human_id,
            # Label is 1 when the model output reaches 0.5, otherwise 0.
            'predicted_label': int(pred_prob >= 0.5),
            # Raw model output for the pair.
            'confidence': float(pred_prob)
        })

# Save results
if predictions:
    results_df = pd.DataFrame(predictions)
    results_df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved {len(predictions)} predictions to {OUTPUT_FILE}")
    print("Sample predictions:")
    print(results_df.head())
else:
    print("\nNo predictions made - check embedding files")

# Print summary
total_pairs = len(pairs_df)
# Guard against an empty pairs file (avoids ZeroDivisionError).
prediction_rate = len(predictions) / total_pairs * 100 if total_pairs else 0.0

print("\nProcessing Summary:")
print(f"Total pairs: {total_pairs}")
print(f"Successful predictions: {len(predictions)}")
print(f"Missing embeddings: {missing_count}")
print(f"Prediction rate: {prediction_rate:.1f}%")


Loading prediction model...
Model expects input dimension: 256

Loading human embeddings...
Loaded 1817 embeddings from human_embeddings_128d.csv
Loading dengue embeddings...
Loaded 53 embeddings from dengue_embeddings_128d.csv

Processing 1980 pairs...

Saved 1980 predictions to final_predictions.csv
Sample predictions:
                   dengue_id human_id  predicted_label    confidence
0  YP_001531165.2_capsid_DV3    HOXB7                0  0.000000e+00
1  YP_001531165.2_capsid_DV3   SPACA3                0  1.580929e-36
2  YP_001531165.2_capsid_DV3   RNF168                0  4.733572e-22
3  YP_001531165.2_capsid_DV3     SCO2                0  0.000000e+00
4  YP_001531165.2_capsid_DV3  HMGCLL1                0  4.948700e-38

Processing Summary:
Total pairs: 1980
Successful predictions: 1980
Missing embeddings: 0
Prediction rate: 100.0%
