In [14]:
import pandas as pd
import numpy as np

In [105]:
df = pd.read_csv('../train.csv')

In [106]:
amino_acids = "ACDEFGHIKLMNPQRSTVWXY"  # 20 standard amino acids

# Create a dictionary to map amino acids to indices
aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

# Function to convert sequence to one-hot encoding
def seq_to_one_hot(sequence):
    one_hot = np.zeros(len(amino_acids))
   
    if sequence in aa_dict:
        one_hot[aa_dict[sequence]] = 1
    return one_hot

In [107]:
df.head()

Unnamed: 0,mutant,DMS_score
0,M0Y,0.273
1,M0W,0.2857
2,M0V,0.2153
3,M0T,0.3122
4,M0S,0.218


In [108]:
df['one_hot_sequence'] = df['mutant'].apply(lambda seq: seq_to_one_hot(seq[-1]).tolist())

In [109]:
df['pos'] = df['mutant'].apply(lambda seq: int(seq[1:-1]))

In [110]:
df

Unnamed: 0,mutant,DMS_score,one_hot_sequence,pos
0,M0Y,0.2730,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,M0W,0.2857,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,M0V,0.2153,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
3,M0T,0.3122,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
4,M0S,0.2180,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
...,...,...,...,...
1135,P347D,0.3876,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",347
1136,P347C,0.1837,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",347
1137,P347A,0.4611,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",347
1138,P347M,0.2412,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",347


In [143]:
from sklearn.model_selection import train_test_split
# Sample data (replace this with your actual data)
# Assuming one_hot_sequences is a list of lists (one-hot encoded sequences)
one_hot_sequences = df['one_hot_sequence']
positions = df['pos']
scores = df['DMS_score']

# Create a DataFrame from the one-hot sequences
df_one_hot = pd.DataFrame(df['one_hot_sequence'].tolist(), index=df.index)

# Add the position column
df_one_hot['position'] = positions

# Define features and target variable
X = df_one_hot  # Features: one-hot sequences + position
y = pd.Series(scores)  # Target: 'score'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

# Prepare data as before
# ...

# Custom Spearman correlation objective function
def spearman_objective(preds, train_data):
    labels = train_data.get_label()
    
    # Calculate gradient and hessian for a ranking-focused objective
    # This is a simplified approximation
    grad = np.zeros_like(preds)
    hess = np.ones_like(preds)
    
    # Return gradient and hessian
    return grad, hess

# Define evaluation function for validation
def spearman_eval(preds, train_data):
    labels = train_data.get_label()
    corr, _ = spearmanr(labels, preds)
    # Return name, value, is_higher_better
    return 'spearman', corr, True

# Set parameters for LightGBM
params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    "objective": spearman_objective,
    "early_stopping_rounds":10,

}

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model with custom objective - note the proper parameter name
model = lgb.train(
    params, 
    train_data, 
    num_boost_round=100,
    valid_sets=[test_data],
    feval=spearman_eval,
)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate with Spearman correlation
spearman_corr, _ = spearmanr(y_test, y_pred)
print(f'Spearman Correlation: {spearman_corr}')

# Also check MSE for comparison
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

In [151]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr


# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

# Train the model
model = lgb.train(params, train_data, num_boost_round=100)

# Make predictions
y_pred = model.predict(X_test)

# You can evaluate the model here (e.g., using RMSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate Spearman's rank correlation coefficient
spearman_corr, _ = spearmanr(y_test, y_pred)
print(f'Spearman Correlation: {spearman_corr}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 912, number of used features: 21
[LightGBM] [Info] Start training from score 0.220504
Mean Squared Error: 0.042470157596825826
Spearman Correlation: 0.4115466061476755


In [153]:
df_test = pd.read_csv('../test.csv')

In [155]:
df_test['one_hot_sequence'] = df_test['mutant'].apply(lambda seq: seq_to_one_hot(seq[-1]).tolist())
df_test['pos'] = df['mutant'].apply(lambda seq: int(seq[1:-1]))

In [157]:
one_hot_sequences = df_test['one_hot_sequence']
positions = df_test['pos']

# Create a DataFrame from the one-hot sequences
df_test_one_hot = pd.DataFrame(df_test['one_hot_sequence'].tolist(), index=df_test.index)

# Add the position column
df_test_one_hot['position'] = positions

# Define features and target variable
X = df_test_one_hot  # Features: one-hot sequences + position


y_pred = model.predict(X)


0.4560411065183519

In [116]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

# Define the MLP model
class SequencePositionMLP(nn.Module):
    def __init__(self, input_size, hidden_sizes=[128, 64], dropout_rate=0.2):
        super(SequencePositionMLP, self).__init__()
        
        # Create a list of layers
        layers = []
        
        # Input layer
        layers.append(nn.Linear(input_size, hidden_sizes[0]))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout_rate))
        
        # Hidden layers
        for i in range(len(hidden_sizes)-1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
        
        # Output layer (regression)
        layers.append(nn.Linear(hidden_sizes[-1], 1))
        
        # Sequential model with all layers
        self.mlp = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.mlp(x).squeeze()

# Data preparation
# Load and prepare the data as in the previous example
df = pd.read_csv('../train.csv')

# Extract amino acid and position from the mutant code
amino_acids = "ACDEFGHIKLMNPQRSTVWXY"  # 20 standard amino acids
aa_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

# Function to convert sequence to one-hot encoding
def seq_to_one_hot(sequence):
    one_hot = np.zeros(len(amino_acids))
    if sequence in aa_dict:
        one_hot[aa_dict[sequence]] = 1
    return one_hot

# Convert mutant's last character (the mutated amino acid) to one-hot encoding
df['one_hot_sequence'] = df['mutant'].apply(lambda seq: seq_to_one_hot(seq[-1]).tolist())
df['pos'] = df['mutant'].apply(lambda seq: int(seq[1:-1]))

# Create a DataFrame from the one-hot sequences
df_one_hot = pd.DataFrame(df['one_hot_sequence'].tolist(), index=df.index)

# Add the position column
df_one_hot['position'] = df['pos']

# Define features and target variable
X = df_one_hot.values.astype(np.float32)  # Features: one-hot sequences + position
y = df['DMS_score'].values.astype(np.float32)  # Target: 'score'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define model
input_size = X_train.shape[1]  # Size of feature vector (one-hot encoding + position)
model = SequencePositionMLP(input_size, hidden_sizes=[128, 64, 128,32])

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Training MLP model on {device}...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    y_test_tensor = y_test_tensor.to(device)
    
    # Make predictions
    y_pred_tensor = model(X_test_tensor)
    
    # Move to CPU for sklearn metrics
    y_pred = y_pred_tensor.cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()
    
    # Calculate Mean Squared Error
    mse = mean_squared_error(y_test_np, y_pred)
    print(f'Mean Squared Error: {mse}')
    
    # Calculate Spearman's rank correlation coefficient
    spearman_corr, _ = spearmanr(y_test_np, y_pred)
    print(f'Spearman Correlation: {spearman_corr}')



Training MLP model on cpu...
Epoch [10/100], Loss: 0.0438
Epoch [20/100], Loss: 0.0415
Epoch [30/100], Loss: 0.0416
Epoch [40/100], Loss: 0.0414
Epoch [50/100], Loss: 0.0408
Epoch [60/100], Loss: 0.0414
Epoch [70/100], Loss: 0.0418
Epoch [80/100], Loss: 0.0412
Epoch [90/100], Loss: 0.0409
Epoch [100/100], Loss: 0.0414
Mean Squared Error: 0.049658264964818954
Spearman Correlation: 0.05434081824184723


In [120]:
X_train[100]

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1., 302.],
      dtype=float32)

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
from sklearn.utils import resample
from tqdm import tqdm

# Assuming one_hot_sequences is a list of lists (one-hot encoded sequences)
one_hot_sequences = df['one_hot_sequence']
positions = df['pos']
scores = df['DMS_score']

# Create a DataFrame from the one-hot sequences
df_one_hot = pd.DataFrame(df['one_hot_sequence'].tolist(), index=df.index)

# Add the position column
df_one_hot['position'] = positions

# Define features and target variable
X = df_one_hot  # Features: one-hot sequences + position
y = pd.Series(scores)  # Target: 'score'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

# Number of bootstrapped models
num_bootstraps = 50
lgb_boot_models = []
lgb_predictions = []

print("Training bootstrapped LightGBM models...")
# For each bootstrap iteration
for i in range(num_bootstraps):
    # Bootstrap resampling - sample with replacement
    X_resampled, y_resampled = resample(X_train, y_train, replace=True, n_samples=len(X_train), random_state=i)
    
    # Create a LightGBM dataset with the bootstrapped data
    train_data = lgb.Dataset(X_resampled, label=y_resampled)
    
    # Train a LightGBM model
    model = lgb.train(params, train_data, num_boost_round=100)
    
    # Store the trained model
    lgb_boot_models.append(model)
    
    # Make predictions on test set
    y_pred = model.predict(X_test)
    lgb_predictions.append(y_pred)
    
    # Print progress
    if (i+1) % 10 == 0:
        print(f"Completed {i+1}/{num_bootstraps} models")

# Convert predictions to numpy array
lgb_predictions = np.array(lgb_predictions)

# Average predictions from all models (simple ensemble)
y_pred_ensemble = np.mean(lgb_predictions, axis=0)

# Calculate Mean Squared Error
mse = mean_absolute_error(y_test, y_pred_ensemble)
print(f'Mean Squared Error: {mse}')

# Calculate Spearman's rank correlation coefficient
spearman_corr, _ = spearmanr(y_test, y_pred_ensemble)
print(f'Spearman Correlation: {spearman_corr}')

# Calculate confidence intervals for predictions
y_pred_std = np.std(lgb_predictions, axis=0)
y_pred_lower = y_pred_ensemble - 1.96 * y_pred_std
y_pred_upper = y_pred_ensemble + 1.96 * y_pred_std

print("\nConfidence intervals statistics:")
print(f"Average prediction std: {np.mean(y_pred_std)}")
print(f"Min prediction std: {np.min(y_pred_std)}")
print(f"Max prediction std: {np.max(y_pred_std)}")

Training bootstrapped LightGBM models...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 912, number of used features: 21
[LightGBM] [Info] Start training from score 0.220641
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 912, number of used features: 21
[LightGBM] [Info] Start training from score 0.224235
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 912, number of used features

In [98]:
lgb_predictions[:,0]

array([0.16356444, 0.14476905, 0.11281036, 0.09485927, 0.09168587,
       0.20377155, 0.12590746, 0.19075705, 0.16040629, 0.18118763,
       0.15667145, 0.12943331, 0.22832457, 0.09808456, 0.10546505,
       0.16508613, 0.10623332, 0.16546032, 0.13178738, 0.1549277 ,
       0.18829339, 0.21908317, 0.10839295, 0.11321101, 0.12214463,
       0.22398831, 0.10451026, 0.16439929, 0.22207656, 0.12021556,
       0.16913478, 0.15624356, 0.14467362, 0.13311758, 0.15305299,
       0.14922218, 0.14000777, 0.09545491, 0.13571391, 0.19115838,
       0.23031212, 0.19122742, 0.12590653, 0.21584535, 0.1493009 ,
       0.11636909, 0.17904823, 0.13094013, 0.2050524 , 0.11800595])

dtype('O')

In [31]:
test = pd.read_parquet('../protein_embeddings.parquet')

In [32]:
from torch.nn import CosineSimilarity
import torch

In [46]:
best = 1
for i in range(1000):
    for j in range(i+1, 1000):
        best = min(best, cos(torch.tensor(test.loc[i]['Embedding']), torch.tensor(test.loc[j]['Embedding'])))

print(best)

tensor(0.9994)
