In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Load the data
a_domains_df = pd.read_csv('data/a_domains.tsv', sep='\t')
amino_acid_codes_df = pd.read_csv('data/amino_acids.tsv', sep='\t')

# Step 2: Preprocess the data
# Merge the amino acid codes with the A-domain data
a_domains_df = pd.merge(a_domains_df, amino_acid_codes_df, left_on='amino_acid', right_on='three_letter_code', how='left')
a_domains_df = a_domains_df.rename(columns={'three_letter_code': 'recruited_amino_acid'})

# Step 3: Split the data
X = a_domains_df['sequence']
y = a_domains_df['recruited_amino_acid']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Choose a model
model = RandomForestClassifier()

# Step 5: Train the model
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_train_vec = vectorizer.fit_transform(X_train)
# Encode the target variable
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# Train the model
model.fit(X_train_vec, y_train_enc)

# Step 6: Evaluate the model
# Vectorize the test sequences
X_test_vec = vectorizer.transform(X_test)
# Encode the test labels
y_test_enc = label_encoder.transform(y_test)
# Make predictions
y_pred = model.predict(X_test_vec)
# Decode the predicted labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_decoded)
print("Accuracy:", accuracy)

# Calculate F1-score and recall
report = classification_report(y_test, y_pred_decoded)
print("Classification Report:\n", report)

# Step 7: Checking the predictions
test_sequences = ['LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY']
test_sequences_vec = vectorizer.transform(test_sequences)
test_predictions_enc = model.predict(test_sequences_vec)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions_enc)
print("Test Sequences Predictions:", test_predictions_decoded)

Accuracy: 0.6597222222222222
Classification Report:
               precision    recall  f1-score   support

         aad       1.00      0.40      0.57         5
         ala       0.71      0.91      0.80        22
         arg       1.00      0.67      0.80         3
         asn       0.67      0.67      0.67         3
         asp       0.50      0.33      0.40         3
         bht       1.00      1.00      1.00         1
         cys       0.50      0.50      0.50         2
         dhb       0.93      1.00      0.97        14
         dpg       0.67      1.00      0.80         2
         gln       0.75      0.75      0.75         4
         glu       0.00      0.00      0.00         4
         gly       0.60      1.00      0.75         3
         his       1.00      0.50      0.67         2
         hpg       0.75      1.00      0.86         3
         hrn       0.00      0.00      0.00         2
         ile       0.33      0.50      0.40         2
         leu       0.60     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
from sklearn.model_selection import cross_val_score

# Step 3: Prepare the data
X = a_domains_df['sequence']
y = a_domains_df['recruited_amino_acid']
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_vec = vectorizer.fit_transform(X)
# Encode the target variable
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

# Step 4: Choose a model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 5: Perform k-fold cross-validation
cv_scores = cross_val_score(model, X_vec, y_enc, cv=5)

# Step 6: Print the average accuracy and individual fold scores
print("Cross-Validation Accuracy: {:.2f}%".format(100 * cv_scores.mean()))
print("Individual Fold Scores:")
for i, score in enumerate(cv_scores):
    print("Fold {}: {:.2f}%".format(i+1, 100 * score))



Cross-Validation Accuracy: 58.80%
Individual Fold Scores:
Fold 1: 61.11%
Fold 2: 65.03%
Fold 3: 59.44%
Fold 4: 65.03%
Fold 5: 43.36%


In [3]:
from sklearn.svm import SVC

# Step 4: Choose a model
model = SVC()

# Step 5: Train the model
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_train_vec = vectorizer.fit_transform(X_train)
# Encode the target variable
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# Train the model
model.fit(X_train_vec, y_train_enc)

# Step 6: Evaluate the model
# Vectorize the test sequences
X_test_vec = vectorizer.transform(X_test)
# Encode the test labels
y_test_enc = label_encoder.transform(y_test)
# Make predictions
y_pred = model.predict(X_test_vec)
# Decode the predicted labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_decoded)
print("Accuracy:", accuracy)

# Calculate F1-score and recall
report = classification_report(y_test, y_pred_decoded)
print("Classification Report:\n", report)

# Step 7: Checking the predictions
test_sequences = ['LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY']
test_sequences_vec = vectorizer.transform(test_sequences)
test_predictions_enc = model.predict(test_sequences_vec)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions_enc)
print("Test Sequences Predictions:", test_predictions_decoded)

Accuracy: 0.5555555555555556
Classification Report:
               precision    recall  f1-score   support

         aad       1.00      0.20      0.33         5
         ala       0.71      0.91      0.80        22
         arg       0.00      0.00      0.00         3
         asn       0.50      0.33      0.40         3
         asp       0.00      0.00      0.00         3
         bht       0.00      0.00      0.00         1
         cys       0.50      0.50      0.50         2
         dhb       0.93      1.00      0.97        14
         dpg       0.00      0.00      0.00         2
         gln       0.00      0.00      0.00         4
         glu       0.00      0.00      0.00         4
         gly       0.75      1.00      0.86         3
         his       0.00      0.00      0.00         2
         hpg       1.00      0.67      0.80         3
         hrn       0.00      0.00      0.00         2
         ile       0.00      0.00      0.00         2
         leu       1.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
from sklearn.neural_network import MLPClassifier

# Step 4: Choose a model
model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000)

# Step 5: Train the model
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_train_vec = vectorizer.fit_transform(X_train)
# Encode the target variable
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# Train the model
model.fit(X_train_vec, y_train_enc)

# Step 6: Evaluate the model
# Vectorize the test sequences
X_test_vec = vectorizer.transform(X_test)
# Encode the test labels
y_test_enc = label_encoder.transform(y_test)
# Make predictions
y_pred = model.predict(X_test_vec)
# Decode the predicted labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_decoded)
print("Accuracy:", accuracy)

# Calculate F1-score and recall
report = classification_report(y_test, y_pred_decoded)
print("Classification Report:\n", report)

# Step 7: Checking the predictions
test_sequences = ['LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY']
test_sequences_vec = vectorizer.transform(test_sequences)
test_predictions_enc = model.predict(test_sequences_vec)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions_enc)
print("Test Sequences Predictions:", test_predictions_decoded)

Accuracy: 0.6180555555555556
Classification Report:
               precision    recall  f1-score   support

         aad       1.00      0.40      0.57         5
         ala       0.66      0.86      0.75        22
         arg       1.00      0.67      0.80         3
         asn       1.00      0.33      0.50         3
         asp       0.20      0.33      0.25         3
         bht       1.00      1.00      1.00         1
         cys       0.50      0.50      0.50         2
         dhb       1.00      1.00      1.00        14
         dpg       1.00      1.00      1.00         2
         gln       0.50      0.50      0.50         4
         glu       0.40      0.50      0.44         4
         gly       0.50      1.00      0.67         3
         his       0.33      0.50      0.40         2
         hpg       1.00      0.67      0.80         3
         hrn       0.00      0.00      0.00         2
         ile       0.50      0.50      0.50         2
         leu       0.60     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
import xgboost as xgb

# Step 4: Choose a model
model = xgb.XGBClassifier()

# Step 5: Train the model
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_train_vec = vectorizer.fit_transform(X_train)
# Encode the target variable
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
# Train the model
model.fit(X_train_vec, y_train_enc)

# Step 6: Evaluate the model
# Vectorize the test sequences
X_test_vec = vectorizer.transform(X_test)
# Encode the test labels
y_test_enc = label_encoder.transform(y_test)
# Make predictions
y_pred = model.predict(X_test_vec)
# Decode the predicted labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_decoded)
print("Accuracy:", accuracy)

# Calculate F1-score and recall
report = classification_report(y_test, y_pred_decoded)
print("Classification Report:\n", report)

# Step 7: Checking the predictions
test_sequences = ['LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY']
test_sequences_vec = vectorizer.transform(test_sequences)
test_predictions_enc = model.predict(test_sequences_vec)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions_enc)
print("Test Sequences Predictions:", test_predictions_decoded)

Accuracy: 0.6319444444444444
Classification Report:
               precision    recall  f1-score   support

         aad       0.50      0.40      0.44         5
         ala       0.70      0.86      0.78        22
         arg       1.00      0.67      0.80         3
         asn       0.50      0.33      0.40         3
         asp       0.33      0.33      0.33         3
         bht       1.00      1.00      1.00         1
         cys       0.33      0.50      0.40         2
         dab       0.00      0.00      0.00         0
         dhb       0.93      1.00      0.97        14
         dpg       1.00      1.00      1.00         2
         gln       1.00      0.75      0.86         4
         glu       0.00      0.00      0.00         4
         gly       1.00      1.00      1.00         3
         his       0.50      0.50      0.50         2
         hpg       0.75      1.00      0.86         3
         hrn       0.00      0.00      0.00         2
         ile       0.33     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Step 5: Define the model architecture
class AminoAcidClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(AminoAcidClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        embedded_mean = embedded.mean(dim=1)
        output = self.fc(embedded_mean)
        return output

input_dim = 21  # Number of unique amino acids including the gap
hidden_dim = 100
output_dim = len(amino_acid_codes_df)  # Number of output classes

model = AminoAcidClassifier(input_dim, hidden_dim, output_dim)

# Step 6: Train the model
# Vectorize the input sequences
vectorizer = CountVectorizer(analyzer='char')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Encode the target variable
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.long)
y_train_tensor = torch.tensor(y_train_enc, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test_enc, dtype=torch.long)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        optimizer.zero_grad()
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

# Step 7: Evaluate the model
model.eval()
with torch.no_grad():
    output = model(X_test_tensor)
    _, predicted = torch.max(output, dim=1)
    y_pred_decoded = label_encoder.inverse_transform(predicted.numpy())
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
    print("Accuracy:", accuracy)
    
    # Calculate F1-score and recall
    report = classification_report(y_test, y_pred_decoded)
    print("Classification Report:\n", report)

# Step 7: Checking predictions
test_sequences = ['LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY']
test_sequences_vec = vectorizer.transform(test_sequences)
test_sequences_tensor = torch.tensor(test_sequences_vec.toarray(), dtype=torch.long)

model.eval()
with torch.no_grad():
    output = model(test_sequences_tensor)
    _, predicted = torch.max(output, dim=1)
    test_predictions_decoded = label_encoder.inverse_transform(predicted.numpy())
    print("Test Sequences Predictions:", test_predictions_decoded)

Accuracy: 0.2222222222222222
Classification Report:
               precision    recall  f1-score   support

         aad       0.00      0.00      0.00         5
         ala       0.31      0.73      0.43        22
         arg       0.00      0.00      0.00         3
         asn       0.00      0.00      0.00         3
         asp       0.00      0.00      0.00         3
         bht       0.00      0.00      0.00         1
         cys       0.00      0.00      0.00         2
         dhb       0.26      0.50      0.34        14
         dpg       0.00      0.00      0.00         2
         gln       0.00      0.00      0.00         4
         glu       0.00      0.00      0.00         4
         gly       0.00      0.00      0.00         3
         his       0.00      0.00      0.00         2
         hpg       0.00      0.00      0.00         3
         hrn       0.00      0.00      0.00         2
         ile       0.00      0.00      0.00         2
         leu       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
