## Protein Embeddings Manipulation

In [2]:
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm 
import pandas as pd

from protera_stability.data import ProteinStabilityDataset


data_path = Path("../data") 

df = pd.read_csv(data_path / "fireprotdb_results.csv", low_memory=False)
df = df.drop_duplicates()
total_prots = len(df.sequence.unique())

df = df[df.tm.isin(df.tm.dropna())]
tm_prots = len(df.sequence.unique())

mutations_per_prot = df.groupby(by="sequence").mutation.unique()

sequences = [] # set()

#TODO: TEST IF MUTATION IS CORRECT
nans = 0
for row in tqdm(df.itertuples(), total = len(df)):
    mutation = row.mutation
    position = row.position
    original_seq = row.sequence

    new_sequence = original_seq[:position - 1] + mutation + original_seq[position:]
    if mutation == original_seq[position - 1] or str(new_sequence) == str(original_seq) or (new_sequence in sequences):
        # print(row.Index)
        new_sequence = None
        nans += 1
    sequences.append(new_sequence)

df["mutated_sequences"] = sequences

100%|██████████| 7481/7481 [00:00<00:00, 34629.28it/s]


In [3]:
most_mutateds = df.groupby("sequence").mutated_sequences.count().sort_values(ascending=False)
most_mutated_seq = most_mutateds.index[2]

most_mutated_prot = df[df.sequence == most_mutated_seq]
most_mutated_prot = most_mutated_prot[most_mutated_prot.mutated_sequences.isin(most_mutated_prot.mutated_sequences.dropna())]

print(most_mutated_prot.protein_name.unique())

['Aspartate aminotransferase family protein']


## Predicting Tm

In [7]:
from protera_stability.data import ProteinStabilityDataset
from protera_stability.utils.decomposition import dim_reduction

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
from sklearn.base import clone

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from joblib import load

train_set = ProteinStabilityDataset(proteins_path=data_path / "tm_fireprot.h5")


results = {}
for mutated_seq in most_mutateds.index[:5]:
    most_mutated_prot = df[df.sequence == mutated_seq]
    most_mutated_prot = most_mutated_prot[most_mutated_prot.mutated_sequences.isin(most_mutated_prot.mutated_sequences.dropna())]

    all_seqs = list(map(lambda x : x.decode("utf8"), train_set.sequences))
    other_seqs = set(all_seqs).difference(set(most_mutated_prot.mutated_sequences.values))
    idxs_most_mutated = [all_seqs.index(mutation) for mutation in most_mutated_prot.mutated_sequences.values]
    idxs_other_mutations = [all_seqs.index(mutation) for mutation in other_seqs]

    most_mutated_X = train_set.X[idxs_most_mutated]
    most_mutated_y = train_set.y[idxs_most_mutated]

    other_mutations_X = train_set.X[idxs_other_mutations]
    other_mutations_y = train_set.y[idxs_other_mutations]
    
    most_mutated_X = dim_reduction(most_mutated_X, y=most_mutated_y, plot_viz=False, n_components=32)
    other_mutations_X = dim_reduction(other_mutations_X, y=other_mutations_y, plot_viz=False, n_components=32)

    mlp = MLPRegressor(**{'activation': 'tanh', 'hidden_layer_sizes': (100, 100), 'solver': 'adam', "max_iter": 1000})
    rf = RandomForestRegressor(**{'criterion': 'mse', 'max_depth': None, 'n_estimators': 100, "n_jobs": 10})
    svr = SVR(**{'C': 10.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'})
    linear_model = RidgeCV()

    ereg = VotingRegressor(
    estimators=
        [
            ('rf', clone(rf)), 
            ('svr', clone(svr)), 
            ('mlp', clone(mlp)),
            ('ridge', clone(linear_model))
        ],
    weights=[0.5, 0.2, 0.2, 0.1]
    )

    # fit with other_mutations
    ereg.fit(other_mutations_X, other_mutations_y)
    print("fitted ensemble...")

    mlp.fit(other_mutations_X, other_mutations_y)
    print("fitted mlp...")

    rf.fit(other_mutations_X, other_mutations_y)
    print("fitted rf...")

    svr.fit(other_mutations_X, other_mutations_y)
    print("fitted svr...")

    linear_model.fit(other_mutations_X, other_mutations_y)
    print("fitted linear_model...")

    # predict most_mutated
    ereg_preds = ereg.predict(most_mutated_X)
    mlp_preds = mlp.predict(most_mutated_X)
    rf_preds = rf.predict(most_mutated_X)
    svr_preds = svr.predict(most_mutated_X)
    linear_preds = linear_model.predict(most_mutated_X)
    

    protein_name = most_mutated_prot.protein_name.unique()
    protein_results = {
        "ereg" : r2_score(most_mutated_y, ereg_preds),
        "mlp"  : r2_score(most_mutated_y, mlp_preds),
        "rf"   : r2_score(most_mutated_y, rf_preds),
        "svr"  : r2_score(most_mutated_y, svr_preds),
        "linear_reg" : r2_score(most_mutated_y, linear_preds),
    }
    
    # tm_preds = ereg.partial_fit(most_mutated_X)

    # reg1 = RandomForestRegressor(**{'criterion': 'mse', 'max_depth': None, 'n_estimators': 200})
    # reg2 = SVR(**{'C': 10.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'})
    # reg3 = MLPRegressor(**{'activation': 'tanh', 'hidden_layer_sizes': (100, 100), 'solver': 'adam'})
    # reg4 = Ridge(alpha=1e2)

    # ereg = VotingRegressor(
    #     estimators=
    #         [
    #             ('rf', reg1), 
    #             ('svr', reg2), 
    #             ('mlp', reg3),
    #             ('ridge', reg4)
    #         ],
    #     weights=[0.5, 0.2, 0.2, 0.1]
    # )
    # ereg.fit(X_train, y_train)
    # tm_preds = ereg.predict(X_valid)

    # r2_preds = r2_score(y_valid, tm_preds)

    print(most_mutated_prot.protein_name.unique(), protein_results)
    results[most_mutated_prot.protein_name.unique()[0]] = protein_results

fitted ensemble...
fitted mlp...
fitted rf...
fitted svr...
fitted linear_model...
['Thermonuclease'] {'ereg': -1.3589037869271041, 'mlp': -2.8954370702535863, 'rf': -2.058781857183315, 'svr': -0.6709185206701058, 'linear_reg': -1.869168318806766}
fitted ensemble...
fitted mlp...
fitted rf...
fitted svr...
fitted linear_model...
['Endolysin'] {'ereg': -0.3375598084647651, 'mlp': -0.474430518054777, 'rf': -0.2973100574408185, 'svr': -1.1713593904721575, 'linear_reg': -0.2774411630857676}
fitted ensemble...
fitted mlp...
fitted rf...
fitted svr...
fitted linear_model...
['Aspartate aminotransferase family protein'] {'ereg': -119.97413757881671, 'mlp': -53.609283367719854, 'rf': -78.469987182137, 'svr': -213.67096301790806, 'linear_reg': -113.26537251569945}
fitted ensemble...
fitted mlp...
fitted rf...
fitted svr...
fitted linear_model...
['ADHA'] {'ereg': -0.5331784780405724, 'mlp': -1.9060271922431862, 'rf': -0.7336757166566863, 'svr': -2.1890743738724634, 'linear_reg': -0.092879524493

## Predicting dTm


$$
dTm = Tm_{original} - Tm_{mutation}
$$

### Compute $Tm_{original}$

In [185]:
import numpy as np
idx = 1

inverse_delta_op = lambda value, delta : value + np.sign(delta) * delta if np.sign(delta) == -1. else value - delta
unscaled_thermonuclease_tm = inverse_delta_op(most_mutated_df.iloc[idx]["tm"], most_mutated_df.iloc[idx]["dTm"])
unscaled_thermonuclease_tm

51.4

### Predict $Tm_{mutation}$

In [201]:
reg1 = load("./models/best_MLP.joblib")
reg1.estimator.partial_fit(X_tm, y_tm)

tm_preds = reg1.predict(X_tm)

r2_score(y_tm, tm_preds)

-4898.574947854435

In [188]:
from joblib import load
ereg = load("./models/ensemble_tm.joblib")
tm_preds = ereg.partial_fit(X_tm)

r2_score(y_tm, tm_preds)

AttributeError: 'VotingRegressor' object has no attribute 'partial_fit'

In [142]:
train_set = ProteinStabilityDataset(proteins_path=data_path / "tm_fireprot.h5")
unscaled_preds_tm = (tm_preds * train_set.y_scaler.var_) + train_set.y_scaler.mean_

unscaled_dtm = np.round(unscaled_thermonuclease_tm - unscaled_preds_tm, 2)

In [152]:
train_set.y.shape

(4338,)

In [153]:
scaled_dtm = train_set.y_scaler.transform(unscaled_dtm.reshape(-1, 1)).reshape(unscaled_dtm.shape)
r2_score(y_train, scaled_dtm)

-388.44524069765606