# TrRosetta 

## Method

TrRosetta is a deep residual convolutional network that takes an MSA as input and outputs the relative distance and orientation of each residue pair.
The output includes the probability for the distance range 2 to 20 Å, binned into 36 bins of 0.5 Å each (plus 1 bin for no contact), and other bins for the angles.

## Installation

The easiest way is to use Singularity with the TensorFlow 1 CPU container (this needs a lot of RAM).

```
singularity pull docker://tensorflow/tensorflow:1.15.5
```

Clone the repository and download the trained model.

```
git clone https://github.com/gjoni/trRosetta
cd trRosetta
wget https://files.ipd.uw.edu/pub/trRosetta/model2019_07.tar.bz2
tar xf model2019_07.tar.bz2
```


## How to run

Go to the root of the repository and run:

```
singularity run singularity_containers/tensorflow_1.15.5.sif
python ./network/predict.py -m ./model2019_07 <a3m MSA input> <npz file output>
```

## Extract distance matrices

Run the script `./tr_rosetta_get_distances.py` on the npz output from the network.

## Notes

In general, the trimmed MSAs seem to give better results than the full-length ones.

## Creation of a dataframe with distances for trRosetta predictions and PDB structures

In [1]:
import joblib
import numpy as np
import pandas as pd


def get_distance_long_df(distance_map):
    """Flatten the lower triangle of a distance matrix into a long dataframe.

    Every cell whose row index is greater than or equal to its column index
    (so the diagonal is included) becomes one output row, visited row by row.

    Parameters
    ----------
    distance_map
        Two-dimensional matrix that can be iterated row by row and exposes
        ``shape``.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` (the cell value), ``position_1`` (0-based row
        index) and ``position_2`` (0-based column index).

    Raises
    ------
    AssertionError
        If the number of collected cells differs from ``n * (n + 1) / 2``,
        where ``n`` is ``distance_map.shape[0]``.
    """
    lower_triangle = [
        (value, row_idx, col_idx)
        for row_idx, row in enumerate(distance_map)
        for col_idx, value in enumerate(row)
        if col_idx <= row_idx
    ]
    long_df = pd.DataFrame(
        {
            "distance": [value for value, _, _ in lower_triangle],
            "position_1": [row_idx for _, row_idx, _ in lower_triangle],
            "position_2": [col_idx for _, _, col_idx in lower_triangle],
        }
    )

    # off-diagonal half of the matrix plus the diagonal itself
    n_rows = distance_map.shape[0]
    expected_rows = (((n_rows ** 2) - n_rows) / 2) + n_rows
    assert len(long_df) == expected_rows
    return long_df


def get_column_value(df, colname, id_val, id_col="dms_id"):
    """Return the one value found in ``colname`` for the rows matching an id.

    Rows are selected where ``df[id_col] == id_val``; the values of
    ``colname`` on those rows must collapse to a single distinct value.

    Raises
    ------
    AssertionError
        If the selected rows hold no value or more than one distinct value.
    """
    row_mask = df[id_col] == id_val
    distinct_values = set(df.loc[row_mask, colname])
    assert len(distinct_values) == 1
    (only_value,) = distinct_values
    return only_value


# Build one long dataframe pairing, for every study and residue pair, the
# distance predicted by trRosetta with the distance measured on the structure.

# Columns holding the residue positions of each pair.
POSITION_COLUMNS = ["position_1", "position_2"]

# Amount subtracted from the structure positions, keyed by uniprot id, for the
# proteins whose structure covers a different copy of the mutated domain.
STRUCTURE_POSITION_OFFSETS = {
    # the gb1 structure is not available, but a structure is available for
    # the next IgG binding domain
    "P06654": 70,
    # the structure is about the third ubiquitin repeat, the mutated
    # positions refer to the first
    "P0CG63": 304,
}

dms_datasets_df = pd.read_csv("../../dataset/dms_datasets.csv")
tr_rosetta_frames = []
uniprot_frames = []
# unique() keeps the order of first appearance in the csv, so the row order of
# the result is the same on every run (iterating a set of strings is not).
for study in dms_datasets_df.dms_id.unique():
    trrosetta_basename = get_column_value(
        dms_datasets_df, "feature_basename_trrosetta", study
    )
    trrosetta_uniprot_first = get_column_value(
        dms_datasets_df, "mutated_domain_uniprot_first", study
    )
    tr_rosetta_distances = joblib.load(
        "../../processing/tr_rosetta/{}_trRosetta_distance_mat.joblib.xz".format(
            trrosetta_basename
        )
    )

    # the predicted distances
    curr_tr_rosetta_long_df = get_distance_long_df(tr_rosetta_distances)
    # since uniprot_first is 1-indexed, also positions are now 1-indexed
    curr_tr_rosetta_long_df[POSITION_COLUMNS] = (
        curr_tr_rosetta_long_df[POSITION_COLUMNS] + trrosetta_uniprot_first
    )
    curr_tr_rosetta_long_df["dms_id"] = study

    # the experimental distances
    uniprot_id = get_column_value(dms_datasets_df, "uniprot_id", study)
    pdb_id = get_column_value(dms_datasets_df, "pdb_id", study)
    chain_id = get_column_value(dms_datasets_df, "pdb_chain", study)
    uniprot_dist = joblib.load(
        "../../processing/structures/uniprot_cmap/{}_mapped_{}_{}.uniprot_distance_matrix.joblib.xz".format(
            uniprot_id, pdb_id, chain_id
        )
    )
    curr_uniprot_long_df = get_distance_long_df(uniprot_dist["distance_matrix"])
    curr_uniprot_long_df["dms_id"] = study
    # need to make also these 1-indexed, then shift them back onto the mutated
    # domain for the proteins listed in STRUCTURE_POSITION_OFFSETS
    position_shift = 1 - STRUCTURE_POSITION_OFFSETS.get(uniprot_id, 0)
    curr_uniprot_long_df[POSITION_COLUMNS] = (
        curr_uniprot_long_df[POSITION_COLUMNS] + position_shift
    )

    # collect the tr rosetta and experimental df independently
    tr_rosetta_frames.append(curr_tr_rosetta_long_df)
    uniprot_frames.append(curr_uniprot_long_df)

# concatenate once, after the loop, instead of re-copying the accumulated
# frame at every iteration; the distance columns are renamed to have 2
# distinct columns in the resulting df
tr_rosetta_long_df = pd.concat(tr_rosetta_frames).rename(
    columns={"distance": "tr_rosetta_distance"}
)
uniprot_long_df = pd.concat(uniprot_frames).rename(
    columns={"distance": "experimental_distance"}
)
# keys and join type spelled out: only the residue pairs present in both the
# prediction and the structure are kept
distance_df = tr_rosetta_long_df.merge(
    uniprot_long_df, on=POSITION_COLUMNS + ["dms_id"], how="inner"
)

## Export the distance dataframe

In [2]:
# Write the merged predicted/experimental distances to disk, leaving out the
# dataframe index.
distances_csv_path = (
    "../../processing/structures/experimental_and_predicted_distances.csv"
)
distance_df.to_csv(distances_csv_path, index=False)
# Bare expression: makes the notebook render the dataframe as the cell output.
distance_df

Unnamed: 0,tr_rosetta_distance,position_1,position_2,dms_id,experimental_distance
0,inf,1,1,beta-lactamase,
1,5.505627,2,1,beta-lactamase,
2,inf,2,2,beta-lactamase,
3,7.424289,3,1,beta-lactamase,
4,5.491487,3,2,beta-lactamase,
...,...,...,...,...,...
121836,12.133971,203,199,Pab1,13.797928
121837,9.938418,203,200,Pab1,9.642304
121838,7.518795,203,201,Pab1,9.068827
121839,5.858654,203,202,Pab1,5.317393
