# Sausage Scripts

Generate a per-position RMSD value from the distances in the nchoose2 calculation output.

These RMSD values can be loaded as B-factors in PyMOL to display a sausage representation and color it, using the `Load new B-factors` script (https://figshare.com/articles/Pymol_script_loadBfacts_py/1176991 and https://pymolwiki.org/index.php/Load_new_B-factors). To run the script, download it from figshare and execute the following commands in PyMOL:

```
run SCRIPT_FILENAME
load PDB_FILENAME
loadBfacts mol, startaa, source

# for example
run /loadBfacts.py
load /1g68.pdb
loadBfacts 1g68, 1, RMSD_FILENAME.txt
```

Requires `structural_variability_data.zip` from https://evcouplings.org/3Dseq




In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each
# cell is executed, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Emit inline figures as PNG images.
%config InlineBackend.figure_format='png'

In [0]:
import os, sys, copy, time
import matplotlib.colors as colors
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import pandas as pd
import scipy

# Global Variables and Functions

In [0]:
# Base directory for the nchoose2 input CSVs; the output files written later in
# this notebook are placed under the same root.
PROJECT_ROOT = '.'

# Per-protein settings keyed by protein name. NCHOOSE2_FILENAME is the path of
# the CSV that getRmsds reads for that protein.
ATTRIBUTES = dict(
    pse1=dict(
        NCHOOSE2_FILENAME=PROJECT_ROOT + '/pse1_nchoose2_Sept2019.csv',
    ),
    aac6=dict(
        NCHOOSE2_FILENAME=PROJECT_ROOT + '/aac6_nchoose2_Sept2019.csv',
    ),
)

In [0]:
def getRmsds(nchoose2_filename):
    '''
    Returns a dataframe containing residue positions and their rmsd.

    nchoose2_filename: path (or file-like object) of a CSV readable by
        pd.read_csv. Every column whose name contains 'dist_matrix_' is
        treated as the distances for one position; the position number is
        the remainder of the column name parsed as an int.

    The returned dataframe has one row per matching column, in column order,
    with columns:
        position: int parsed from the column name.
        rmsd: sqrt(mean(distance ** 2)) over the non-missing distances in
            that column, or NaN when the column has no distances at all.

    Raises ValueError if the remainder of a matching column name is not an
    integer.
    '''
    prefix = 'dist_matrix_'
    df = pd.read_csv(nchoose2_filename)

    positions = []
    rmsds = []
    for col in df.columns:
        if prefix not in col:
            continue
        positions.append(int(col.replace(prefix, '').replace(' ', '')))

        # Drop missing values before counting: pandas' sum skips NaN, so
        # dividing by the full column length would underestimate the RMSD
        # whenever some distances are missing.
        distances = df[col].dropna()
        if len(distances) == 0:
            # No data for this position: the RMSD is undefined (and dividing
            # by a zero count would raise ZeroDivisionError).
            rmsds.append(np.nan)
            continue
        mean_square = (1.0 / len(distances)) * np.sum(np.power(distances, 2.0))
        rmsds.append(np.sqrt(mean_square))

    return pd.DataFrame({'position': positions, 'rmsd': rmsds})


def plotRmsdPerPosition(rmsd_df, title=None, output=None):
    '''
    Draws a bar chart of RMSD versus position on a new 20x5 figure.

    Each bar is colored with the 'rainbow' colormap, scaled linearly between
    the smallest and the largest RMSD in rmsd_df.

    rmsd_df: dataframe with `position` and `rmsd` columns (the shape returned
        by getRmsds).
    title: optional axes title; not set when falsy.
    output: optional path passed to fig.savefig; nothing is saved when falsy.
    '''
    fig, ax = plt.subplots(1, 1, figsize=(20, 5))

    # plt.get_cmap replaces cm.get_cmap, which was deprecated in matplotlib
    # 3.7 and removed in 3.9.
    cmap = plt.get_cmap('rainbow')
    norm = colors.Normalize(
        vmin=np.min(rmsd_df.rmsd),
        vmax=np.max(rmsd_df.rmsd),
    )
    bar_colors = [cmap(norm(rmsd)) for rmsd in rmsd_df.rmsd]

    # width=1 makes bars of adjacent integer positions touch.
    ax.bar(rmsd_df.position,
           rmsd_df.rmsd,
           color=bar_colors,
           width=1)
    ax.set_xlabel('position')
    ax.set_ylabel('RMSD')
    if title:
        ax.set_title(title)
    if output:
        fig.savefig(output)

## Load data

In [0]:
# Per-position RMSDs for PSE1, computed from its nchoose2 CSV.
pse1df = getRmsds(nchoose2_filename=ATTRIBUTES['pse1']['NCHOOSE2_FILENAME'])

In [0]:
# Per-position RMSDs for AAC6, computed from its nchoose2 CSV.
aac6df = getRmsds(nchoose2_filename=ATTRIBUTES['aac6']['NCHOOSE2_FILENAME'])

## Output RMSD file

In [0]:
# Write the PSE1 RMSDs as plain text, one value per line, with no index and no
# header row. header=False is passed explicitly because Series.to_csv writes a
# header line by default in pandas >= 1.0 (older versions did not), which would
# put a non-numeric 'rmsd' line at the top of the file.
# NOTE(review): loadBfacts presumably expects only numeric lines - confirm.
pse1df['rmsd'].to_csv(PROJECT_ROOT+'/pse1_rmsds_20190925.txt', index=False, header=False)

  """Entry point for launching an IPython kernel.


In [0]:
# Write the AAC6 RMSDs as plain text, one value per line, with no index and no
# header row. header=False is passed explicitly because Series.to_csv writes a
# header line by default in pandas >= 1.0 (older versions did not), which would
# put a non-numeric 'rmsd' line at the top of the file.
# NOTE(review): loadBfacts presumably expects only numeric lines - confirm.
aac6df['rmsd'].to_csv(PROJECT_ROOT+'/aac6_rmsds_20190925.txt', index=False, header=False)

  """Entry point for launching an IPython kernel.


## Output plots

In [0]:
# Plot RMSD per position for PSE1 and save the figure as a PDF.
plotRmsdPerPosition(
    pse1df,
    title='PSE1',
    output=PROJECT_ROOT+'/pse1_positionvsrmsd_20190925.pdf',
)

In [0]:
# Plot RMSD per position for AAC6 and save the figure as a PDF.
plotRmsdPerPosition(
    aac6df,
    title='AAC6',
    output=PROJECT_ROOT+'/aac6_positionvsrmsd_20190925.pdf',
)