---------------

## Determine MpN values from datasets

### Modules, lists, and functions

In [1]:
import os, sys
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from matplotlib.offsetbox import AnchoredText

path = os.getcwd()
datapath = path+'/mpn_datasets'

In [2]:
theta_lst = ['tilt','roll','twist','shift','slide','rise']

# --- Lists of dimers, tetramers, and the self-complementary steps ---
BASES     = ['A','C','G','T']
COMP      = {'A':'T', 'T':'A', 'C':'G', 'G':'C','.':'.'}

def dna_seq_complement(sequence_string):
    COMP = {'A':'T', 'T':'A', 'C':'G', 'G':'C','.':'.'}
    STEP = sequence_string[::-1]
    STEP = ''.join([COMP[STEP[i]] for i in range(len(STEP))])
    return STEP

DIMERS    = [b+c for b in BASES for c in BASES]
DIM_MATCH = {'TT':'AA', 'GT':'AC', 'CT':'AG', 'CC':'GG', 'TC':'GA', 'TG':'CA'}
DIMER_LST = ['CG','CA','TA'] + ['AG','GG','AA','GA'] + ['AT','AC','GC']
lstRR = ['AG','GG','AA','GA']
lstYR = ['CG','CA','TA']
lstRY = ['AT','AC','GC']
SCDIM     = []
for DIM in DIMERS:
    if COMP[DIM[1]]+COMP[DIM[0]]==DIM:
        SCDIM.append(DIM)

TET_LST      = [a+b+c+d for a in ['A','C','G','T','.'] for b in BASES for c in BASES for d in ['A','C','G','T','.']]
TETRAMERS    = [i for i in TET_LST if '.' not in i]
TETRAMER_LST = [
    'AAAA','AACA','AAGA','AATA','ACAA','ACGA','AGAA','AGCA','AGGA','ATAA',
    'AAAC','AACC','AAGC','AATC','ACAC','ACGC','AGAC','AGCC','AGGC','ATAC',
    'AAAG','AACG','AAGG','AATG','ACAG','ACGG','AGAG','AGCG','AGGG','ATAG',
    'AAAT','AACT','AAGT','AATT','ACAT','ACGT','AGAT','AGCT','AGGT','ATAT',
    'CAAA','CACA','CAGA','CATA','CCAA','CCGA','CGAA','CGCA','CGGA','CTAA',
    'CAAC','CACC','CAGC','CCAC','CGAC','CGGC','CAAG','CACG','CAGG','CATG',
    'CCAG','CCGG','CGAG','CGCG','CGGG','CTAG','CAAT','CACT','CAGT','CCAT',
    'CGAT','CGGT','GAAA','GACA','GAGA','GATA','GCAA','GCGA','GGAA','GGCA',
    'GGGA','GTAA','GAAC','GACC','GAGC','GATC','GCAC','GCGC','GGAC','GGCC',
    'GGGC','GTAC','GAAG','GACG','GAGG','GATG','GCAG','GCGG','GGAG','GGCG',
    'GGGG','GTAG','GAAT','GACT','GAGT','GCAT','GGAT','GGGT','TAAA','TACA',
    'TAGA','TATA','TCAA','TCGA','TGAA','TGCA','TGGA','TTAA','TAAC','TACC',
    'TAGC','TCAC','TGAC','TGGC','TAAG','TACG','TAGG','TCAG','TGAG','TGGG',
    'TAAT','TACT','TAGT','TCAT','TGAT','TGGT'
]
SCTET        = []
for TET in TETRAMERS:
    if COMP[TET[3]]+COMP[TET[2]]+COMP[TET[1]]+COMP[TET[0]]==TET and '.' not in TET:
        SCTET.append(TET)

In [3]:
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of *x* and *y*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data.

    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.

    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.

    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`

    Returns
    -------
    matplotlib.patches.Ellipse
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensionl dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Calculating the stdandard deviation of x from
    # the squareroot of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # calculating the stdandard deviation of y ...
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

---------------------

### MpN Dataset(s)

In [4]:
FFNAME       = "czapla2022"

main_dataset = pd.read_csv(FFNAME+"_3sig_tet", index_col=0)

pdb_dets = pd.read_csv("czapla2022_pdbid_data", index_col=0)

main_dataset['year']=''
main_dataset['resolution']=''

for i in range(len(main_dataset)):
    ID = main_dataset.at[i, 'pdb_id']
    main_dataset.at[i, 'resolution'] = pdb_dets.at[ID, 'resolution']
    main_dataset.at[i, 'year']       = pdb_dets.at[ID, 'deposit_year']
    del ID

del pdb_dets
#main_dataset

In [5]:
mpn_dataset = pd.DataFrame(index=DIMERS)

for STEP in DIMERS:
    mpn_dataset.at[STEP, 'count'] = len( main_dataset[(main_dataset.step_dimer==STEP)
                                                     #&(main_dataset.coding=='y')
                                                     ])
    for THETA1 in theta_lst:
        mpn_dataset.at[STEP, THETA1] = round(main_dataset.loc[(main_dataset.step_dimer==STEP)][THETA1].mean(), 3)
        
mpn_dataset.at['MN', 'count'] = mpn_dataset['count'].sum()
for THETA1 in theta_lst:
    mpn_dataset.at['MN', THETA1] = round( mpn_dataset[THETA1].sum()/len(DIMERS), 3)
mpn_dataset['count'] = mpn_dataset['count'].astype(int)

mpn_dataset

Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
AA,5691,-0.023,-0.033,35.245,0.006,-0.257,3.244
AC,5182,0.01,1.58,32.226,-0.001,-0.597,3.261
AG,5078,-0.312,3.2,32.363,-0.027,-0.311,3.308
AT,7521,0.0,0.033,30.971,-0.0,-0.671,3.229
CA,5403,-0.024,5.461,35.007,-0.052,0.198,3.336
CC,5112,0.088,4.712,33.283,0.01,-0.278,3.36
CG,7670,-0.0,6.438,33.99,0.0,0.37,3.34
CT,5078,0.312,3.2,32.363,0.027,-0.311,3.308
GA,5399,-0.066,1.938,36.308,-0.026,-0.081,3.285
GC,7154,0.0,2.415,33.92,0.0,-0.382,3.297


### Try: structural means, N entries = 70% of least dimer count

In [6]:
Nmin = mpn_dataset['count'].min()
Nthres = int(0.70*Nmin)

Nthres

3554

In [7]:
mpn_dataset_2 = pd.DataFrame(index=mpn_dataset.index, 
                                 columns=mpn_dataset.columns)
for STEP in DIMERS:

    df = main_dataset.copy().loc[main_dataset.step_dimer==STEP].sample(n=Nthres)
    mpn_dataset_2.at[STEP, 'count']=len(df)
    for THETA1 in theta_lst:
        mpn_dataset_2.at[STEP, THETA1] = round(df.loc[(df.step_dimer==STEP)][THETA1].mean(), 3)

    del df

mpn_dataset_2.at['MN', 'count'] = mpn_dataset_2['count'].sum()
for THETA1 in theta_lst:
    mpn_dataset_2.at['MN', THETA1] = round( mpn_dataset_2[THETA1].sum()/len(DIMERS), 3)

mpn_dataset_2['count'] = mpn_dataset_2['count'].astype(int)
mpn_dataset_2

Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
AA,3554,0.016,-0.137,35.372,0.009,-0.256,3.246
AC,3554,0.001,1.64,32.199,0.007,-0.599,3.26
AG,3554,-0.355,3.268,32.327,-0.024,-0.313,3.308
AT,3554,-0.014,0.042,30.956,-0.009,-0.672,3.224
CA,3554,-0.039,5.425,35.075,-0.057,0.197,3.337
CC,3554,0.086,4.778,33.261,0.009,-0.281,3.361
CG,3554,0.035,6.506,34.011,0.006,0.368,3.345
CT,3554,0.28,3.167,32.4,0.031,-0.307,3.31
GA,3554,-0.013,1.971,36.33,-0.012,-0.077,3.283
GC,3554,0.007,2.454,34.011,0.002,-0.376,3.3


In [8]:
random_sample_test_df = pd.DataFrame(index=[i for i in range(0, 50)], 
                                            columns=mpn_dataset.columns)

for i in range(0, 50):
    mpn_dataset_x = pd.DataFrame(index=mpn_dataset.index, 
                                 columns=mpn_dataset.columns)
    for STEP in DIMERS:

        df = main_dataset.copy().loc[main_dataset.step_dimer==STEP].sample(n=Nthres)
        mpn_dataset_x.at[STEP, 'count']=len(df)
        for THETA1 in theta_lst:
            mpn_dataset_x.at[STEP, THETA1] = round(df.loc[(df.step_dimer==STEP)][THETA1].mean(), 3)

        del df
    
    random_sample_test_df.at[i, 'count'] = mpn_dataset_x['count'].sum()#.astype(int)
    for THETA1 in theta_lst:
        random_sample_test_df.at[i, THETA1] = round( mpn_dataset_x[THETA1].sum()/len(DIMERS), 3)
    
    del mpn_dataset_x
    
random_sample_test_df['count']=random_sample_test_df['count'].astype(int)
random_sample_test_df[theta_lst]=random_sample_test_df[theta_lst].astype(float)

In [9]:
random_sample_test_df.describe()

Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,56864.0,-0.00046,2.79828,34.07628,3.469447e-20,-0.19278,3.29992
std,0.0,0.009786,0.011366,0.008846,0.001484615,0.001093,0.000665
min,56864.0,-0.017,2.774,34.058,-0.003,-0.195,3.299
25%,56864.0,-0.0085,2.791,34.071,-0.001,-0.193,3.3
50%,56864.0,-0.001,2.7975,34.076,0.0,-0.193,3.3
75%,56864.0,0.007,2.807,34.082,0.001,-0.192,3.3
max,56864.0,0.019,2.829,34.1,0.003,-0.19,3.302


### Try: include resolution cut-off

In [10]:
dataset = main_dataset.copy().loc[main_dataset.resolution<=3.0].reset_index(drop=True)

In [11]:
res_mpn_dataset = pd.DataFrame(index=DIMERS)

for STEP in DIMERS:
    res_mpn_dataset.at[STEP, 'count'] = len( dataset[(dataset.step_dimer==STEP)] )
    
    for THETA1 in theta_lst:
        res_mpn_dataset.at[STEP, THETA1] = round(dataset.loc[(dataset.step_dimer==STEP)][THETA1].mean(), 3)
        
res_mpn_dataset.at['MN', 'count'] = res_mpn_dataset['count'].sum()
for THETA1 in theta_lst:
    res_mpn_dataset.at['MN', THETA1] = round( res_mpn_dataset[THETA1].sum()/len(DIMERS), 3)
res_mpn_dataset['count'] = res_mpn_dataset['count'].astype(int)

res_mpn_dataset

Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
AA,4054,0.02,-0.122,35.083,0.021,-0.268,3.237
AC,3710,0.192,1.687,31.869,-0.003,-0.636,3.256
AG,3715,-0.38,3.561,32.239,-0.035,-0.335,3.304
AT,5925,0.0,0.111,30.659,-0.0,-0.682,3.224
CA,3999,-0.004,5.595,35.096,-0.063,0.174,3.333
CC,3817,0.057,4.849,33.263,0.009,-0.325,3.365
CG,5522,0.0,6.393,34.255,0.0,0.354,3.34
CT,3715,0.38,3.561,32.239,0.035,-0.335,3.304
GA,4227,-0.057,1.95,36.302,-0.023,-0.105,3.281
GC,5242,0.0,2.534,33.432,0.0,-0.427,3.291


In [12]:
mpn_dataset_3 = pd.DataFrame(index=mpn_dataset.index, 
                                 columns=mpn_dataset.columns)
for STEP in DIMERS:

    df = dataset.copy().loc[dataset.step_dimer==STEP].sample(n=Nthres)
    mpn_dataset_3.at[STEP, 'count']=len(df)
    for THETA1 in theta_lst:
        mpn_dataset_3.at[STEP, THETA1] = round(df.loc[(df.step_dimer==STEP)][THETA1].mean(), 3)

    del df

mpn_dataset_3.at['MN', 'count'] = mpn_dataset_3['count'].sum()
for THETA1 in theta_lst:
    mpn_dataset_3.at['MN', THETA1] = round( mpn_dataset_3[THETA1].sum()/len(DIMERS), 3)

mpn_dataset_3['count'] = mpn_dataset_3['count'].astype(int)
mpn_dataset_3


Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
AA,3554,0.003,-0.142,35.083,0.019,-0.268,3.237
AC,3554,0.2,1.685,31.879,-0.004,-0.635,3.257
AG,3554,-0.347,3.565,32.245,-0.032,-0.336,3.304
AT,3554,-0.003,0.169,30.641,-0.004,-0.681,3.226
CA,3554,-0.015,5.558,35.106,-0.063,0.181,3.331
CC,3554,0.045,4.802,33.265,0.012,-0.323,3.364
CG,3554,0.015,6.426,34.223,0.007,0.356,3.336
CT,3554,0.358,3.563,32.234,0.034,-0.333,3.305
GA,3554,-0.047,1.99,36.295,-0.03,-0.106,3.28
GC,3554,0.003,2.518,33.43,-0.003,-0.426,3.29


In [13]:
random_sample_test_df2 = pd.DataFrame(index=[i for i in range(0, 50)], 
                                            columns=mpn_dataset.columns)

for i in range(0, 50):
    mpn_dataset_x = pd.DataFrame(index=mpn_dataset.index, 
                                 columns=mpn_dataset.columns)
    for STEP in DIMERS:

        df = main_dataset.copy().loc[main_dataset.step_dimer==STEP].sample(n=Nthres)
        mpn_dataset_x.at[STEP, 'count']=len(df)
        for THETA1 in theta_lst:
            mpn_dataset_x.at[STEP, THETA1] = round(df.loc[(df.step_dimer==STEP)][THETA1].mean(), 3)

        del df
    
    random_sample_test_df2.at[i, 'count'] = mpn_dataset_x['count'].sum()#.astype(int)
    for THETA1 in theta_lst:
        random_sample_test_df2.at[i, THETA1] = round( mpn_dataset_x[THETA1].sum()/len(DIMERS), 3)
    
    del mpn_dataset_x
    
random_sample_test_df2['count']=random_sample_test_df2['count'].astype(int)
random_sample_test_df2[theta_lst]=random_sample_test_df2[theta_lst].astype(float)

In [14]:
random_sample_test_df2.describe()

Unnamed: 0,count,tilt,roll,twist,shift,slide,rise
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,56864.0,0.00056,2.79944,34.07212,4e-05,-0.19316,3.2999
std,0.0,0.008844,0.012925,0.011406,0.001726,0.001788,0.000647
min,56864.0,-0.018,2.77,34.041,-0.005,-0.197,3.299
25%,56864.0,-0.00575,2.792,34.067,-0.001,-0.195,3.29925
50%,56864.0,0.0015,2.799,34.074,0.0,-0.193,3.3
75%,56864.0,0.006,2.806,34.079,0.00175,-0.192,3.3
max,56864.0,0.032,2.839,34.094,0.003,-0.189,3.301


### Scratch

In [None]:
for THETA in theta_lst:#['twist']:#
    
    plt_df= pd.DataFrame(index=[a+b for a in ['A','G','C','T'] for b in ['A','G','C','T']],
                         columns=[i[::-1] for i in [a+b for a in ['A','G','C','T'] for b in ['A','G','C','T']]]).T
    
    for dim1 in plt_df.index:
        for dim2 in plt_df.columns:
            plt_df.at[dim1, dim2] = mpn_dataset[mpn_dataset.index==dim1+dim2][THETA]
            
    plt_df = plt_df.astype(float)
    
    
    fig, ax = plt.subplots(figsize=(8,8))
    
    if THETA in ['tilt','roll','twist']:
        formatter = '.1f'
        rounder = 1
        symbol = "$^\circ$"
    else:
        formatter = '.2f'
        rounder = 2
        symbol = "$\AA$"
        
    sns.heatmap(data=plt_df,
                cbar=True, annot=True,fmt=formatter,
                cmap="coolwarm", 
                center=mpn_dataset.at['MN', THETA], #vmin=spread_values[THETA][0], vmax=spread_values[THETA][1],
                cbar_kws={'orientation':'horizontal','pad':0.05, 
                          'label':"Avg "+THETA.capitalize()+" (MpN = {x}{y})".format(x=str(round(mpn_dataset.at['MN', THETA], rounder)), y=symbol),
                          'shrink':0.50},
                linewidths=1,linecolor='white',
                ax=ax)
    del formatter, rounder, symbol
    #plt.title("Tetrameric Averages\n")
    plt.yticks(rotation=0)
    ax.xaxis.tick_top()
    plt.tight_layout()
    #plt.savefig("czapla2022_heatmap_par-avg-tet_"+THETA+"_v02.png", dpi=300)
    plt.show()
    plt.clf()
    
    #plt_df.to_csv("czapla2022_heatmap_par-avg-tet_"+THETA+"_v02_figdata")
    del plt_df

In [None]:
df1 = mpn_dataset.copy().drop(['MN'], axis=0)

df1['dimer']=mpn_dataset.drop(['MN'], axis=0).index.str[1:3]

mpn_dim_dataset=pd.DataFrame(index=DIMERS)
for STEP in DIMERS:
    mpn_dim_dataset.at[STEP, 'count'] = df1[(df1.dimer==STEP)]['count'].sum()
    
    for THETA1 in theta_lst:
        mpn_dim_dataset.at[STEP, THETA1] = round( df1[(df1.dimer==STEP)][THETA1].sum() / len(df1[(df1.dimer==STEP)]) , 3)
del df1
mpn_dim_dataset.at['MN', 'count'] = mpn_dim_dataset['count'].sum()
for THETA1 in theta_lst:
    mpn_dim_dataset.at['MN', THETA1] = round( mpn_dim_dataset[THETA1].sum()/len(DIMERS), 3)
mpn_dim_dataset['count']=mpn_dim_dataset['count'].astype(int)

mpn_dim_dataset.to_csv(FFNAME+"_par-avg_dimer_tet-seq-space")

mpn_dim_dataset

In [None]:
dimer_lst_02 = [a+b for a in ['A','G','C','T'] for b in ['A','G','C','T']]
dimer_df     = pd.DataFrame(index=['A','G','C','T'],columns=['A','G','C','T'])

for b1 in dimer_df.index:
    for b2 in dimer_df.columns:
        dimer_df.at[b1, b2] = len( main_dataset.loc[main_dataset.step_dimer==b1+b2] )
dimer_df=dimer_df.astype(int)

tetramer_df = pd.DataFrame(index=[i for i in dimer_lst_02],
                           columns=[i[::-1] for i in dimer_lst_02]).T
del dimer_lst_02
for dim1 in tetramer_df.index:
    for dim2 in tetramer_df.columns:
        tetramer_df.at[dim1, dim2]= len( main_dataset.loc[main_dataset.step_tetramer==dim1+dim2] )
tetramer_df=tetramer_df.astype(int)

fig, ax = plt.subplots(figsize=(4,4))

sns.heatmap(data=dimer_df,
            cbar=False, annot=True,fmt='d',
            cmap="GnBu",
            linewidths=1,linecolor='white',
           ax=ax)
#plt.title(FFNAME+" Tetramers\n")
ax.xaxis.tick_top()
plt.yticks(rotation=0)
plt.savefig("czapla2022_heatmap_count-dimer-all_v01.png", dpi=300)
plt.show()
plt.clf()

fig, ax = plt.subplots(figsize=(8,8))

sns.heatmap(data=tetramer_df,
            cbar=False, annot=True,fmt='d',
            cmap="GnBu",
            linewidths=1,linecolor='white',
           ax=ax)
#plt.title(FFNAME+" Tetramers\n")
ax.xaxis.tick_top()
plt.yticks(rotation=0)
plt.savefig("czapla2022_heatmap_count-tetramer-all_v01.png", dpi=300)
plt.show()
plt.clf()


dimer_df.to_csv("czapla2022_heatmap_count-dimer-all_v01_figdata")
tetramer_df.to_csv("czapla2022_heatmap_count-tetramer-all_v01_figdata")

for b1 in dimer_df.index:
    for b2 in dimer_df.columns:
        if b1+b2 not in DIMER_LST:
            dimer_df.at[b1, b2] = 0
for dim1 in tetramer_df.index:
    for dim2 in tetramer_df.columns:
        if dim1+dim2 not in TETRAMER_LST:
            tetramer_df.at[dim1, dim2]= 0

fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(data=tetramer_df,
            cbar=False, annot=True,fmt='d',
            cmap="GnBu",
            linewidths=1,linecolor='white',
            mask=( tetramer_df==0 ),
           ax=ax)
ax.xaxis.tick_top()
plt.yticks(rotation=0)

axins = inset_axes(ax,  "40%", "40%" ,loc="lower right", borderpad=0.5)
sns.heatmap(data=dimer_df,
            cbar=False, annot=True,fmt='d',
            cmap="GnBu",
            linewidths=1,linecolor='white', 
            mask=( dimer_df==0 ),
           ax=axins)
#plt.title("Dimer Count\n")
axins.xaxis.tick_top()
plt.yticks(rotation=0)
plt.savefig("czapla2022_heatmap_count-dim+tet-all_v01.png", dpi=300)
plt.show()
plt.clf()

del dimer_df, tetramer_df

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(10,5))

tlst = ['tilt','roll','twist']
for i in range(len(tlst)):
    sns.histplot(data=main_dataset, x=tlst[i], kde=True,
                color="grey", alpha=0.50, ax=axes[0][i])
    
tlst = ['shift','slide','rise']
for i in range(len(tlst)):
    sns.histplot(data=main_dataset, x=tlst[i], kde=True,
                color="gray", alpha=0.50, ax=axes[1][i])

del tlst

plt.tight_layout()
plt.show()
#plt.savefig("czapla2022_hist-kde_dim-tet_full-dataset_parameters_01.png", dpi=300)
plt.clf()