In [None]:
import os, sys, time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Directory the notebook is run from; data locations are built relative to it.
path = os.getcwd()

# Folder that holds the per-year time-series input/output files.
datapath = f"{path}/pdb_time-series-data"

# Prefix used when composing file names for this data set.
ffname = "czapla2022"

In [None]:
# Per-structure metadata (later cells read its `pdb_id` and `deposit_year` columns).
# index_col=False keeps every CSV column as data instead of using one as the index.
detdf   = pd.read_csv("czapla2022_pdbid_data", index_col=False)

# Step-level table; the first CSV column becomes the index.
# NOTE(review): "3sig" in the file name presumably refers to 3-sigma culling — confirm.
dataset  = pd.read_csv("czapla2022_3sig_tet", index_col=0)

# Step-level table that already carries a `year` column, read from the time-series folder.
timedataset = pd.read_csv(datapath+'/czapla2022_yearly_dataset', index_col=0)

In [None]:
# Sanity check: number of rows in the yearly time-series table.
print(len(timedataset))

### Lists and functions

In [None]:
# The six step parameters, in the column order used throughout the notebook.
theta_lst = ['tilt', 'roll', 'twist', 'shift', 'slide', 'rise']

# Dimer steps grouped by purine (R) / pyrimidine (Y) class.
lstRR = ['AG', 'GG', 'AA', 'GA']
lstYR = ['CG', 'CA', 'TA']
lstRY = ['AT', 'AC', 'GC']
# All ten dimers, ordered YR, then RR, then RY (a new list, not an alias).
DIMER_LST = lstYR + lstRR + lstRY

# --- Lists of dimers, tetramers, and the self-complementary steps ---
BASES = list('ACGT')
# Watson-Crick partner of each base; '.' is a placeholder that pairs with itself.
COMP = dict(zip('ATCG.', 'TAGC.'))

def dna_seq_complement(sequence_string):
    """Return the reverse complement of a DNA sequence.

    The sequence is read back to front and every base is swapped for its
    pairing partner (A<->T, C<->G). The placeholder '.' maps to itself.
    Any other character raises ``KeyError``.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '.': '.'}
    return ''.join(pairing[base] for base in reversed(sequence_string))

# Every ordered pair of bases (16 dimers).
DIMERS    = [first + second for first in BASES for second in BASES]
# Every four-letter step; the two outer positions may also be the '.' placeholder.
TETRAMERS = [a + b + c + d
             for a in BASES + ['.']
             for b in BASES
             for c in BASES
             for d in BASES + ['.']]

# Steps that read the same as their own reverse complement.
SCDIM     = []
SCTET     = []

for DIM in DIMERS:
    if dna_seq_complement(DIM) == DIM:
        SCDIM.append(DIM)
for TET in TETRAMERS:
    # Tetramers containing the '.' placeholder are never counted as self-complementary.
    if '.' not in TET and dna_seq_complement(TET) == TET:
        SCTET.append(TET)

# Hand-written list of 136 tetramer steps.
# NOTE(review): presumably one representative per reverse-complement pair — confirm.
TETRAMER_LST = [
    'AAAA','AACA','AAGA','AATA','ACAA','ACGA','AGAA','AGCA','AGGA','ATAA',
    'AAAC','AACC','AAGC','AATC','ACAC','ACGC','AGAC','AGCC','AGGC','ATAC',
    'AAAG','AACG','AAGG','AATG','ACAG','ACGG','AGAG','AGCG','AGGG','ATAG',
    'AAAT','AACT','AAGT','AATT','ACAT','ACGT','AGAT','AGCT','AGGT','ATAT',
    'CAAA','CACA','CAGA','CATA','CCAA','CCGA','CGAA','CGCA','CGGA','CTAA',
    'CAAC','CACC','CAGC','CCAC','CGAC','CGGC','CAAG','CACG','CAGG','CATG',
    'CCAG','CCGG','CGAG','CGCG','CGGG','CTAG','CAAT','CACT','CAGT','CCAT',
    'CGAT','CGGT','GAAA','GACA','GAGA','GATA','GCAA','GCGA','GGAA','GGCA',
    'GGGA','GTAA','GAAC','GACC','GAGC','GATC','GCAC','GCGC','GGAC','GGCC',
    'GGGC','GTAC','GAAG','GACG','GAGG','GATG','GCAG','GCGG','GGAG','GGCG',
    'GGGG','GTAG','GAAT','GACT','GAGT','GCAT','GGAT','GGGT','TAAA','TACA',
    'TAGA','TATA','TCAA','TCGA','TGAA','TGCA','TGGA','TTAA','TAAC','TACC',
    'TAGC','TCAC','TGAC','TGGC','TAAG','TACG','TAGG','TCAG','TGAG','TGGG',
    'TAAT','TACT','TAGT','TCAT','TGAT','TGGT'
]        

# Plot colour per dimer: the four RR steps are red, the three YR steps green,
# and the three RY steps royal blue (same grouping as lstRR / lstYR / lstRY).
basic_dimer_coloring = {'AG':'red','GG':'red','GA':'red','AA':'red',
                       'CG':'green','CA':'green','TA':'green',
                       'AT':'royalblue','AC':'royalblue','GC':'royalblue'}

In [None]:
def culling_dictionary(CULL_PAR, DATAFRAME):
    """Build the acceptance window of every step parameter.

    For each of the six step-parameter columns the window is
    ``[mean - CULL_PAR * std, mean + CULL_PAR * std]``, using the column's
    own mean and pandas' default standard deviation.

    Parameters
    ----------
    CULL_PAR : number
        Half-width of the window in units of the standard deviation.
    DATAFRAME : pandas.DataFrame
        Table with the columns tilt, roll, twist, shift, slide and rise.

    Returns
    -------
    dict
        Parameter name -> ``[lower, upper]`` list.
    """
    windows = {}
    for column in ('tilt', 'roll', 'twist', 'shift', 'slide', 'rise'):
        # Bracket access throughout: `DATAFRAME.shift` would resolve to the method.
        values = DATAFRAME[column]
        centre = values.mean()
        spread = CULL_PAR * values.std()
        windows[column] = [centre - spread, centre + spread]
    return windows


def parametric_culling(culling_par, DATAFRAME, CULL_DF):
    """Run one pass of sigma-culling over a table of step parameters.

    A row is kept only if all six step parameters lie inside the inclusive
    window returned by ``culling_dictionary(culling_par, DATAFRAME)``.
    A NaN value never lies inside a window, so such rows are culled.

    The test is done with one boolean mask per parameter. The previous
    row-by-row loop grew ``CULL_DF`` with one ``pd.concat`` per culled row
    and failed on a non-unique index.

    Parameters
    ----------
    culling_par : number
        Window half-width in standard deviations.
    DATAFRAME : pandas.DataFrame
        Rows to test; must contain the six step-parameter columns.
    CULL_DF : pandas.DataFrame
        Rows culled so far; newly culled rows are appended to it.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The surviving rows and the extended cull table, in that order,
        both with a fresh 0..n-1 index.
    """
    # Dictionary with the sigma limits for each parameter.
    sigma_check = culling_dictionary(culling_par, DATAFRAME)

    # Positional mask (plain numpy array), so duplicate index labels are harmless.
    within = np.ones(len(DATAFRAME), dtype=bool)
    for theta in ['tilt', 'roll', 'twist', 'shift', 'slide', 'rise']:
        lower, upper = sigma_check[theta]
        within &= DATAFRAME[theta].between(lower, upper).to_numpy()

    culled_rows = DATAFRAME.loc[~within]
    if len(culled_rows) > 0:
        CULL_DF = pd.concat([CULL_DF, culled_rows], ignore_index=True)
    CULL_DF = CULL_DF.reset_index(drop=True)
    DATAFRAME = DATAFRAME.loc[within].reset_index(drop=True)
    return DATAFRAME, CULL_DF


def culling_cycle(culling_par, CULL_CHECK_DF, STEP_DATAFRAME):
    """Repeat sigma-culling until a pass removes no further rows.

    Each pass calls ``parametric_culling`` on the rows that survived the
    previous pass, so the limits are recomputed from the reduced table.
    At least one pass is always performed.

    Parameters
    ----------
    culling_par : number
        Window half-width in standard deviations.
    CULL_CHECK_DF : pandas.DataFrame
        Table collecting culled rows; it grows with every productive pass.
    STEP_DATAFRAME : pandas.DataFrame
        Rows to be culled.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(culled rows, surviving rows)``. This order is the reverse of what
        ``parametric_culling`` returns.
    """
    while True:
        culled_before = len(CULL_CHECK_DF)
        STEP_DATAFRAME, CULL_CHECK_DF = parametric_culling(culling_par, STEP_DATAFRAME, CULL_CHECK_DF)
        # Stop once a pass has not added anything to the cull table.
        if len(CULL_CHECK_DF) - culled_before <= 0:
            break
    return CULL_CHECK_DF, STEP_DATAFRAME



### Year counts

In [None]:
# Attach each structure's deposition year (from `detdf`) to every row of `dataset`.
# A single keyed lookup replaces the row-by-row loop. The loop needed `dataset`
# to carry a 0..n-1 index, re-filtered `detdf` for every row, and relied on
# int() of a one-element Series, which pandas has deprecated.
year_lookup = detdf[['pdb_id', 'deposit_year']].drop_duplicates()
if year_lookup['pdb_id'].duplicated().any():
    raise ValueError("detdf lists more than one deposit_year for the same pdb_id")

dataset['year'] = dataset['pdb_id'].map(year_lookup.set_index('pdb_id')['deposit_year'])
if dataset['year'].isna().any():
    raise KeyError("some pdb_id values in dataset have no usable deposit_year in detdf")
dataset['year'] = dataset['year'].astype(int)
del year_lookup

In [None]:
# Overlay two per-year histograms on one axis: the yearly time-series table (blue)
# and `dataset` (green). Needs the `year` column added to `dataset` by the previous cell.
fig, ax = plt.subplots(figsize=(10,4))

# One bin per year, with a kernel-density curve on top.
sns.histplot(data=timedataset,
             x='year',
             binwidth=1, color="blue",
             kde=True,
            ax=ax)

sns.histplot(data=dataset,
             x='year',
             binwidth=1, color="green",
             kde=True,
            ax=ax)
#plt.savefig(ffname+"_hist-by-year_v01.png", dpi=300)
plt.show()
plt.clf()

In [None]:
# Per-year histogram of the time-series table, stacked by dimer step.
fig, ax = plt.subplots(figsize=(10,4))

# Only rows whose step_dimer is one of the ten entries of DIMER_LST are plotted.
sns.histplot(data=timedataset[timedataset.step_dimer.isin(DIMER_LST)],
                x='year',
                hue='step_dimer', binwidth=1, multiple='stack',
                ax=ax)

# Move the legend below the axes as one row (ncol=10, one entry per dimer).
sns.move_legend(ax, 
                loc="lower center",
                ncol=10,
                title=None,
                frameon=True,
                bbox_to_anchor=(0.5,-0.4))

plt.tight_layout()
#plt.savefig(ffname+"_hist-by-year_dimer_v01.png", dpi=300)
plt.show()
plt.clf()

------------

### Dimeric Data

In [None]:
# Count table: one column per even year 2000..2022, one row each for the number of
# distinct PDB ids ('pdb_ct'), the number of rows ('step_ct'), and each of the 16 dimers.
yeardf   = pd.DataFrame(columns=[2000+2*i for i in range(0, 12)], 
                        index=['pdb_ct']+['step_ct']+[i for i in DIMERS])

for YEAR in [2000+2*i for i in range(0, 12)]:
    
    # Per-year data file; its first CSV column becomes the index.
    DF  = pd.read_csv(datapath+"/czapla2022_pdb-series_year-series_"+str(YEAR)+"_data", index_col=0) 

    yeardf.at['pdb_ct', YEAR]  = len(DF.pdb_id.unique())    
    yeardf.at['step_ct', YEAR] = len(DF)
    
    # Number of rows in this year's file for every dimer (0 if the dimer never occurs).
    for X in DIMERS:
        yeardf.at[X, YEAR] = len(DF.loc[(DF.step_dimer==X)])
    del DF
    
#yeardf = yeardf.drop(['CC','CT','GT','TC','TG','TT'], axis=0)
yeardf

In [None]:
# Line plot of the per-dimer counts against year.
fig,ax = plt.subplots(1,1,figsize=(7,4))

# Keep only the dimer rows (drop the two summary rows).
pltdf=yeardf.copy().drop(['pdb_ct','step_ct'], axis=0)

# Six dimers are dropped, leaving ten rows; after the transpose each becomes one line.
# The marker, palette and dash lists therefore have ten entries each.
x = sns.lineplot(data=pltdf.drop(['CC','CT','GT','TC','TG','TT'], axis=0).T,
                 markers=['X','o','X','o','o','o','o','X','o','X'],
                 palette=['orange','orange','blue','red','green','red','green','red','red','green'], 
                 dashes=[(1,0)]*10,
                 ax=ax)

# Legend placed outside the axes on the right; one tick per year column.
x.legend(title='Dimers', loc="upper right", bbox_to_anchor=(1.2, 1))
x.set_xticks(pltdf.T.index)

plt.tight_layout()
plt.savefig("czapla2022_pdb_count_time-series_v03.png", dpi=300)
plt.show()
plt.clf()

del x, pltdf

In [None]:
# Grouped bar chart: one group per dimer in DIMER_LST, one bar per year column.
fig, ax0 = plt.subplots(1, 1, figsize=(10,4), sharey=True)

yeardf.loc[DIMER_LST].plot.bar(cmap='coolwarm', 
                               width=0.75, edgecolor='black', rot=0, 
                               ax=ax0)

# The legend sits below the axes, so it is passed to savefig as an extra artist
# and bbox_inches='tight' is used to keep it inside the saved image.
lgd = plt.legend(loc="lower center", ncol=9, bbox_to_anchor=(0.5, -0.3))
plt.tight_layout()
plt.savefig("czapla2022_pdb-time-dimer-counts_v03.png", dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.show()
plt.clf()
del lgd

In [None]:
# Persist the count table as figure data (a later cell reads this file back),
# then drop it from the namespace.
yeardf.to_csv("czapla2022_pdb-year-dimer-counts_v03_figdata")
del yeardf

-----------

### Parametric

In [None]:
# y-axis range [lower, upper] applied to each step parameter's panel in the bar plots.
# NOTE(review): units are presumably degrees for tilt/roll/twist and Angstrom for
# shift/slide/rise — confirm.
PARLIMITS = dict(
    tilt=[-5, 5],
    roll=[-1, 9.0],
    twist=[30, 40],
    shift=[-0.75, 0.75],
    slide=[-0.75, 0.75],
    rise=[2.5, 4.0],
)



In [None]:
# For every step parameter, build a (dimer x year) table of the parameter's mean in
# each per-year data file, add an 'MN' row with the unweighted mean over dimers, and
# write the table to the time-series folder (the plotting cells read these files back).
#
# Each yearly file is read once; the previous version re-read all twelve files
# for every one of the six parameters.
YEAR_LST = [2000+2*i for i in range(0, 12)]
par_tables = {PAR: pd.DataFrame(columns=YEAR_LST, index=[i for i in DIMERS]) for PAR in theta_lst}

for YEAR in YEAR_LST:
    data_df = pd.read_csv(datapath+"/czapla2022_pdb-series_year-series_"+str(YEAR)+"_data", index_col=0)

    for X in DIMERS:
        step_rows = data_df[data_df.step_dimer==X]
        for PAR in theta_lst:
            # The mean of an empty selection is NaN, i.e. "no data for this dimer".
            par_tables[PAR].at[X, YEAR] = step_rows[PAR].mean()
        del step_rows
    del data_df

for PAR in theta_lst:
    testpar = par_tables[PAR]
    # Average only over dimers that have data. Dividing the column sum by
    # len(DIMERS) counted dimers without rows as zero and understated the mean;
    # the two forms agree whenever all sixteen dimers are populated.
    testpar.loc['MN'] = testpar.astype(float).mean()
    testpar.to_csv(datapath+"/czapla2022_pdb-series_year-series_"+PAR+"-dim-structurespace_jan2022")
    del testpar
del par_tables
    

In [None]:
# 3x2 grid of grouped bar charts: one panel per step parameter, one group per dimer
# in DIMER_LST, one bar per year. Reads the per-parameter tables written earlier.
fig, axes = plt.subplots(3, 2, figsize=(12,7))

# Not applied in this cell: the set_theme call that would consume it is commented out.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
#sns.set_theme(style="ticks", rc=custom_params)
# [row, column] of the panel used for each parameter.
AXES={'tilt':[0,0], 'roll':[1,0], 'twist':[2,0],
      'shift':[0,1], 'slide':[1,1], 'rise':[2,1]}

for PAR in theta_lst:
    
    paryeardf = pd.read_csv(datapath+"/czapla2022_pdb-series_year-series_"+PAR+"-dim-structurespace_jan2022",index_col=0)
    # After the CSV round trip the year column labels are strings, hence str(...).
    pltdf = paryeardf[[str(2000+2*i) for i in range(0, 12)]]
    
    pltdf.loc[DIMER_LST].plot.bar(legend=False,# yerr=devdf, 
                                  cmap='coolwarm', width=0.75, edgecolor='black', rot=0, ax=axes[AXES[PAR][0],AXES[PAR][1]])
    
    # Fixed y-range per parameter, taken from PARLIMITS.
    axes[AXES[PAR][0],AXES[PAR][1]].set_ylabel(PAR)
    axes[AXES[PAR][0],AXES[PAR][1]].set_ylim(PARLIMITS[PAR][0],PARLIMITS[PAR][1])
    
    del paryeardf, pltdf
    #del devdf

#plt.legend(loc="lower center", ncol=6, bbox_to_anchor=(0.5, -0.45))
plt.tight_layout()
plt.savefig("czapla2022_yearly-par-avg_struc-space_v3.png", dpi=300)
plt.show()
plt.clf()


In [None]:
# Same 3x2 grid as the previous figure, with the extra 'MN' (mean over dimers) group
# appended after the ten dimers.
fig, axes = plt.subplots(3, 2, figsize=(12,7))

# Hide the top/right spines. sns.set_theme changes global plotting defaults,
# so it also affects figures drawn after this cell.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
# [row, column] of the panel used for each parameter.
AXES={'tilt':[0,0], 'roll':[1,0], 'twist':[2,0],
      'shift':[0,1], 'slide':[1,1], 'rise':[2,1]}

for PAR in theta_lst:
    
    paryeardf = pd.read_csv(datapath+"/czapla2022_pdb-series_year-series_"+PAR+"-dim-structurespace_jan2022",index_col=0)
    # After the CSV round trip the year column labels are strings, hence str(...).
    pltdf = paryeardf[[str(2000+2*i) for i in range(0, 12)]].copy()

    # Rows to plot: the ten dimers followed by the 'MN' row.
    STEPLST=DIMER_LST+['MN']
    
    pltdf.loc[STEPLST].plot.bar(legend=False,# yerr=devdf, 
                                  cmap='coolwarm', width=0.75, edgecolor='black', rot=0, ax=axes[AXES[PAR][0],AXES[PAR][1]])
    
    # Fixed y-range per parameter, taken from PARLIMITS.
    axes[AXES[PAR][0],AXES[PAR][1]].set_ylabel(PAR)
    axes[AXES[PAR][0],AXES[PAR][1]].set_ylim(PARLIMITS[PAR][0],PARLIMITS[PAR][1])
    
    #pltdf.to_csv("czapla2022_pdb-series_year-series_"+PAR+"_dim-struc-space_MN_figdata")
    
    del paryeardf, pltdf, STEPLST

#plt.legend(loc="lower center", ncol=6, bbox_to_anchor=(0.5, -0.45))
plt.tight_layout()
plt.savefig("czapla2022_yearly-par-avg_struc-space_MN_v3.png", dpi=300)
plt.show()
plt.clf()



-----------------
### Volumetric

In [None]:
# "Volume" of every dimer step for each year: the square root of the product of the
# eigenvalues of the inverse of that step's 6x6 force-constant matrix.
#
# One year list drives both the table columns and the loop. Previously the columns
# stopped at 2020 while the loop ran to 2022, so the last column only came into
# being through setting-with-enlargement.
volume_years = [2000+2*i for i in range(0, 12)]
collection_df = pd.DataFrame(index=DIMER_LST, columns=volume_years)

for YEAR in volume_years:

    # Each line is split on "={" and on ", " so that column 0 is the step name and
    # columns 1..36 are the matrix entries. The separator is a regular expression
    # and is written as a raw string so that "\s" is not an invalid string escape.
    fcdf = pd.read_csv(datapath+"/ForceConstants_czapla2022_"+str(YEAR)+"_dim.txt",
                            header=None, index_col=0, sep=r',\s+|={', engine="python")

    # The last field still carries the closing "}," of the record.
    fcdf[36]=fcdf[36].map(lambda x: x.rstrip('},'))
    fcdf=fcdf.astype(float)

    dim_df = pd.DataFrame(index=DIMER_LST, columns=["eigenvalue_product"]+theta_lst)
    for STEP in fcdf.index:
        # Only the ten steps of DIMER_LST are used, so other rows are not inverted.
        if STEP not in DIMER_LST:
            continue
        mat    = np.reshape(fcdf.loc[STEP].to_numpy(), (6,6))
        eigen_inv = np.linalg.eig( np.linalg.inv(mat) )[0]
        dim_df.at[STEP, "eigenvalue_product"] = np.prod( eigen_inv )
        # The i-th eigenvalue is stored in the i-th parameter column purely as a slot.
        # NOTE(review): eigenvalues are not tied to individual step parameters — confirm intent.
        for par_name, eigenvalue in zip(theta_lst, eigen_inv):
            dim_df.at[STEP, par_name] = eigenvalue
        del mat, eigen_inv

    for dim in DIMER_LST:
        collection_df.at[dim, YEAR] = np.sqrt( dim_df.at[dim, 'eigenvalue_product'] )

    #del fcdf, dim_df

del volume_years
collection_df = collection_df.astype(float)       
collection_df

In [None]:
# 'MN' row of collection_df: for every year, the volume of the whole data set up to
# and including that year, computed as sqrt(product of the eigenvalues) of the 6x6
# covariance matrix of the step parameters. Covariances are E[xy] - E[x]E[y],
# rounded to six decimals.
for YEAR in [2000+2*i for i in range(0, 12)]:

    # A single .loc selection followed by reset_index gives an independent frame,
    # so adding the product columns below does not raise SettingWithCopyWarning.
    df   = timedataset.loc[timedataset.year<=YEAR, theta_lst].reset_index(drop=True)
    for t1 in theta_lst:
        for t2 in theta_lst:
            df[t1+'.'+t2] = df[t1] * df[t2]
    # Column means: first moments (six parameter columns) and second moments (36 products).
    mndf = pd.DataFrame(index=['MN'], columns=df.columns)
    mndf.loc['MN']=df.mean()
    del df

    covdf = pd.DataFrame(index=mndf.index)
    for t1 in theta_lst:
        for t2 in theta_lst:
            covdf.at['MN', t1+'.'+t2] = round( mndf.at['MN', t1+'.'+t2] - (mndf.at['MN',t1]*mndf.at['MN',t2]) , 6)

    # Entries were filled row-major in theta_lst order, so the reshape restores the matrix.
    mat_covar = np.reshape(covdf.loc['MN'].to_numpy(dtype=float), (6,6))
    # The matrix is symmetric by construction (x*y == y*x). eigvalsh is the routine
    # for symmetric matrices and always returns real eigenvalues, whereas the
    # general eig may return a complex array.
    collection_df.at['MN', YEAR] = np.sqrt( np.prod( np.linalg.eigvalsh( mat_covar ) )   )

In [None]:
# Two grouped bar charts of the step volumes (one group per step, one bar per year):
# first with the 'MN' row included, then with the dimers only.
#
# The axis label is a raw string so that "\AA" (mathtext for the Angstrom sign) is
# not an invalid string escape; the resulting text is unchanged. Each figure is
# closed explicitly after being shown, because plt.clf() at that point would create
# and leave behind a new empty figure.
fig, ax = plt.subplots(1,1,figsize=(10,4))
collection_df.plot.bar(legend=False,
                       cmap='coolwarm',
                       width=0.75,
                       edgecolor='black',
                       rot=0,
                       ax=ax)
lgd = plt.legend(loc="lower center", ncol=6, bbox_to_anchor=(0.5, -0.3))
plt.ylim(0, 15)
plt.ylabel(r"Step Volume (deg$^3$$\AA$$^3$)")
plt.tight_layout()
#plt.savefig("czapla2022_hist-volume_time-series_MN_v02.png", dpi=300)
plt.show()
plt.close(fig)

fig, ax = plt.subplots(1,1,figsize=(10,4))
# Same chart without the 'MN' row.
pltdf=collection_df.drop('MN', axis=0)
pltdf.plot.bar(legend=False,
               cmap='coolwarm',
               width=0.75,
               edgecolor='black',
               rot=0,
               ax=ax)
lgd = plt.legend(loc="lower center", ncol=6, bbox_to_anchor=(0.5, -0.3))
plt.ylim(0, 15)
plt.ylabel(r"Step Volume (deg$^3$$\AA$$^3$)")
plt.tight_layout()
#plt.savefig("czapla2022_hist-volume_time-series_v02.png", dpi=300)
plt.show()
plt.close(fig)
del pltdf


In [None]:
# Persist the volume table (dimers plus 'MN') as figure data; a later cell reads it back.
collection_df.to_csv("czapla2022_hist-volume_time-series_v02_figdata")
#del collection_df


## Scratch

In [None]:
# Scratch: combine the resolution table with the yearly table.
# Paths here are relative to the working directory rather than built from `datapath`.
rdata= pd.read_csv('pdb_res-series-data/czapla2022_res_dataset', index_col=0)
ydata= pd.read_csv('pdb_time-series-data/czapla2022_yearly_dataset', index_col=0)

# With no `on=` given, pd.merge does an inner join on every column name the two
# tables share.
# NOTE(review): if the shared columns include the float step parameters (as the
# commented-out key lists suggest), rows match only on exact float equality — confirm
# this is intended.
pdata = pd.merge(rdata, ydata#, 
                # left_on=['pdb_id','tilt','roll','twist','shift','slide','rise','step_dimer','step_tetramer'],
                # right_on=['pdb_id','tilt','roll','twist','shift','slide','rise','step_dimer','step_tetramer']
                )
del rdata, ydata
pdata

In [None]:
# Scratch: 2-D histogram of resolution against year for the merged table.
# No `ax` is passed, so seaborn draws on the current axes.
sns.histplot(data=pdata,
             y='resolution', x='year',
            #hue='step_dimer',
            legend=False)

In [None]:
# Scratch: same 2-D histogram, restricted to rows with year >= 2000.
sns.histplot(data=pdata[pdata.year>=2000],
             y='resolution', x='year',
            #hue='step_dimer',
            legend=False)

In [None]:
# Combined three-panel figure sharing the x axis: entry counts (top), mean twist
# (middle) and step volume (bottom), each as bars grouped by step with one bar per
# year. All three panels are rebuilt from files written by earlier cells.
#
# The volume label is a raw string so that "\AA" is not an invalid string escape
# (the text is unchanged). The figure is closed explicitly after being shown,
# because plt.clf() at that point would leave a new empty figure behind.
fig, axes = plt.subplots(3, 1, figsize=(10,8), sharex=True)

# Panel 0: number of rows per dimer and year.
yeardf=pd.read_csv("czapla2022_pdb-year-dimer-counts_v03_figdata", index_col=0)
yeardf.loc[DIMER_LST].plot.bar(cmap='coolwarm', legend=False,
                               width=0.75, edgecolor='black', rot=0,
                               ax=axes[0])
del yeardf

# Panel 1: mean twist per dimer and year, plus the 'MN' row.
pardf = pd.read_csv(datapath+"/"+ffname+"_pdb-series_year-series_twist-dim-structurespace_jan2022",index_col=0)
# After the CSV round trip the year column labels are strings, hence str(...).
pltdf = pardf[[str(2000+2*i) for i in range(0, 12)]]
STEPLST=DIMER_LST+['MN']
pltdf.loc[STEPLST].plot.bar(legend=False,# yerr=devdf,
                              cmap='coolwarm', width=0.75, edgecolor='black', rot=0,
                              ax=axes[1])
del pardf, pltdf, STEPLST

# Panel 2: step volume per row of the saved volume table.
collection_df = pd.read_csv("czapla2022_hist-volume_time-series_v02_figdata", index_col=0)

collection_df.plot.bar(legend=False,
                       cmap='coolwarm',
                       width=0.75,
                       edgecolor='black',
                       rot=0,
                       ax=axes[2])

axes[0].set_ylabel("Entry Count")

axes[1].set_ylim(30, 40)
axes[1].set_ylabel("Step Twist (deg)")

axes[2].set_ylim(0, 15)
axes[2].set_ylabel(r"Step Volume (deg$^3$$\AA$$^3$)")

# plt.legend attaches to the current axes. The legend sits below the axes, so it is
# passed to savefig as an extra artist and bbox_inches='tight' keeps it in the image.
lgd = plt.legend(loc="lower center", ncol=9, bbox_to_anchor=(0.5, -0.4))


plt.tight_layout()
plt.savefig("czapla2022_year-combo-plot_v01.png", dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.show()
plt.close(fig)
del lgd