Analysis and development of a new sequence-specific force field
-----------------------

PDB data collected up to 2019.

## Homopolymeric refinement

In [None]:
import os, sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Current working directory at the time this cell runs
# (not referenced again in the cells visible here).
path = os.getcwd()


In [None]:
def culling_dictionary(CULL_PAR, DATAFRAME):
    """Build the acceptance window of every step parameter.

    For each of tilt, roll, twist, shift, slide and rise the window is
    ``[mean - CULL_PAR*std, mean + CULL_PAR*std]``, where mean and std are
    taken over the matching column of DATAFRAME with the pandas defaults
    of ``Series.mean()`` and ``Series.std()``.

    Parameters
    ----------
    CULL_PAR : half-width of the window, in standard deviations.
    DATAFRAME : frame holding the six step-parameter columns.

    Returns
    -------
    dict mapping each parameter name to ``[lower, upper]``, in the order
    tilt, roll, twist, shift, slide, rise.
    """
    windows = {}
    for name in ('tilt', 'roll', 'twist', 'shift', 'slide', 'rise'):
        # Item access throughout: 'shift' collides with DataFrame.shift,
        # so attribute access would not reach that column.
        column = DATAFRAME[name]
        windows[name] = [column.mean() - CULL_PAR*column.std(),
                         column.mean() + CULL_PAR*column.std()]
    return windows

def parametric_culling(culling_par, DATAFRAME, CULL_DF):
    """Split DATAFRAME into kept rows and outliers using a sigma window.

    A row is culled when at least one of tilt, roll, twist, shift, slide or
    rise lies outside the inclusive window returned by
    ``culling_dictionary(culling_par, DATAFRAME)``.  A NaN value or a NaN
    bound never passes the window test, so such rows are culled too (for
    example a one-row frame, whose default pandas std is NaN).

    The test is evaluated column-wise instead of row by row, and the culled
    rows are appended with a single ``pd.concat`` rather than one per row.

    Parameters
    ----------
    culling_par : half-width of the window, in standard deviations.
    DATAFRAME : frame holding the six step-parameter columns.
    CULL_DF : frame of previously culled rows; new outliers are appended.

    Returns
    -------
    (DATAFRAME, CULL_DF) : the surviving rows and the accumulated culled
    rows, both re-indexed from 0.
    """
    # Dictionary with the sigma limits for each parameter
    sigma_check = culling_dictionary(culling_par, DATAFRAME)

    # Positional boolean mask: True where every parameter is within limits.
    # Working positionally keeps this correct even with repeated index labels.
    keep = np.logical_and.reduce([
        DATAFRAME[theta].between(sigma_check[theta][0], sigma_check[theta][1])
                        .to_numpy().astype(bool)
        for theta in ['tilt', 'roll', 'twist', 'shift', 'slide', 'rise']
    ])

    culled_rows = DATAFRAME[~keep]
    # Skip the concat when nothing was culled so CULL_DF is returned unchanged.
    if len(culled_rows) > 0:
        CULL_DF = pd.concat([CULL_DF, culled_rows], ignore_index=True)
    CULL_DF = CULL_DF.reset_index(drop=True)
    DATAFRAME = DATAFRAME[keep].reset_index(drop=True)
    return DATAFRAME, CULL_DF
    

In [None]:
# Names of the six step-parameter columns used throughout the notebook.
theta_lst = ['tilt','roll','twist','shift','slide','rise']

# --- Lists of dimers, tetramers, and the self-complementary steps ---
BASES     = ['A','C','G','T']
# Watson-Crick partner of each base; '.' (placeholder symbol) maps to itself.
COMP      = {'A':'T', 'T':'A', 'C':'G', 'G':'C','.':'.'}

# All 16 ordered base pairs.
DIMERS    = [first+second for first in BASES for second in BASES]
# Maps a dimer to its reverse complement for the six listed pairs.
DIM_MATCH = {'TT':'AA', 'GT':'AC', 'CT':'AG', 'CC':'GG', 'TC':'GA', 'TG':'CA'}
# Ten dimers, one per reverse-complement pair, in three groups:
# purine-pyrimidine, purine-purine, pyrimidine-purine.
DIMER_LST = [
    'AT','AC','GC',
    'AG','GG','GA','AA',
    'CG','CA','TA',
]
# Dimers that equal their own reverse complement.
SCDIM     = [dim for dim in DIMERS if COMP[dim[1]]+COMP[dim[0]] == dim]

# Tetramers whose two outer positions may also be the '.' placeholder.
_FLANKS      = BASES + ['.']
TET_LST      = [left+inner1+inner2+right
                for left in _FLANKS
                for inner1 in BASES
                for inner2 in BASES
                for right in _FLANKS]
# The 256 tetramers without any placeholder.
TETRAMERS    = [tet for tet in TET_LST if '.' not in tet]
# Hand-ordered list of 136 tetramers (order is significant; kept verbatim).
TETRAMER_LST = [
    'AAAA','AACA','AAGA','AATA','ACAA','ACGA','AGAA','AGCA','AGGA','ATAA',
    'AAAC','AACC','AAGC','AATC','ACAC','ACGC','AGAC','AGCC','AGGC','ATAC',
    'AAAG','AACG','AAGG','AATG','ACAG','ACGG','AGAG','AGCG','AGGG','ATAG',
    'AAAT','AACT','AAGT','AATT','ACAT','ACGT','AGAT','AGCT','AGGT','ATAT',
    'CAAA','CACA','CAGA','CATA','CCAA','CCGA','CGAA','CGCA','CGGA','CTAA',
    'CAAC','CACC','CAGC','CCAC','CGAC','CGGC','CAAG','CACG','CAGG','CATG',
    'CCAG','CCGG','CGAG','CGCG','CGGG','CTAG','CAAT','CACT','CAGT','CCAT',
    'CGAT','CGGT','GAAA','GACA','GAGA','GATA','GCAA','GCGA','GGAA','GGCA',
    'GGGA','GTAA','GAAC','GACC','GAGC','GATC','GCAC','GCGC','GGAC','GGCC',
    'GGGC','GTAC','GAAG','GACG','GAGG','GATG','GCAG','GCGG','GGAG','GGCG',
    'GGGG','GTAG','GAAT','GACT','GAGT','GCAT','GGAT','GGGT','TAAA','TACA',
    'TAGA','TATA','TCAA','TCGA','TGAA','TGCA','TGGA','TTAA','TAAC','TACC',
    'TAGC','TCAC','TGAC','TGGC','TAAG','TACG','TAGG','TCAG','TGAG','TGGG',
    'TAAT','TACT','TAGT','TCAT','TGAT','TGGT'
]
# Tetramers that equal their own reverse complement
# (TETRAMERS never contains '.', so no placeholder check is needed here).
SCTET        = [tet for tet in TETRAMERS
                if ''.join(COMP[base] for base in reversed(tet)) == tet]

In [None]:
# Disabled: the same preprocessing applied to the 2017 dataset.
#df1 = pd.read_csv("cohen2017_dataset", index_col=0)
#if "step_dimer" not in df1.columns:
#    df1["step_dimer"] = df1.step_tetramer.str[1:3]
#    df1.to_csv("cohen2017_dataset")
#if "dimer_neighbors" not in df1.columns:
#    df1["dimer_neighbors"] = df1.step_tetramer.str[0:1]+'__'+df1.step_tetramer.str[3:]
#    df1.to_csv("cohen2017_dataset")


# Load the 2019 dataset; the first CSV column is used as the index.
dataset2019 = pd.read_csv("young2019_dataset", index_col=0)

# Central dimer of every tetramer (characters 2-3).  When the column is
# missing it is derived and the file is rewritten in place.
if "step_dimer" not in dataset2019:
    dataset2019["step_dimer"] = dataset2019["step_tetramer"].str.slice(1, 3)
    dataset2019.to_csv("young2019_dataset")

# Flanking characters of every tetramer, joined as "<first>__<rest from 4th>".
# Same derive-and-rewrite rule as above.
if "dimer_neighbors" not in dataset2019:
    dataset2019["dimer_neighbors"] = (
        dataset2019["step_tetramer"].str.slice(0, 1)
        + '__'
        + dataset2019["step_tetramer"].str.slice(3)
    )
    dataset2019.to_csv("young2019_dataset")


In [None]:
# Rows whose flanking-base label contains no '.' placeholder.
# The pattern is a raw string: '\.' in a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.  The mask is
# computed once and reused for every count below.
complete_neighbors = ~dataset2019.dimer_neighbors.str.contains(r'\.')

# Total number of rows
print(len(dataset2019))

# Rows with both neighbours present
print(len(dataset2019.loc[complete_neighbors]))

# ...split by the from_pdb flag ('y' / 'n')
print(len(dataset2019.loc[complete_neighbors
                          &(dataset2019.from_pdb=='y')]))
print(len(dataset2019.loc[complete_neighbors
                          &(dataset2019.from_pdb=='n')]))

# Keep only the complete rows from here on, re-indexed from 0.
dataset2019 = dataset2019.loc[complete_neighbors].reset_index(drop=True)

### 3-sigma culling

#### Make sure to cull both the dimer set and the tetramer set

---------------

## Dimer dataset culling

In [None]:
# Working set: rows whose flanking-base label contains no '.' placeholder.
# Boolean indexing already returns a new frame, so dataset2019 is untouched.
# (Raw string: '\.' in a plain literal is an invalid escape sequence.)
DF = dataset2019.loc[~dataset2019.dimer_neighbors.str.contains(r'\.')].reset_index(drop=True)

df_sig  = pd.DataFrame(columns=DF.columns)   # rows that survive the culling
df_cull = pd.DataFrame(columns=DF.columns)   # rows rejected by the culling

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)

# Per-step results are collected here and concatenated once after the loop.
kept_frames   = []
culled_frames = []

for STEP in DIMERS:
    # Statistics are taken within this dimer step only.
    step_df = DF.loc[DF.step_dimer==STEP].reset_index(drop=True)
    print("--- STEP : "+STEP+" --- ---")
    print("step 1: ", len(step_df))

    # NOTE: a disabled earlier variant treated self-complementary steps
    # (SCDIM) specially, appending a copy of every kept row with tilt and
    # shift negated and from_pdb set to "n".  It is not applied here.

    df_cull_setup = pd.DataFrame(columns=step_df.columns)

    # First 3-sigma culling pass
    step_df, df_cull_setup = parametric_culling(3.0, step_df, df_cull_setup)
    print("step 2: ", len(step_df))

    # Repeat with the recomputed mean/std until a pass removes no rows.
    culled_so_far = 0
    while len(df_cull_setup) > culled_so_far:
        culled_so_far = len(df_cull_setup)
        step_df, df_cull_setup = parametric_culling(3.0, step_df, df_cull_setup)

    print("final step: ", len(step_df))
    print('-'*25)
    print()
    kept_frames.append(step_df)
    culled_frames.append(df_cull_setup)

# The empty templates come first so the column layout is kept even if no
# step contributes any row.
df_sig  = pd.concat([df_sig] + kept_frames, ignore_index=True)
df_cull = pd.concat([df_cull] + culled_frames, ignore_index=True)
print('*'*25)
print(len(df_sig))
print(len(df_cull))
print('*'*25)
df_sig.to_csv("young2019_3sig_dim")
df_cull.to_csv("young2019_3sig_dim-cull")

# Free the names so the tetramer cell starts from a clean state.
del df_sig, df_cull, DF
del kept_frames, culled_frames, step_df, df_cull_setup, culled_so_far

## Tetramer-based culling

### -!- Cull within each step's own subset, not across the whole dataset

In [None]:
# Size of the hand-written tetramer list ...
print(len(TETRAMER_LST))

# ... versus the number of distinct tetramers actually present in the data.
observed_tetramers = dataset2019["step_tetramer"].unique()
print(len(observed_tetramers))

In [None]:
# Show the first three rows of the dataset (notebook cell output).
dataset2019.head(3)

In [None]:
# Tetramers that occur in the data but are absent from TETRAMER_LST.
CHANGE_STEP = [tet for tet in dataset2019.step_tetramer.unique()
               if tet not in TETRAMER_LST]

# Rewrite every such row as its reverse-complement tetramer: tilt and shift
# are negated, from_pdb is set to 'n', and the derived label columns are
# rebuilt.  Rows are addressed by the labels 0..len-1, i.e. the frame is
# expected to carry a default index here.
for i in range(len(dataset2019)):
    old_tetramer = dataset2019.at[i, 'step_tetramer']
    if old_tetramer not in CHANGE_STEP:
        continue

    print("Change at index "+str(i)+": ", dataset2019.loc[i])

    for flipped in ('tilt', 'shift'):
        dataset2019.at[i, flipped] = -1*float(dataset2019.at[i, flipped])
    dataset2019.at[i, 'from_pdb'] = 'n'

    # Complement of the last four characters, read back to front.
    new_tetramer = ''.join(COMP[base] for base in old_tetramer[:-5:-1])

    dataset2019.at[i, 'step_tetramer']   = new_tetramer
    dataset2019.at[i, 'step_dimer']      = new_tetramer[1:3]
    dataset2019.at[i, 'dimer_neighbors'] = new_tetramer[0:1]+"__"+new_tetramer[3:]

    print("--- ", dataset2019.loc[i])

In [None]:
# Working set: rows whose flanking-base label has no '.' next to the "__"
# separator.  Boolean indexing already returns a new frame, so dataset2019 is
# untouched.  (Raw string: '\.' in a plain literal is an invalid escape
# sequence; the pattern itself is unchanged.)
DF = dataset2019.loc[~dataset2019.dimer_neighbors.str.contains(r"\.__|__\.|\.__\.")]

df_sig  = pd.DataFrame(columns=DF.columns)   # rows that survive the culling
df_cull = pd.DataFrame(columns=DF.columns)   # rows rejected by the culling

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)
print()

# Per-step results are collected here and concatenated once after the loop.
kept_frames   = []
culled_frames = []

for STEP in TETRAMERS:
    # Statistics are taken within this tetramer step only; a tetramer with no
    # rows simply contributes two empty frames.
    step_df = DF.loc[DF.step_tetramer==STEP].reset_index(drop=True)

    # NOTE: a disabled earlier variant treated self-complementary steps
    # specially, appending a copy of every kept row with tilt and shift
    # negated and from_pdb set to "n".  It is not applied here.

    df_cull_setup = pd.DataFrame(columns=step_df.columns)

    # First 3-sigma culling pass
    step_df, df_cull_setup = parametric_culling(3.0, step_df, df_cull_setup)

    # Repeat with the recomputed mean/std until a pass removes no rows.
    culled_so_far = 0
    while len(df_cull_setup) > culled_so_far:
        culled_so_far = len(df_cull_setup)
        step_df, df_cull_setup = parametric_culling(3.0, step_df, df_cull_setup)

    kept_frames.append(step_df)
    culled_frames.append(df_cull_setup)

# The empty templates come first so the column layout is kept even if no
# step contributes any row.
df_sig  = pd.concat([df_sig] + kept_frames, ignore_index=True)
df_cull = pd.concat([df_cull] + culled_frames, ignore_index=True)

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)


df_sig.to_csv("young2019_3sig_tet")
df_cull.to_csv("young2019_3sig_tet-cull")

# Free the working names once the results are written.
del df_sig, df_cull, DF
del kept_frames, culled_frames, step_df, df_cull_setup, culled_so_far