---------------
## Jan 2022 - Reorganize the new czapla2022 dataset

-----------------

## Organize bpstep datasets

In [None]:
import os, sys
import pandas as pd
import numpy as np
import seaborn as sns
import Bio.PDB as PDB
from Bio.PDB import MMCIF2Dict

# Folder holding the raw X-ray step tables, located under the working directory.
path = os.getcwd()
datapath = f"{path}/raw_data_xray-only"

### lists and defined functions:

In [None]:
# Names of the six step-parameter columns.
theta_lst = 'tilt roll twist shift slide rise'.split()

# The four bases, and the base-pairing map ('.' is mapped to itself).
BASES     = list('ACGT')
COMP      = dict(zip('ATCG.', 'TAGC.'))

def dna_seq_complement(sequence_string):
    """Return the reverse complement of ``sequence_string``.

    The sequence is read back-to-front and every character is swapped for
    its pairing partner (A<->T, C<->G); '.' is kept as '.'.

    Raises:
        KeyError: if the sequence contains a character other than
            'A', 'C', 'G', 'T' or '.'.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '.': '.'}
    return ''.join(pairing[base] for base in reversed(sequence_string))

# All 16 ordered dimers, and the 10 dimers used as canonical labels (each of
# the remaining 6 is the reverse complement of one of these).
DIMERS    = [first + second for first in BASES for second in BASES]
DIMER_LST = ['AT', 'AC', 'GC', 'AG', 'GG', 'GA', 'AA', 'CG', 'CA', 'TA']

# Dimers that are their own reverse complement.
SCDIM = []
for DIM in DIMERS:
    if dna_seq_complement(DIM) == DIM:
        SCDIM.append(DIM)

# Tetramers whose outer positions may also be '.', then the subset with four
# real bases.
TET_LST      = [a + b + c + d
                for a in BASES + ['.']
                for b in BASES
                for c in BASES
                for d in BASES + ['.']]
TETRAMERS    = [tet for tet in TET_LST if '.' not in tet]

# The 136 tetramers used as canonical labels.
TETRAMER_LST = [
    'AAAA','AACA','AAGA','AATA','ACAA','ACGA','AGAA','AGCA','AGGA','ATAA',
    'AAAC','AACC','AAGC','AATC','ACAC','ACGC','AGAC','AGCC','AGGC','ATAC',
    'AAAG','AACG','AAGG','AATG','ACAG','ACGG','AGAG','AGCG','AGGG','ATAG',
    'AAAT','AACT','AAGT','AATT','ACAT','ACGT','AGAT','AGCT','AGGT','ATAT',
    'CAAA','CACA','CAGA','CATA','CCAA','CCGA','CGAA','CGCA','CGGA','CTAA',
    'CAAC','CACC','CAGC','CCAC','CGAC','CGGC','CAAG','CACG','CAGG','CATG',
    'CCAG','CCGG','CGAG','CGCG','CGGG','CTAG','CAAT','CACT','CAGT','CCAT',
    'CGAT','CGGT','GAAA','GACA','GAGA','GATA','GCAA','GCGA','GGAA','GGCA',
    'GGGA','GTAA','GAAC','GACC','GAGC','GATC','GCAC','GCGC','GGAC','GGCC',
    'GGGC','GTAC','GAAG','GACG','GAGG','GATG','GCAG','GCGG','GGAG','GGCG',
    'GGGG','GTAG','GAAT','GACT','GAGT','GCAT','GGAT','GGGT','TAAA','TACA',
    'TAGA','TATA','TCAA','TCGA','TGAA','TGCA','TGGA','TTAA','TAAC','TACC',
    'TAGC','TCAC','TGAC','TGGC','TAAG','TACG','TAGG','TCAG','TGAG','TGGG',
    'TAAT','TACT','TAGT','TCAT','TGAT','TGGT'
]

# Tetramers that are their own reverse complement.
SCTET        = []
for TET in TETRAMERS:
    if dna_seq_complement(TET) == TET:
        SCTET.append(TET)

# ----------------------------------------------------------------------------------------------------------------------

def culling_dictionary(CULL_PAR, DATAFRAME):
    """Build the acceptance window of every step parameter.

    Args:
        CULL_PAR: number of standard deviations allowed on each side of
            the mean.
        DATAFRAME: frame with the columns 'tilt', 'roll', 'twist',
            'shift', 'slide' and 'rise'.

    Returns:
        dict mapping each parameter name to ``[lower, upper]`` where
        ``lower = mean - CULL_PAR*std`` and ``upper = mean + CULL_PAR*std``
        of that column.
    """
    limits = {}
    for theta in ('tilt', 'roll', 'twist', 'shift', 'slide', 'rise'):
        column = DATAFRAME[theta]
        limits[theta] = [column.mean() - CULL_PAR*column.std(),
                         column.mean() + CULL_PAR*column.std()]
    return limits


def parametric_culling(culling_par, DATAFRAME, CULL_DF):
    """Run one culling pass over ``DATAFRAME``.

    A row is kept only if all six step parameters lie inside the inclusive
    window returned by ``culling_dictionary(culling_par, DATAFRAME)``.
    Every other row is appended to ``CULL_DF``. A comparison against NaN is
    false, so a row holding a NaN parameter -- or any row when a window
    itself is NaN -- is culled.

    Args:
        culling_par: window half-width passed to ``culling_dictionary``.
        DATAFRAME: frame holding the six step-parameter columns.
        CULL_DF: frame collecting the rows culled so far.

    Returns:
        ``(DATAFRAME, CULL_DF)``: the surviving rows and the extended cull
        frame, both re-indexed from 0.
    """
    # Sigma limits of every parameter, computed on the incoming rows.
    sigma_check = culling_dictionary(culling_par, DATAFRAME)

    # Evaluate all rows at once instead of row by row; positional masks are
    # used so the result does not depend on the index labels being unique.
    keep = np.ones(len(DATAFRAME), dtype=bool)
    for theta in ['tilt', 'roll', 'twist', 'shift', 'slide', 'rise']:
        low, high = sigma_check[theta]
        keep &= DATAFRAME[theta].between(low, high).to_numpy(dtype=bool)

    # Only touch CULL_DF when something was actually culled.
    if not keep.all():
        CULL_DF = pd.concat([CULL_DF, DATAFRAME.loc[~keep]], ignore_index=True)
    CULL_DF = CULL_DF.reset_index(drop=True)
    DATAFRAME = DATAFRAME.loc[keep].reset_index(drop=True)
    return DATAFRAME, CULL_DF


def culling_cycle(culling_par, CULL_CHECK_DF, STEP_DATAFRAME):
    """Repeat ``parametric_culling`` until a pass culls nothing more.

    At least one pass is always run; another pass follows whenever the
    previous one made ``CULL_CHECK_DF`` longer.

    Args:
        culling_par: window half-width forwarded to ``parametric_culling``.
        CULL_CHECK_DF: frame that accumulates the culled rows.
        STEP_DATAFRAME: frame of step parameters to be culled.

    Returns:
        ``(CULL_CHECK_DF, STEP_DATAFRAME)`` -- note the culled rows come
        first, the reverse of the order returned by ``parametric_culling``.
    """
    while True:
        culled_before = len(CULL_CHECK_DF)
        STEP_DATAFRAME, CULL_CHECK_DF = parametric_culling(
            culling_par, STEP_DATAFRAME, CULL_CHECK_DF
        )
        # Stop as soon as a pass leaves the cull list unchanged.
        if len(CULL_CHECK_DF) <= culled_before:
            break
    return CULL_CHECK_DF, STEP_DATAFRAME


----------------------
### Load raw dataset, organize and refine

In [None]:
# Read the raw step table. Fields are separated by a comma followed by
# whitespace, or by whitespace alone; the pattern is a raw string because
# '\s' is not a valid escape sequence in an ordinary string literal.
rawdf = pd.read_csv(datapath+'/xray_step_data_feb2022.txt',
                    index_col=None,
                    header=None,
                    sep=r',\s+|\s+|\s',
                    engine='python')

# Keep the six step parameters plus the PDB id and step string, and name them.
rawdf = rawdf[[12,13,14,15,16,17,30,31]].rename(columns={12:'tilt',13:'roll',14:'twist',15:'shift',16:'slide',17:'rise',30:'pdb_id',31:'step'})

# Placeholder columns, filled in by the parsing loop in the next cell.
rawdf["step_dimer"] = ''
rawdf["step_tetramer"] = ''
rawdf["dimer_neighbors"] = ''

print(len(rawdf))

In [None]:
# Rows without a PDB id get coding='n'. All other rows get coding='y' and have
# the central dimer, the tetramer and the two flanking bases cut out of `step`.
# The null mask is built once up front rather than once per row.
missing_pdb = rawdf['pdb_id'].isnull()
for i in range(len(rawdf)):

    if missing_pdb.at[i]:
        rawdf.at[i, 'coding']='n'

    else:
        rawdf.at[i, 'coding']='y'
        STEP=rawdf.at[i, 'step']

        if len(STEP) == 6:
            # six-character step: the tetramer is the middle four characters
            rawdf.at[i, "step_dimer"] = STEP[2:4]
            rawdf.at[i, "step_tetramer"] = STEP[1:5]
            rawdf.at[i, "dimer_neighbors"] = STEP[1:2]+'__'+STEP[4:5]
        elif len(STEP) == 4:
            # four-character step: the string already is the tetramer
            rawdf.at[i, "step_dimer"] = STEP[1:3]
            rawdf.at[i, "step_tetramer"] = STEP
            rawdf.at[i, "dimer_neighbors"] = STEP[0:1]+'__'+STEP[3:]
        else:
            print("ERROR AT IDX :", i)
        del STEP
del missing_pdb

In [None]:
# Write the formatted raw frame into the data folder, then drop it from memory.
rawdf.to_csv(datapath+"/raw_df_formatted_feb2022")
del rawdf

### Process raw dataset and additional refinement

In [None]:
# Reload the formatted raw frame; the first CSV column is the saved index.
rawdf = pd.read_csv(datapath+"/raw_df_formatted_feb2022", index_col=0)

In [None]:
# Notebook display of the reloaded frame.
rawdf

In [None]:
# Number of rows that have no PDB id.
len(rawdf.loc[rawdf['pdb_id'].isnull()])

In [None]:
# Snapshot of the formatted raw frame that the three subsets are cut from.
rawdataset = rawdf.copy()

# Rows with a PDB id whose central dimer is not self-complementary.
rawdf2 = rawdataset[
    (rawdataset['coding'] == 'y') & ~rawdataset['step_dimer'].isin(SCDIM)
].copy().reset_index(drop=True)

# Rows with a PDB id whose central dimer is self-complementary.
rawdf_coll = rawdataset[
    (rawdataset['coding'] == 'y') & rawdataset['step_dimer'].isin(SCDIM)
].copy().reset_index(drop=True)

# Rows without a PDB id.
rawdf_calc = rawdataset[rawdataset['coding'] == 'n'].copy().reset_index(drop=True)

# Total first, a blank line, then the three subset sizes.
print(len(rawdataset), '', len(rawdf2), len(rawdf_coll), len(rawdf_calc), sep='\n')

In [None]:
# Each row without a PDB id is matched to the self-complementary row that has
# identical parameters once tilt and shift are sign-flipped. The match
# supplies the PDB id and the reverse-complemented sequence labels.
checker_df = rawdf_calc.copy()
checker_df['tilt']  = -checker_df['tilt']
checker_df['shift'] = -checker_df['shift']
for i in range(len(checker_df)):
    same_step = (rawdf_coll['tilt'].eq(checker_df.at[i, 'tilt'])
                 & rawdf_coll['roll'].eq(checker_df.at[i, 'roll'])
                 & rawdf_coll['twist'].eq(checker_df.at[i, 'twist'])
                 & rawdf_coll['shift'].eq(checker_df.at[i, 'shift'])
                 & rawdf_coll['slide'].eq(checker_df.at[i, 'slide'])
                 & rawdf_coll['rise'].eq(checker_df.at[i, 'rise']))
    # first matching row; raises IndexError when nothing matches
    IDX = rawdf_coll.index[same_step][0]

    rawdf_calc.at[i, 'pdb_id']          = rawdf_coll.at[IDX, 'pdb_id']
    rawdf_calc.at[i, 'step_dimer']      = dna_seq_complement(rawdf_coll.at[IDX, 'step_dimer'])
    rawdf_calc.at[i, 'step_tetramer']   = dna_seq_complement(rawdf_coll.at[IDX, 'step_tetramer'])
    # flanking bases of the reverse-complemented tetramer
    rawdf_calc.at[i, 'dimer_neighbors'] = (COMP[rawdf_coll.at[IDX, 'step_tetramer'][-1:]]
                                           + "__"
                                           + COMP[rawdf_coll.at[IDX, 'step_tetramer'][-4:-3]])


compile_df = pd.concat([rawdf2, rawdf_coll, rawdf_calc], axis=0).reset_index(drop=True)

print(len(compile_df))

In [None]:
# Notebook display of the compiled rows that are marked coding == 'n'.
compile_df[compile_df.coding=='n']

In [None]:
# Split the compiled frame by coding flag.
tab_dataset = compile_df.loc[compile_df['coding'].eq('y')].reset_index(drop=True)
mod_dataset = compile_df.loc[compile_df['coding'].eq('n')].reset_index(drop=True)


# testdf starts as all 'y' rows. Rows that already have a sign-flipped partner
# among the 'n' rows are removed; what is left is turned into new 'n' rows.
testdf = tab_dataset.copy()

IDX_DROP_LIST = []
for i in range(len(mod_dataset)):
    partner = (testdf['tilt'].eq(-1*mod_dataset.at[i, 'tilt'])
               & testdf['roll'].eq(mod_dataset.at[i, 'roll'])
               & testdf['twist'].eq(mod_dataset.at[i, 'twist'])
               & testdf['shift'].eq(-1*mod_dataset.at[i, 'shift'])
               & testdf['slide'].eq(mod_dataset.at[i, 'slide'])
               & testdf['rise'].eq(mod_dataset.at[i, 'rise']))
    # first matching row; raises IndexError when nothing matches
    IDX_DROP_LIST.append(testdf.index[partner][0])
testdf = testdf.drop(index=IDX_DROP_LIST).reset_index(drop=True)
del IDX_DROP_LIST

testdf['coding'] = 'n'

# Replace the sequence labels by their reverse complements.
for i in range(len(testdf)):
    COMP_DIM = dna_seq_complement(testdf.at[i, 'step_dimer'])
    COMP_TET = dna_seq_complement(testdf.at[i, 'step_tetramer'])

    testdf.at[i, 'step_dimer']      = COMP_DIM
    testdf.at[i, 'step_tetramer']   = COMP_TET
    testdf.at[i, 'dimer_neighbors'] = COMP_TET[0:1] + '__' + COMP_TET[3:]

    del COMP_DIM, COMP_TET

# Flip the sign of tilt and shift on the new records.
testdf['tilt']  = -testdf['tilt']
testdf['shift'] = -testdf['shift']

testdf

In [None]:
# Add the newly built complementary records to the 'n' set.
mod_dataset = pd.concat([mod_dataset, testdf], axis=0, ignore_index=True).reset_index(drop=True)
del testdf

print(len(tab_dataset))
print(len(mod_dataset))

CHECKDF = pd.concat([tab_dataset, mod_dataset], ignore_index=True).reset_index(drop=True)

print(len(CHECKDF))

# 16x16 count table: rows are labelled by reversed dimers, columns by dimers,
# and entry [row, column] is the number of records whose tetramer equals
# row + column.
dim2_lst = [a+b for a in ['A','G','C','T'] for b in ['A','G','C','T']]
tet2_df  = pd.DataFrame(index=[i for i in dim2_lst],columns=[i[::-1] for i in dim2_lst]).T
del dim2_lst

# Count every tetramer once instead of re-scanning CHECKDF for each cell.
tet_counts = CHECKDF['step_tetramer'].value_counts()
for dim1 in tet2_df.index:
    for dim2 in tet2_df.columns:
        tet2_df.at[dim1, dim2] = tet_counts.get(dim1+dim2, 0)
del tet_counts

tet2_df = tet2_df.astype(int)
del CHECKDF

# Compare the table with its transpose element by element. Comparing
# `tet_a.all()` with `tet_b.all()` would only compare two scalars, which are
# always equal for a matrix and its transpose.
tet_a = tet2_df.to_numpy()
tet_b = tet2_df.T.to_numpy()
if np.array_equal(tet_a, tet_b):
    print("Symmetric")
del tet_a, tet_b

In [None]:
# The raw step string is no longer needed once dimer/tetramer are extracted.
tab_dataset=tab_dataset.drop('step', axis=1)
mod_dataset=mod_dataset.drop('step', axis=1)

print(len(tab_dataset))
print(len(mod_dataset))
# Drop every record whose tetramer contains a '.'. The pattern is a raw string
# because '\.' is not a valid escape sequence in an ordinary string literal.
tab_dataset=tab_dataset[~tab_dataset.step_tetramer.str.contains(r'\.')].reset_index(drop=True)
mod_dataset=mod_dataset[~mod_dataset.step_tetramer.str.contains(r'\.')].reset_index(drop=True)
print(len(tab_dataset))
print(len(mod_dataset))


In [None]:
# Add canonical labels to both datasets: a dimer/tetramer that is already in
# DIMER_LST / TETRAMER_LST is kept as is, any other one is replaced by its
# reverse complement.
for frame in (mod_dataset, tab_dataset):
    frame["DIMER"] = ''
    frame["TETRAMER"] = ''
    for i in range(len(frame)):
        dim = frame.at[i, "step_dimer"]
        frame.at[i, 'DIMER'] = dim if dim in DIMER_LST else dna_seq_complement(dim)

        tet = frame.at[i, "step_tetramer"]
        frame.at[i, 'TETRAMER'] = tet if tet in TETRAMER_LST else dna_seq_complement(tet)

In [None]:
# Notebook display of the 'n' dataset after labelling.
mod_dataset

In [None]:
# Save the 'y' and 'n' datasets separately, then both stacked ('y' rows first).
tab_dataset.to_csv(f"{datapath}/czapla-xray-2022_dataset_tabulated")
mod_dataset.to_csv(f"{datapath}/czapla-xray-2022_dataset_modified")

CHECKDF = pd.concat([tab_dataset, mod_dataset], ignore_index=True).reset_index(drop=True)
CHECKDF.to_csv(f"{datapath}/czapla-xray-2022_dataset")


In [None]:
# Release every intermediate frame of the refinement stage.
del tab_dataset, mod_dataset, CHECKDF, compile_df

del rawdataset, rawdf2, rawdf_coll, rawdf_calc, rawdf

### Cull refined dataset and adjust for repeated step-complementary dimers and tetramers

In [None]:
# Reload the combined dataset; the first CSV column is the saved index.
dataset = pd.read_csv(datapath+"/czapla-xray-2022_dataset", index_col=0)


In [None]:
# Size check of the selection used for culling: records with no '.' in the
# dimer or in its neighbours. The patterns are raw strings because '\.' is
# not a valid escape sequence in an ordinary string literal. Boolean
# selection already returns new rows, so no prior copy of `dataset` is made.
DF = dataset.loc[(~dataset.step_dimer.str.contains(r'\.'))
                 &(~dataset.dimer_neighbors.str.contains(r'\.'))].reset_index(drop=True)

df_sig  = pd.DataFrame(columns=DF.columns)
df_cull = pd.DataFrame(columns=DF.columns)

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)

del DF, df_sig, df_cull

In [None]:
# 3-sigma culling, one canonical dimer at a time.
# The patterns are raw strings because '\.' is not a valid escape sequence in
# an ordinary string literal; boolean selection already returns new rows, so
# the frames are not copied before filtering.
DF = dataset.loc[(~dataset.step_dimer.str.contains(r'\.'))
                 &(~dataset.dimer_neighbors.str.contains(r'\.'))].reset_index(drop=True)

df_sig  = pd.DataFrame(columns=DF.columns)   # rows that survive culling
df_cull = pd.DataFrame(columns=DF.columns)   # rows that were culled

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)

for STEP in DIMER_LST:

    data_df            = DF.loc[ DF.DIMER==STEP ].reset_index(drop=True)
    print(STEP, len(data_df))
    # culling_cycle returns (culled rows, surviving rows)
    culled_df, data_df = culling_cycle(3.0, pd.DataFrame(columns=DF.columns), data_df)
    print("--- ", len(data_df))
    df_sig             = pd.concat([df_sig, data_df], ignore_index=True)
    df_cull            = pd.concat([df_cull, culled_df], ignore_index=True)
    print()
    del culled_df, data_df

df_sig  = df_sig.reset_index(drop=True)
df_cull = df_cull.reset_index(drop=True)

print('*'*25)
print(len(df_sig))
print(len(df_cull))
print('*'*25)

# Both files are written to the current working directory.
df_sig.to_csv("czapla2022_3sig_dim")
df_cull.to_csv("czapla2022_3sig_dim-cull")

del df_sig, df_cull, DF

In [None]:
# 3-sigma culling, one canonical tetramer at a time.
# The patterns are raw strings because '\.' is not a valid escape sequence in
# an ordinary string literal; boolean selection already returns new rows, so
# the frames are not copied before filtering.
DF = dataset.loc[(~dataset.step_dimer.str.contains(r'\.'))
                 &(~dataset.dimer_neighbors.str.contains(r'\.'))].reset_index(drop=True)

df_sig  = pd.DataFrame(columns=DF.columns)   # rows that survive culling
df_cull = pd.DataFrame(columns=DF.columns)   # rows that were culled

print('*'*25)
print(len(DF))
print(len(df_sig))
print(len(df_cull))
print('*'*25)

for STEP in TETRAMER_LST:

    data_df            = DF.loc[ DF.TETRAMER==STEP ].reset_index(drop=True)
    print(STEP, len(data_df))
    # culling_cycle returns (culled rows, surviving rows)
    culled_df, data_df = culling_cycle(3.0, pd.DataFrame(columns=DF.columns), data_df)
    print("--- ", len(data_df))
    df_sig             = pd.concat([df_sig, data_df], ignore_index=True)
    df_cull            = pd.concat([df_cull, culled_df], ignore_index=True)
    print()
    del culled_df, data_df

df_sig  = df_sig.reset_index(drop=True)
df_cull = df_cull.reset_index(drop=True)

print('*'*25)
print(len(df_sig))
print(len(df_cull))
print('*'*25)

# Both files are written to the current working directory.
df_sig.to_csv("czapla2022_3sig_tet")
df_cull.to_csv("czapla2022_3sig_tet-cull")

del df_sig, df_cull, DF

In [None]:
# Release the combined dataset loaded for the culling cells.
del dataset

--------------------------
### Make "modified" data sets

In [None]:
# Reload the tetramer-culled set and split it by coding flag.
dataset = pd.read_csv("czapla2022_3sig_tet", index_col=0)

tab_dataset = dataset.loc[dataset['coding'].eq('y')].reset_index(drop=True)
mod_dataset = dataset.loc[dataset['coding'].eq('n')].reset_index(drop=True)

print(len(dataset), len(tab_dataset), len(mod_dataset), sep='\n')

In [None]:
# Count the 'n' records whose sign-flipped partner (tilt and shift negated) is
# still present among the 'y' records after culling.
dup_ind_lst=[]
for i in range(len(mod_dataset)):
    try:
        IDX = tab_dataset.loc[(tab_dataset['tilt']  == -1*mod_dataset.at[i,'tilt'])
                                &(tab_dataset['roll']  == mod_dataset.at[i,'roll'])
                                &(tab_dataset['twist'] == mod_dataset.at[i,'twist'])
                                &(tab_dataset['shift'] == -1*mod_dataset.at[i,'shift'])
                                &(tab_dataset['slide'] == mod_dataset.at[i,'slide'])
                                &(tab_dataset['rise']  == mod_dataset.at[i,'rise'])
                                ].index[0]
    except IndexError:
        # `.index[0]` on an empty selection raises IndexError (not
        # ValueError): this record has no partner left, so skip it.
        continue
    dup_ind_lst.append(IDX)
    del IDX
print(len(dup_ind_lst))

del dup_ind_lst

In [None]:
# Write the 'y' and 'n' halves of the culled set to the working directory.
tab_dataset.to_csv("czapla2022_3sig_tet_tab")
mod_dataset.to_csv("czapla2022_3sig_tet_mod")

del tab_dataset, mod_dataset


----------
## Add pdb-specific details to datasets

In [None]:
# Per-PDB details; the first CSV column becomes the index, which the next
# cell looks up by PDB id.
detdf = pd.read_csv("czapla2022_pdbid_data", index_col=0)

# The combined, un-culled dataset written by the refinement stage.
original_dataset = pd.read_csv(datapath+"/czapla-xray-2022_dataset", index_col=0)


In [None]:
# Make a yearly dataset
yrds = original_dataset.copy()
for i in range(len(yrds)):
    yrds.at[i, 'year']=detdf.loc[yrds.at[i, 'pdb_id']]['deposit_year']
yrds.year=yrds.year.astype(int)
yrds.to_csv("czapla2022_yearly_dataset")
del yrds

#make a resolution dataset
resds = original_dataset.copy()
for i in range(len(resds)):
    resds.at[i, 'resolution']=detdf.loc[resds.at[i, 'pdb_id']]['resolution']
    
resds.to_csv("czapla2022_res_dataset")
del resds

### Cull based on pdb-specific datasets

In [None]:
# For each resolution cut-off: keep the records at or below it, run the
# 3-sigma culling per canonical tetramer, and write the kept and culled rows.
dataset = pd.read_csv("czapla2022_res_dataset", index_col=0)

for RES in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 10.0]:
    df_sig  = pd.DataFrame(columns=dataset.columns)
    df_cull = pd.DataFrame(columns=dataset.columns)

    # Boolean selection already returns new rows, so `dataset` is not copied
    # first. The pattern is a raw string because '\.' is not a valid escape
    # sequence in an ordinary string literal.
    DF = dataset.loc[dataset.resolution<=RES].reset_index(drop=True)
    DF = DF.loc[ ~DF.step_tetramer.str.contains(r'\.') ].reset_index(drop=True)

    print(RES)
    print(len(DF))

    for STEP in TETRAMER_LST:

        data_df            = DF.loc[ DF.TETRAMER==STEP ].reset_index(drop=True)
        # culling_cycle returns (culled rows, surviving rows)
        culled_df, data_df = culling_cycle(3.0, pd.DataFrame(columns=DF.columns), data_df)
        df_sig             = pd.concat([df_sig, data_df], ignore_index=True)
        df_cull            = pd.concat([df_cull, culled_df], ignore_index=True)
        del culled_df, data_df
    print(len(df_sig))

    # File tag is ten times the cut-off, zero-padded to 3 digits (1.5 -> '015').
    df_sig  = df_sig.reset_index(drop=True)
    df_sig.to_csv("czapla2022_pdb-series_res-series_"+str(int(RES*10)).zfill(3)+"_data")

    df_cull = df_cull.reset_index(drop=True)
    df_cull.to_csv("czapla2022_pdb-series_res-series_"+str(int(RES*10)).zfill(3)+"_culled")
    print('-'*5)
    del DF
    del df_sig, df_cull

del dataset

In [None]:
# For each cut-off year: keep the records deposited up to it, run the 3-sigma
# culling per canonical tetramer, and write the kept and culled rows.
dataset = pd.read_csv("czapla2022_yearly_dataset", index_col=0)

# Rebuild the canonical labels if the file was saved without them.
if 'TETRAMER' not in dataset.columns:
    for i in range(len(dataset)):
        STEP=dataset.at[i, 'step_tetramer']
        if STEP not in TETRAMER_LST:
            STEP=dna_seq_complement(STEP)
        dataset.at[i, 'DIMER']=STEP[1:3]
        dataset.at[i, 'TETRAMER']=STEP
        del STEP

for YEAR in [2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022]:

    df_sig  = pd.DataFrame(columns=dataset.columns)
    df_cull = pd.DataFrame(columns=dataset.columns)

    # Boolean selection already returns new rows, so `dataset` is not copied
    # first. The pattern is a raw string because '\.' is not a valid escape
    # sequence in an ordinary string literal.
    DF = dataset.loc[dataset.year<=YEAR].reset_index(drop=True)
    DF = DF.loc[ ~DF.step_tetramer.str.contains(r'\.') ].reset_index(drop=True)

    print(YEAR)
    print(len(DF))

    for STEP in TETRAMER_LST:
        data_df            = DF.loc[ DF.TETRAMER==STEP ].reset_index(drop=True)
        # culling_cycle returns (culled rows, surviving rows)
        culled_df, data_df = culling_cycle(3.0, pd.DataFrame(columns=DF.columns), data_df)
        df_sig             = pd.concat([df_sig, data_df], ignore_index=True)
        df_cull            = pd.concat([df_cull, culled_df], ignore_index=True)
        del culled_df, data_df
    print(len(df_sig))
    df_sig  = df_sig.reset_index(drop=True)
    df_sig.to_csv("czapla2022_pdb-series_year-series_"+str(YEAR)+"_data")
    print('-'*5)
    df_cull = df_cull.reset_index(drop=True)
    df_cull.to_csv("czapla2022_pdb-series_year-series_"+str(YEAR)+"_culled")

    del DF
    del df_sig, df_cull

del dataset

### modified pdb-series datasets

In [None]:
# Split every culled res-series / year-series file into its coding=='y'
# ("_tab") and coding=='n' ("_mod") halves, written next to the input file.
# NOTE(review): the culling cells write the "_data" files to the working
# directory, while this cell reads them from sub-folders -- presumably they
# are moved there by hand; confirm.
set_path = f"{path}/pdb_res-series-data"
for RES in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 10.0]:
    stem = f"{set_path}/czapla2022_pdb-series_res-series_{str(int(RES*10)).zfill(3)}_data"
    dataset = pd.read_csv(stem, index_col=0)
    tab_dataset = dataset.loc[dataset['coding'].eq('y')].reset_index(drop=True)
    mod_dataset = dataset.loc[dataset['coding'].eq('n')].reset_index(drop=True)
    tab_dataset.to_csv(stem + "_tab")
    mod_dataset.to_csv(stem + "_mod")
    del tab_dataset, mod_dataset, dataset, stem
del set_path


set_path = f"{path}/pdb_time-series-data"
for YEAR in [2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020, 2022]:
    stem = f"{set_path}/czapla2022_pdb-series_year-series_{YEAR}_data"
    dataset = pd.read_csv(stem, index_col=0)
    tab_dataset = dataset.loc[dataset['coding'].eq('y')].reset_index(drop=True)
    mod_dataset = dataset.loc[dataset['coding'].eq('n')].reset_index(drop=True)
    tab_dataset.to_csv(stem + "_tab")
    mod_dataset.to_csv(stem + "_mod")
    del tab_dataset, mod_dataset, dataset, stem
del set_path

## Scratch

In [None]:
# Scratch: reload the tetramer-culled set from the working directory and display it.
testdf = pd.read_csv("czapla2022_3sig_tet", index_col=0)
testdf

In [None]:
# Scratch: show the rows with step_dimer 'TT' that come from PDB entry 3FYL.
testdf[(testdf.step_dimer=='TT')&(testdf.pdb_id=="3FYL")]