In [None]:
import os, sys, math, scipy
import pandas as pd
import numpy as np

sys.path.append("/home/rty10/Documents/scripts/analysis/")
import dna_analysis
import circle_data_compile as opt_compile

# Base directory for the data folders below: the current working directory.
path = os.getcwd()

# Folders whose files are listed and read by the compile cells in this notebook.
optpath = '/'.join((path, 'optimization_data'))
freepath = '/'.join((path, 'optimization_data_free'))

# Directory handed to the dna_analysis force-field readers.
forcefieldpath = "/home/rty10/Documents/forcefields"

## Compile: all opt global data to single .csv

In [None]:
# Base names (text before the first '.') of every .log file in optpath, sorted.
circles = sorted(fname.split('.')[0] for fname in os.listdir(optpath) if fname.endswith('.log'))

df = pd.DataFrame()

for row, stem in enumerate(circles):
    # Underscore-separated fields of the base name: [1] -> state, [2] -> sequence,
    # [3] -> '<forcefield>-<ff_type>' (only read when the name lacks 'ideal').
    fields = stem.split('_')

    # Columns are created in this order on the first pass; keep it stable because
    # it determines the column order of the saved table.
    df.at[row, 'length'] = 150
    df.at[row, 'state'] = fields[1]
    df.at[row, 'sequence'] = fields[2]
    df.at[row, 'config'] = 'circ'
    if 'ideal' not in stem:
        ff_parts = fields[3].split('-')
        df.at[row, 'forcefield'] = ff_parts[0]
        df.at[row, 'ff_type'] = ff_parts[1]
    else:
        df.at[row, 'forcefield'] = 'ideal'
        df.at[row, 'ff_type'] = 'dim'

    # Each helper receives the frame and the row label and returns the frame,
    # which is rebound here before the next helper runs.
    prefix = optpath + '/' + stem
    df = opt_compile.log_file_data(prefix + '.log', row, df)
    df = opt_compile.refframe_file_data(prefix + '.dat', row, df, 150)
    df = opt_compile.topology_file_data(optpath + '/topo_' + stem + '.txt', row, df)

# Derived column: sum of the tilt and roll columns.
df['eopt_bend'] = df['eopt_tilt'] + df['eopt_roll']
df[['length', 'lk']] = df[['length', 'lk']].astype(int)
df = (
    df.sort_values(by=['state', 'config', 'sequence'])
      .reset_index(drop=True)
)

In [None]:
# Write the compiled table to the working directory.
# `df` is intentionally left bound here: the cells that follow display it and
# query its columns, so deleting it at this point makes a top-to-bottom run fail
# with NameError.
df.to_csv("circ150_koo86_dataset")

In [None]:
# Display the compiled table; `df` must still be bound in the kernel when this cell runs.
df

In [None]:
# List the distinct values found in the 'forcefield' column.
df.forcefield.unique()

In [None]:
# Read circ150_koo86_dataset (path relative to the working directory) back into
# `df`, using the file's first column as the index.
df = pd.read_csv("circ150_koo86_dataset", index_col=0)

# Preview the first ten rows.
df.head(10)

In [None]:
# Drop the table and the list of base names; later cells rebuild both.
del df, circles

## Compile: all opt structural data to single .csv

In [None]:
# Base names (text before the first '.') of the .par files in each folder.
circles_o = sorted([ i.split('.')[0] for i in os.listdir(optpath) if i.endswith('.par') ])
circles_f = sorted([ i.split('.')[0] for i in os.listdir(freepath) if i.endswith('.par') ])

# Tag every base name with the folder it was listed from and the matching
# 'config' label. Deciding this later with `name in circles_o` would send a name
# that exists in BOTH folders to optpath/'circ' twice and never read the copy in
# freepath.
circles = [(optpath, 'circ', name) for name in circles_o] \
        + [(freepath, 'free', name) for name in circles_f]

# Number of per-step columns (labelled 1..NCIRC) written for every structure.
NCIRC = 150
step_columns = list(range(1, NCIRC+1))
meta_columns = ['state','sequence','config','forcefield','ff_type']

# Read each .par file exactly once; the per-parameter tables below only slice
# the frames collected here.
structures = []
for directory, config, name in circles:
    # Underscore-separated fields of the base name: [1] -> state, [2] -> sequence,
    # [3] -> '<forcefield>-<ff_type>' (only read when the name lacks 'ideal').
    fields = name.split('_')
    meta = {'state': fields[1], 'sequence': fields[2], 'config': config}
    if 'ideal' in name:
        meta['forcefield'] = 'ideal'
        meta['ff_type']    = 'dim'
    else:
        meta['forcefield'] = fields[3].split('-')[0]
        meta['ff_type']    = fields[3].split('-')[1]

    pardf = dna_analysis.df_read_bpsteppars(directory+'/'+name+'.par', bend=False)
    structures.append((meta, pardf))

# One output table per step parameter, one row per structure.
for theta in ['tilt','roll','twist','shift','slide','rise']:

    rows = []
    for meta, pardf in structures:
        # Discard the entry labelled 0, then pick labels 1..NCIRC; a structure
        # with fewer labelled entries raises KeyError here.
        steps = pardf[theta].drop([0],axis=0).T

        row = dict(meta)
        for j in step_columns:
            row[j] = steps[j]
        rows.append(row)

    # Build the frame in one go; `columns` fixes the output column order
    # (metadata first, then steps 1..NCIRC).
    df = pd.DataFrame(rows, columns=meta_columns+step_columns)

    df.to_csv("koo86_par-"+theta+"_dataset")
    del df

del circles_o, circles_f, circles, NCIRC, structures

## Try: get energy per base-pair step for each par file, compile into single .csv

In [None]:
# Base names (text before the first '.') of every .log file in optpath, sorted.
circles = sorted([ i.split('.')[0] for i in os.listdir(optpath) if i.endswith('.log') ])

# Reference table providing the 'eopt' value of each structure.
# NOTE(review): this is read from <cwd>/datasets/, whereas the compile cell of
# this notebook writes circ150_koo86_dataset into <cwd> itself -- confirm the
# file is copied into datasets/ before this cell is run.
dataset = pd.read_csv(path+"/datasets/circ150_koo86_dataset", index_col=0)

# Number of per-step energy columns (labelled 1..NCIRC) written per structure.
NCIRC = 150
step_columns = list(range(1, NCIRC+1))
key_columns  = ['state','sequence','forcefield','ff_type','config']

rows = []
for name in circles:
    # Underscore-separated fields of the base name: [1] -> state, [2] -> sequence,
    # [3] -> '<forcefield>-<ff_type>' (only read when the name lacks 'ideal').
    fields = name.split('_')
    is_tet = 'tet' in name

    row = {'state': fields[1], 'sequence': fields[2], 'config': 'circ'}
    if 'ideal' in name:
        row['forcefield'] = 'ideal'
        row['ff_type']    = 'dim'
        ff_name = "IdealDNA"
    else:
        row['forcefield'] = fields[3].split('-')[0]
        row['ff_type']    = fields[3].split('-')[1]
        # Names containing 'olson' use the "Olson1998" force field; any other
        # name passes its full fourth field to the readers.
        ff_name = "Olson1998" if "olson" in name else fields[3]

    pardf = dna_analysis.df_read_bpsteppars(optpath+'/'+name+'.par', bend=False)
    if is_tet:
        pardf = dna_analysis._par_tetrameric(pardf)

    reststatedf = dna_analysis.df_read_intrinsic_state(forcefieldpath, ff_name)
    forcecondf  = dna_analysis.df_read_force_constants(forcefieldpath, ff_name)

    # The `tet` keyword is only passed for 'tet' names, so the helper's own
    # default applies otherwise.
    if is_tet:
        pardf = dna_analysis._par_energetic_analysis(pardf, reststatedf, forcecondf, tet=True)
    else:
        pardf = dna_analysis._par_energetic_analysis(pardf, reststatedf, forcecondf)

    # Per-step energies for row labels 1..NCIRC of the analysed frame.
    energies = [pardf.at[j, 'energy'] for j in step_columns]
    row.update(zip(step_columns, energies))

    # Sum the energies themselves rather than slicing the output row by
    # position, which only works while the step columns happen to come first.
    row['sum'] = pd.Series(energies).sum()

    # Find the reference row that agrees on every key column.
    matches = dataset
    for col in key_columns:
        matches = matches[matches[col] == row[col]]
    if matches.empty:
        raise IndexError(
            "no row in circ150_koo86_dataset matches " + name + ": "
            + ", ".join(col + "=" + str(row[col]) for col in key_columns)
        )
    row['eopt'] = matches.eopt.values[0]

    rows.append(row)
    del pardf, reststatedf, forcecondf

# Column order: steps 1..NCIRC, the key columns, then 'sum' and 'eopt'.
df = pd.DataFrame(rows, columns=step_columns+key_columns+['sum','eopt'])

df.to_csv("koo86_circ-energy_dataset")
del df