In [None]:
#Script Name: fragment_frequency_grapher
#Script Purpose: Graphing and comparing various different torsion libraries. 
#Author Name: John Bickel
#Affiliation: Rizzo Lab, Stony Brook University
#Create date: 03/04/2019 (MM/DD/YYYY)
#Last edit: 05/04/2021 John Bickel/SBU


import pandas as pd                             #data set handling
import numpy as np                              #math functions
import matplotlib.pyplot as plt                 #plotting
from matplotlib.ticker import NullFormatter     #formatting for plot axes
from scipy.stats import gaussian_kde            #scatter plot density coloring
from scipy.optimize import curve_fit            #distribution fitting
from scipy import stats
from operator import itemgetter
import matplotlib.ticker as ticker

import math
import sys

# Torsion-environment library files compared in this notebook:
#   dock_generated_torenv          - the input ("DOCK") library
#   no_roul_self_generated_torenv  - library read from ./roul_off
#   roul_self_generated_torenv     - library read from ./roul_on
# Reference path noted by the author:
#   G:\zzz.research\roul_anal\2021.05.11.simple_build\001a.fraglib
dock_generated_torenv = '../../../fraglib_torenv.dat'
no_roul_self_generated_torenv = './roul_off/fraglib_off_torenv.dat'
roul_self_generated_torenv = './roul_on/fraglib_on_torenv.dat'

# Size of the input molecule library used to make the "DOCK" library
# (author's note: used for headings in graphs).
input_mol_lib_size = 11208

In [None]:
###takes in the dock torenv and prepares it for graphing

dock_list=[]
dock_x=[]
dock_y=[]
dock_vals=np.array([])

#generates the dock_list from the input file. Every record is split on its
#LAST "-": the text before it is kept as the torsion label and the text after
#it must parse as an integer count.
#'with' guarantees the handle is closed even if a record fails to parse.
with open(dock_generated_torenv,"r") as dock_file:
    for line in dock_file:
        #blank lines (e.g. a trailing newline at the end of the file) carry no
        #record and would otherwise raise IndexError on linesplit_list[1]
        if not line.strip():
            continue
        linesplit_list=line.rsplit("-",1)
        linesplit_list[1]=linesplit_list[1].strip()
        dock_x.append(linesplit_list[0])
        dock_y.append(linesplit_list[1])
        dock_list.append([linesplit_list[0],int(linesplit_list[1])])

#sorts the lists by frequency, largest frequency first
#(sorted() is stable, so equal counts keep their file order)
dock_list = sorted(dock_list, key=itemgetter(1), reverse=True)

#Gives each entry in dock_list a 'unique identifier,' with largest frequency=1
#After this loop every entry is [label, count, id].
for entry_number, entry in enumerate(dock_list, start=1):
    entry.append(entry_number)

#x = identifiers, y = counts normalized so that they sum to 1
dock_x_list=[entry[2] for entry in dock_list]
dock_y_list=[entry[1] for entry in dock_list]
y_list_summer=sum(dock_y_list)
dock_y_list=[count/y_list_summer for count in dock_y_list]

######################################## LIST BLOCK 1 - NO ROULETTE ###########################################

no_roul_self_list=[]

no_roul_self_x=[]
no_roul_self_y=[]
no_roul_own_vals=np.array([])


#generates the self_list from the input file (same "<label>-<count>" record
#layout as the DOCK file: split on the last "-", count parsed as int).
#'with' closes the handle even when a record fails to parse.
with open(no_roul_self_generated_torenv,"r") as no_roul_self_file:
    for line in no_roul_self_file:
        #skip blank lines; they would raise IndexError on linesplit_list[1]
        if not line.strip():
            continue
        linesplit_list=line.rsplit("-",1)
        linesplit_list[1]=int(linesplit_list[1].strip())
        no_roul_self_x.append(linesplit_list[0])
        no_roul_self_y.append(linesplit_list[1])
        no_roul_self_list.append([linesplit_list[0],linesplit_list[1]])

#checks each entry in the DOCK list against 'own' list
#and inserts the 'unique id' - otherwise appends it
#to the 'own' list with frequency 0
#
#A dict of the FIRST entry seen for every label replaces the nested linear
#scan: the same entry is matched as before, without an O(n^2) search.
no_roul_first_by_name={}
for entry in no_roul_self_list:
    no_roul_first_by_name.setdefault(entry[0],entry)

for i in dock_list:
    match=no_roul_first_by_name.get(i[0])
    if match is None:
        #torsion never produced by this run: record it with frequency 0
        match=[i[0],0]
        no_roul_self_list.append(match)
        no_roul_first_by_name[i[0]]=match
    match.append(i[2])


#Any entry still lacking an id (fewer than 3 fields) was not matched to a DOCK
#torsion; those are reported and processing stops.
bad_torsion_list=[entry for entry in no_roul_self_list if len(entry) < 3]
if len(bad_torsion_list) > 0:
    print("BAD TORSION(s) FOUND. WILL NOT PROCEED WITH GRAPH PREP. PLEASE CHECK YOUR OUTPUT.")
    print("Number of bad torsions: %d" % (len(bad_torsion_list)))
    print("BAD TORSIONS ARE:")
    for entry in bad_torsion_list:
        print(entry)
    sys.exit()

##Prepares the data to be input into a graph - normalizes everything to totals for the run
#entries are ordered by DOCK id so the x axis lines up with dock_x_list
no_roul_self_list = sorted(no_roul_self_list, key=itemgetter(2), reverse=False)

x_list_1=[entry[2] for entry in no_roul_self_list]
y_list_1=[entry[1] for entry in no_roul_self_list]
y_list_summer=sum(y_list_1)
y_list_1=[count/y_list_summer for count in y_list_1]

    
    
######################################## LIST BLOCK 2 - ROULETTE ON ########################################

roul_self_list=[]

roul_self_x=[]
roul_self_y=[]
roul_own_vals=np.array([])


#generates the roul_self_list from the input file (same "<label>-<count>"
#record layout as the DOCK file: split on the last "-", count parsed as int).
#'with' closes the handle even when a record fails to parse.
with open(roul_self_generated_torenv,"r") as roul_self_file:
    for line in roul_self_file:
        #skip blank lines; they would raise IndexError on linesplit_list[1]
        if not line.strip():
            continue
        linesplit_list=line.rsplit("-",1)
        linesplit_list[1]=int(linesplit_list[1].strip())
        roul_self_x.append(linesplit_list[0])
        roul_self_y.append(linesplit_list[1])
        roul_self_list.append([linesplit_list[0],linesplit_list[1]])

#checks each entry in the DOCK list against 'own' list
#and inserts the 'unique id' - otherwise appends it
#to the 'own' list with frequency 0
#
#A dict of the FIRST entry seen for every label replaces the nested linear
#scan: the same entry is matched as before, without an O(n^2) search.
roul_first_by_name={}
for entry in roul_self_list:
    roul_first_by_name.setdefault(entry[0],entry)

for i in dock_list:
    match=roul_first_by_name.get(i[0])
    if match is None:
        #torsion never produced by this run: record it with frequency 0
        match=[i[0],0]
        roul_self_list.append(match)
        roul_first_by_name[i[0]]=match
    match.append(i[2])


#Any entry still lacking an id (fewer than 3 fields) was not matched to a DOCK
#torsion; those are reported and processing stops.
roul_bad_torsion_list=[entry for entry in roul_self_list if len(entry) < 3]
#this test must look at the roulette list built just above (it previously
#checked the no-roulette bad_torsion_list, so roulette problems went unreported)
if len(roul_bad_torsion_list) > 0:
    print("BAD TORSION(s) FOUND. WILL NOT PROCEED WITH GRAPH PREP. PLEASE CHECK YOUR OUTPUT.")
    print("Number of bad torsions: %d" % (len(roul_bad_torsion_list)))
    print("BAD TORSIONS ARE:")
    for entry in roul_bad_torsion_list:
        print(entry)
    sys.exit()

##Prepares the data to be input into a graph - normalizes everything to totals for the run
#entries are ordered by DOCK id so the x axis lines up with dock_x_list
roul_self_list = sorted(roul_self_list, key=itemgetter(2), reverse=False)

x_list_2=[entry[2] for entry in roul_self_list]
y_list_2=[entry[1] for entry in roul_self_list]
y_list_summer=sum(y_list_2)
y_list_2=[count/y_list_summer for count in y_list_2]

In [None]:
#### Bin generator and number liner for visualization of torsion distribution.
#
#bins_list holds the running (cumulative) sum of dock_y_list at each bin edge,
#starting from 0. Runs of equal neighbouring values are walked as one group.

bins_list=[0]
bin_summer=0
n_entries=len(dock_y_list)
i=0
while i < n_entries:
    #The final entry has no right-hand neighbour to compare against, so it is
    #handled by the singleton branch below. Without this bounds check a list
    #whose last value differs from the one before it raised IndexError here.
    if i+1 < n_entries and dock_y_list[i] == dock_y_list[i+1]:
        for x in range(i,n_entries):
            if x == n_entries-1:
                #the run reaches the end of the list: close the last bin, stop
                bin_summer+=dock_y_list[x]
                bins_list.append(bin_summer)
                i=n_entries
                break
            elif dock_y_list[x] == dock_y_list[x+1]:
                bin_summer+=dock_y_list[x]
            elif dock_y_list[x] != dock_y_list[x+1]:
                #NOTE(review): an edge is emitted here BEFORE the last member
                #of the run is added; that member is then picked up by the
                #singleton branch on the next pass, which emits a second edge.
                #Behaviour kept as-is - confirm whether both edges are wanted.
                bins_list.append(bin_summer)
                i = x
                break
    else:
        #value differs from its successor (or is the last one): own bin
        bin_summer+=dock_y_list[i]
        bins_list.append(bin_summer)
        i+=1
        
%config InlineBackend.figure_format ='retina'
# Setup a plot such that only the bottom spine is shown
def setup(ax):
    ax.spines['right'].set_color('none')
    ax.spines['left'].set_color('none')
    ax.yaxis.set_major_locator(ticker.NullLocator())
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.tick_params(which='major', width=1.00)
    ax.tick_params(which='major', length=5)
    ax.tick_params(which='minor', width=0.75)
    ax.tick_params(which='minor', length=2.5)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.patch.set_alpha(0.0)
    plt.style.use('dark_background')

# Number-line figure: a single thin axis (first slot of an n-row grid) with a
# short vertical mark at every cumulative bin edge stored in bins_list.
n = 8
fig=plt.figure(dpi=1000, figsize=(10, 8))
ax = fig.add_subplot(n, 1, 1)
setup(ax)

# No automatic x ticks (major or minor); the two axis ends are labelled by hand.
for _set_locator in (ax.xaxis.set_major_locator, ax.xaxis.set_minor_locator):
    _set_locator(ticker.NullLocator())

ax.text(0.0, 1, "", transform=ax.transAxes, fontsize=14)
ax.text(-0.01, 0, '0', horizontalalignment='right')
ax.text(1 + 0.01, 0, '1', horizontalalignment='left')

# One mark per bin edge, spanning the bottom 20% of the axis height.
for entry in bins_list:
    ax.axvline(x=entry, ymin=0, ymax=0.2, linewidth=1)

In [None]:
###### Axis window shared by the graphs below. Edit the values here, or override
###### them in an individual plot cell, for better or finer visualization.

# x window (torsion IDs); alternative upper bound: len(dock_x_list)
start_val_x, end_val_x = 0, 11208

# y window (relative frequency)
start_val_y, end_val_y = 0, 0.04

In [None]:
plt.style.use('default')
axes=[start_val_x, end_val_x, start_val_y, end_val_y]
fig, (ax1,ax2) = plt.subplots(2,gridspec_kw={'hspace': 0},dpi=150,figsize=(8,2))
ax1.axis(axes)
ax2.axis(axes)

for ax in fig.get_axes():
    ax.label_outer()

fig.suptitle('Roul vs. NoRoul - Simple Build - Normal Torsions - First 30 anchors')
fig.axes[1].set_xlabel('Torsion ID',size=5)
fig.text(0.08, 0.5, 'Rel. Acc. %', ha='center', va='center', rotation='vertical',size=5)


ax1.tick_params(axis='both',which='major',labelsize=5)
ax2.tick_params(axis='both',which='major',labelsize=5)

%config InlineBackend.figure_format ='retina'

line_noroul=ax1.plot(x_list_1,y_list_1,color='green',label="No Roulette",linewidth=0.6)
line_roul=ax2.plot(x_list_2, y_list_2, color='orange',label="Roulette",linewidth=0.6)
line_orig=ax1.plot(dock_x_list, dock_y_list,color='black',label="Input Library" ,linewidth=0.4)
ax2.plot(dock_x_list, dock_y_list,color='black',label="Input Library",linewidth=0.4)


ax1.legend(handles=[line_noroul[0],line_roul[0],line_orig[0]],loc="upper right",fontsize="x-small")

In [None]:
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, start_val_y, end_val_y])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Rel. Frequency',size=5)
plt.title('No Roulette - Simple Build - Normal Torsions',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
#plt.plot(x_list_2, y_list_2, color='orange',label="Roulette")
plt.plot(x_list_1,y_list_1,color='green',label="No Roulette",linewidth=0.6)

plt.plot(dock_x_list, dock_y_list,color='black',label="Input Library")
plt.legend(loc="upper right")

In [None]:
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, 500, start_val_y, end_val_y])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Rel. Frequency',size=5)
plt.title('Roulette - Simple Build - Normal Torsions',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(x_list_2, y_list_2, color='orange',label="Roulette",linewidth=0.6)
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")

plt.plot(dock_x_list, dock_y_list,color='black',label="Input Library")
plt.legend(loc="upper right")

In [None]:
mid_val=2000

#Each normalized curve is cut at index mid_val and both pieces are integrated
#with the trapezoid rule (unit spacing): [0, mid_val) and [mid_val, end_val_x).
#Slices are used instead of index loops so that a mid_val or end_val_x larger
#than the library clamps to the available data rather than raising IndexError.

#area for no roulette
no_roul_area_test_list=list(y_list_1[:mid_val])
no_roul_area_test_list2=list(y_list_1[mid_val:end_val_x])

no_roul_area = np.trapz(no_roul_area_test_list, dx=1)
no_roul_area2 = np.trapz(no_roul_area_test_list2, dx=1)


#area for roulette
roul_area_test_list=list(y_list_2[:mid_val])
roul_area_test_list2=list(y_list_2[mid_val:end_val_x])

roul_area = np.trapz(roul_area_test_list, dx=1)
roul_area2 = np.trapz(roul_area_test_list2, dx=1)


#area for dock
input_area_test_list=list(dock_y_list[:mid_val])
input_area_test_list2=list(dock_y_list[mid_val:end_val_x])

input_area = np.trapz(input_area_test_list, dx=1)
input_area2 = np.trapz(input_area_test_list2, dx=1)


#report: one section per library, areas rounded to 3 decimals
print("Input Library")
print("    1-%s: " % (str(mid_val)) + str(round(input_area,3)))
print("%s-%s: " % (str(mid_val+1),str(end_val_x)) + str(round(input_area2,3))+"\n")
print("------------------------------")
print("No Roulette")
print("    1-%s: " % (str(mid_val)) + str(round(no_roul_area,3)))
print("%s-%s: " % (str(mid_val+1),str(end_val_x)) + str(round(no_roul_area2,3))+"\n")
print("------------------------------")
print("Roulette")
print("    1-%s: " % (str(mid_val)) + str(round(roul_area,3)))
print("%s-%s: " % (str(mid_val+1),str(end_val_x)) + str(round(roul_area2,3))+"\n")



In [None]:
#Running (cumulative) sum of each normalized frequency list: element k of a
#*_area_quantifier list is the sum of the first k+1 values of its source list.
roulette_area_quantifier=[]
dock_area_quantifier=[]
no_roulette_area_quantifier=[]

for _source, _running in (
    (y_list_2, roulette_area_quantifier),
    (dock_y_list, dock_area_quantifier),
    (y_list_1, no_roulette_area_quantifier),
):
    list_summer=0
    for i in _source:
        list_summer+=i
        _running.append(list_summer)

In [None]:
#progressive area under the curve
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, 0, 1])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Area under curve 0 -> N',size=5)
plt.title('Progressive Area Under the Curve',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(x_list_2, roulette_area_quantifier, color='orange',label="Roulette")
plt.plot(x_list_2, no_roulette_area_quantifier, color='green',label="No Roulette")
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")

plt.plot(dock_x_list, dock_area_quantifier,color='black',label="Input Library")
plt.legend(loc="lower right")

In [None]:
#Trapezoid area (unit spacing) under each full cumulative curve
area_zinc = np.trapz(dock_area_quantifier,dx=1)
area_no_roul = np.trapz(no_roulette_area_quantifier,dx=1)
area_roul = np.trapz(roulette_area_quantifier,dx=1)

#1 - (library - run)/library for each run
_no_roul_score=1-((area_zinc-area_no_roul)/area_zinc)
_roul_score=1-((area_zinc-area_roul)/area_zinc)

print("No roulette: %s" % (_no_roul_score,))
print("Roulette:    %s" % (_roul_score,))




In [None]:
#Same area comparison as for the full curves, restricted to the index window
#[start_val_x, end_val_x) of the cumulative lists.
#Note: start/end are assigned here but the loop below is driven by
#start_val_x/end_val_x.
start=0
end=input_mol_lib_size

trunc_roulette_area_quantifier=[]
trunc_no_roulette_area_quantifier=[]
trunc_dock_area_quantifier=[]

for i in range(start_val_x,end_val_x):
    trunc_roulette_area_quantifier.append(roulette_area_quantifier[i])
    trunc_no_roulette_area_quantifier.append(no_roulette_area_quantifier[i])
    trunc_dock_area_quantifier.append(dock_area_quantifier[i])

#trapezoid area (unit spacing) under each windowed curve
area_roul_truncated = np.trapz(trunc_roulette_area_quantifier,dx=1)
area_no_roul_truncated = np.trapz(trunc_no_roulette_area_quantifier,dx=1)
area_zinc_truncated = np.trapz(trunc_dock_area_quantifier,dx=1)

#1 - (library - run)/library for each run
_no_roul_score=1-((area_zinc_truncated-area_no_roul_truncated)/area_zinc_truncated)
_roul_score=1-((area_zinc_truncated-area_roul_truncated)/area_zinc_truncated)

print("No roulette: %s" % (_no_roul_score,))
print("Roulette:    %s" % (_roul_score,))



In [None]:
#Running percent difference between the input-library cumulative curve and each
#run's cumulative curve: for every index i in [start, end) the trapezoid area
#(unit spacing) of the samples start..i is compared, (library - run)/library.
start=1
end=end_val_x

#never read past the shortest list (the range is clamped instead of raising
#IndexError when end exceeds the available data)
_stop=min(end,
          len(roulette_area_quantifier),
          len(no_roulette_area_quantifier),
          len(dock_area_quantifier))

trunc_roulette_area_quantifier=list(roulette_area_quantifier[start:_stop])
trunc_no_roulette_area_quantifier=list(no_roulette_area_quantifier[start:_stop])
trunc_dock_area_quantifier=list(dock_area_quantifier[start:_stop])


def _running_trapz(values):
    """Return the trapezoid area (dx=1) of every prefix of `values`.

    Element k is the area under values[0..k]; element 0 is 0.0 because a single
    sample spans no width. One cumulative pass replaces calling np.trapz on a
    growing list at every step, which was quadratic in the number of samples.
    """
    samples=np.asarray(values,dtype=float)
    areas=np.zeros(samples.shape[0])
    if samples.shape[0] > 1:
        areas[1:]=np.cumsum((samples[1:]+samples[:-1])/2.0)
    return areas


_roul_areas=_running_trapz(trunc_roulette_area_quantifier)
_no_roul_areas=_running_trapz(trunc_no_roulette_area_quantifier)
_zinc_areas=_running_trapz(trunc_dock_area_quantifier)

#the first entry is 0/0 -> nan (numpy emits a RuntimeWarning), exactly as when
#the single-sample area was computed with np.trapz
roul_per_diff_track=list((_zinc_areas-_roul_areas)/_zinc_areas)
no_roul_per_diff_track=list((_zinc_areas-_no_roul_areas)/_zinc_areas)

x_vals=list(range(start,start+len(trunc_dock_area_quantifier)))

#keep the "last computed area" names bound, as the step-by-step loop left them
if len(x_vals) > 0:
    area_roul_truncated=_roul_areas[-1]
    area_no_roul_truncated=_no_roul_areas[-1]
    area_zinc_truncated=_zinc_areas[-1]
    



In [None]:
#Combines the ZINC, torsion, and 
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, 0, 1.1])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('% Difference to the ZINC Area',size=5)
plt.title('Running percent difference in progressive area',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(x_vals, roul_per_diff_track, color='orange',label="Roulette")
plt.plot(x_vals, no_roul_per_diff_track, color='green',label="No Roulette")
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")
plt.legend(loc="upper right")

# Euclidean Distance

In [None]:
#Per-torsion distance between the input library's relative frequency and each
#run's relative frequency: sqrt((a-b)^2) for every index of dock_x_list.
euclidean_no_roul=[]
euclidean_roul=[]
for i in range(len(dock_x_list)):
    roul_delta=dock_y_list[i]-y_list_2[i]
    no_roul_delta=dock_y_list[i]-y_list_1[i]
    euclidean_roul.append(math.sqrt(roul_delta*roul_delta))
    euclidean_no_roul.append(math.sqrt(no_roul_delta*no_roul_delta))


def _inverse_mean(total,count):
    """Return 1/(total/count), or inf when that mean is exactly zero.

    A zero mean occurs while a run matches the input library exactly over the
    first `count` entries; dividing by it used to raise ZeroDivisionError.
    """
    mean=total/count
    if mean == 0:
        return float("inf")
    return 1/mean


#running totals of the distances, plus the inverse of the running mean
no_summer=0
yes_summer=0
running_euclidean_no=[]
running_euclidean_yes=[]
running_euclidean_no_avg=[]
running_euclidean_yes_avg=[]
for i in range(len(dock_x_list)):
    no_summer=no_summer+euclidean_no_roul[i]
    yes_summer=yes_summer+euclidean_roul[i]
    running_euclidean_no.append(no_summer)
    running_euclidean_yes.append(yes_summer)
    running_euclidean_no_avg.append(_inverse_mean(no_summer,i+1))
    running_euclidean_yes_avg.append(_inverse_mean(yes_summer,i+1))


#total distance of each run from the input library
print("No_Roul: " + str(no_summer))
print("Roul:    " + str(yes_summer))

In [None]:
#Plots euclidean distance the ZINC, torsion, and 
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, 0, 0.1])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Euclidean Distance',size=5)
plt.title('Euclidean distance per entry',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(dock_x_list, euclidean_roul, color='orange',label="Roulette")
plt.plot(dock_x_list, euclidean_no_roul, color='green',label="No Roulette")
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")
plt.legend(loc="upper right")

In [None]:
#Plots running euclidean distance the ZINC, torsion, and 
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, 0, 1.5])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Euclidean Distance',size=5)
plt.title('Running Euclidean Distance',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(dock_x_list, running_euclidean_yes, color='orange',label="Roulette")
plt.plot(dock_x_list, running_euclidean_no, color='green',label="No Roulette")
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")
plt.legend(loc="lower right")

In [None]:
#Plots running euclidean distance the ZINC, torsion, and 
plt.style.use('default')
fig=plt.figure(num=None, figsize=(8, 2), dpi=400, facecolor='w', edgecolor='k')
plt.axis([start_val_x, end_val_x, 0, 1000])
plt.xlabel('Torsion ID',size=5)
plt.ylabel('Average Euclidean Distance',size=5)
plt.title('Running Average Euclidean Distance',size=8)
plt.tick_params(axis='both',which='major',labelsize=6)
%config InlineBackend.figure_format ='retina'
plt.plot(dock_x_list, running_euclidean_yes_avg, color='orange',label="Roulette")
plt.plot(dock_x_list, running_euclidean_no_avg, color='green',label="No Roulette")
#plt.plot(x_list_1,y_list_1,color='green',label="No Roulette")
plt.legend(loc="lower right")

In [None]:
# Debug dump: prints every dock_list entry on its own line. After the loading
# cell each entry is [torsion label, count, id], ordered by descending count.
# NOTE(review): this prints the whole library; consider slicing (e.g.
# dock_list[:20]) if the output gets too long.
for x in dock_list:
    print(x)