In [1]:
import pandas as pd
from collections import Counter
from pypif import pif
from pypif.obj import *

## Optional: Time module and starting time to measure runtime of script.
import time

# Wall-clock timestamp taken when this first cell runs. It is not read again in
# the cells visible here (the conversion cell keeps its own start_time).
initialized_time = time.time()

In [2]:
## Defining the start/end file range to convert (for example, dsgdb9nsd_000001.xyz to dsgdb9nsd_010000.xyz)
## This script expects the dsgdb .xyz files to be in a folder named "dsgdb9nsd_files", located in this
## same working directory

## NOTE: the original plan was to take a representative subset (every 13th file, about 10,000 files)
## spanning the simpler molecules in the early files up to the complex ones near the end. The
## conversion loop below does not apply that stride: it reads every file from starting_file_no
## through ending_file_no inclusive.

# Directory that holds the dsgdb9nsd_NNNNNN.xyz input files.
# NOTE(review): this is an absolute, machine-specific path, while the comment
# above expects a "dsgdb9nsd_files" folder in the working directory -- adjust
# before running on another machine.
file_path = "/Users/robertmanriquez/Documents/citrine_challenge/dsgdb9nsd_files"
# First and last file numbers to convert; both ends are included by the
# conversion loop (it iterates up to ending_file_no + 1, exclusive).
starting_file_no = 1
ending_file_no = 133885

In [3]:
def xyz_to_df(xyz_file):
    """Convert the lines of one dsgdb9nsd .xyz file into a one-row DataFrame.

    Parameters
    ----------
    xyz_file : list of str
        The file's lines as returned by ``readlines()``. Layout relied on:
        line 1 holds the atom count, line 2 the tab-separated scalar
        properties prefixed with ``gdb``, the last three lines hold the
        harmonic frequencies, the SMILES strings and the InChI strings, and
        everything in between is the per-atom block.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with the columns ``n_atoms``, ``i``, the
        fifteen scalar properties, ``Harmonic_freqs``, ``SMILES``, ``InChI``
        and ``atoms_xyz``. ``n_atoms`` is kept as the raw string from the
        file; the three list-valued fields are stored as their ``str()``
        representation so they fit in a single cell.
    """
    # Column names for scalar positions 1..15 of the second line
    # (position 0 is the molecule index, stored separately as "i").
    property_names = (
        "A", "B", "C", "u", "alpha", "HOMO", "LUMO", "HL_gap",
        "<R2>", "zpve", "U_0K", "U_298K", "H", "G", "Cv",
    )

    # Second line: drop the surrounding 'gdb ' characters and the trailing
    # tab/newline, then parse every tab-separated token as a float.
    header_text = xyz_file[1].strip('gdb ').strip('\t\n')
    scalars = [float(token) for token in header_text.split('\t')]

    # Per-atom lines (element, x, y, z, charge) are kept verbatim as one
    # string; they are parsed later when the .pif is built.
    atom_block = ''.join(xyz_file[2:-3])

    record = {"n_atoms": xyz_file[0].strip('\n')}
    record["i"] = int(scalars[0])
    for position, column in enumerate(property_names, start=1):
        record[column] = scalars[position]

    # Harmonics, SMILES and InChI are lists rendered as strings so that each
    # fits in one DataFrame cell.
    frequencies = [float(token) for token in xyz_file[-3].strip('\n').split('\t')]
    record["Harmonic_freqs"] = str(frequencies)
    record["SMILES"] = str(xyz_file[-2].strip('\t\n').split('\t'))
    record["InChI"] = str(xyz_file[-1].strip('\n').split('\t'))
    record["atoms_xyz"] = atom_block

    return pd.DataFrame(record, index=[0])

In [4]:
# Optional: Timing the function (does 10k files in ~1.1 min, 100k in 15+ min)
start_time = time.time()

# Converts every .xyz file in [starting_file_no, ending_file_no] to a
# DataFrame row and writes the rows out as numbered CSV part files
# (pt1_..., pt2_..., ...), one part per block of file numbers ending on a
# multiple of 10,000 plus a final part for the remainder.

df = pd.DataFrame()

# Single-row frames of the part currently being accumulated. They are
# concatenated once per part instead of re-copying a growing DataFrame on
# every file.
chunk_frames = []

pt_num = 1  # Sequence number used in the name of the next CSV part file.

for i in range(starting_file_no, ending_file_no + 1):

    xyz_path = str(file_path) + "/dsgdb9nsd_" + str(i).zfill(6) + ".xyz"

    # The context manager closes each file handle as soon as it is read, so
    # the loop does not leave one open handle per input file behind.
    with open(xyz_path, 'r') as xyz_handle:
        loop_file = xyz_handle.readlines()

    chunk_frames.append(xyz_to_df(loop_file))

    # Flush a part file at every file number divisible by 10,000 and at the
    # very last file of the range.
    if i % 10000 == 0 or i == ending_file_no:

        print(i, round((time.time() - start_time)/60,2), " minutes")

        df = pd.concat(chunk_frames, axis = 0).set_index('i')
        df.to_csv('pt' + str(pt_num) + '_dsgdb_' + \
                      str(starting_file_no).zfill(6)+'_to_'+str(ending_file_no).zfill(6)+ '.csv')

        # Start the next part from scratch and release the rows just written.
        chunk_frames = []
        df = pd.DataFrame()
        pt_num += 1

10000 1.09  minutes
20000 2.23  minutes
30000 3.38  minutes
40000 4.56  minutes
50000 5.73  minutes
60000 6.96  minutes
70000 8.12  minutes
80000 9.34  minutes
90000 10.61  minutes
100000 11.78  minutes
110000 12.95  minutes
120000 14.15  minutes
130000 15.34  minutes
133885 15.63  minutes


In [5]:
# Combine the CSV part files written by the conversion step into one frame
# and save it as a single CSV.

# The conversion run for files 000001-133885 produced parts pt1 ... pt14.
N_PARTS = 14

# Read every part first and concatenate once; growing agg_df with a concat
# per part would re-copy all previously loaded rows each time.
part_frames = [
    pd.read_csv('pt'+ str(n) + '_dsgdb_000001_to_133885.csv')
    for n in range(1, N_PARTS + 1)
]
agg_df = pd.concat(part_frames, axis = 0)
del part_frames  # the per-part frames are no longer needed once combined

# index = False: the per-part row index is not written to the combined file.
agg_df.to_csv('all_dsgdb.csv', index = False)

In [6]:
# Preview the first rows of the combined frame (rendered as the cell output).
agg_df.head()

Unnamed: 0,i,n_atoms,A,B,C,u,alpha,HOMO,LUMO,HL_gap,...,zpve,U_0K,U_298K,H,G,Cv,Harmonic_freqs,SMILES,InChI,atoms_xyz
0,1,5,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469,"[1341.307, 1341.3284, 1341.365, 1562.6731, 156...","['C', 'C']","['InChI=1S/CH4/h1H4', 'InChI=1S/CH4/h1H4']",C\t-0.0126981359\t 1.0858041578\t 0.0080009958...
1,2,4,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,...,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,"[1103.8733, 1684.1158, 1684.3072, 3458.7145, 3...","['N', 'N']","['InChI=1S/H3N/h1H3', 'InChI=1S/H3N/h1H3']",N\t-0.0404260543\t 1.0241077531\t 0.0625637998...
2,3,3,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,"[1671.4222, 3803.6305, 3907.698]","['O', 'O']","['InChI=1S/H2O/h1H2', 'InChI=1S/H2O/h1H2']",O\t-0.0343604951\t 0.9775395708\t 0.0076015923...
3,4,4,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,"[549.7648, 549.7648, 795.2713, 795.2713, 2078....","['C#C', 'C#C']","['InChI=1S/C2H2/c1-2/h1-2H', 'InChI=1S/C2H2/c1...",C\t 0.5995394918\t 0.\t 1.\t-0.207019\nC\t-0.5...
4,5,3,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278,"[799.0101, 799.0101, 2198.4393, 3490.3686]","['C#N', 'C#N']","['InChI=1S/CHN/c1-2/h1H', 'InChI=1S/CHN/c1-2/h...",C\t-0.0133239314\t 1.1324657151\t 0.0082758861...


In [7]:
# Report the (rows, columns) size of the combined frame.
agg_df.shape

(133885, 21)