# Packages


In [1]:
# import packages/modules
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from math import pow, exp, log, sqrt
# from scipy import optimize
import os
import re
import csv
import openpyxl
import xlsxwriter
# word
# from docx import Document

### Base Dir


In [2]:
# current directory
base_dir = os.getcwd()
print(f"base_dir {base_dir}")

base_dir e:\Python Projects\GaussParse\GaussParse\notebook


### List uploaded files from desired folder


In [3]:
def ListFiles(targetPath, fileExtension='txt'):
    '''
    list files in a target file

    args:
        targetPath: target path
        fileExtension: file extension, default is empty

    return:
        a list of files in the target path
    '''
    try:
        # check
        if os.path.exists(targetPath):
            filesFound = []
            for f in os.listdir(targetPath):
                if f.endswith('.'+str(fileExtension)):
                    filesFound.append(f)
            # res
            return filesFound
        else:
            raise Exception("target path is not valid.")

    except Exception as e:
        raise

In [4]:
def CheckFileFormat(filePath):
    '''
    check file format

    args:
        filePath: file name dir

    return:
        file directory, file name, file format
    '''
    # check file exist
    if os.path.isfile(filePath):
        # file analysis
        fileDir = os.path.dirname(filePath)
        fileNameWithExtension = os.path.basename(filePath)
        fileName, fileExtension = os.path.splitext(fileNameWithExtension)
        # res
        return fileDir, fileName, fileExtension
    else:
        raise Exception('file path is not valid.')

### Load file names

**_ `file_folder` _**

a directory in which all log file are stored.


In [5]:
# log dir
target_folder = "D:\\OneDrive\\Project Analysis\\Computational Chemistry\\analysis\\terpens epoxidation\\extract data\\fructose-dioxirane-limonene-mechanism-3"

# list files
file_list = ListFiles(target_folder, "txt")
print("res:", file_list)
print("count: ", len(file_list))

res: ['1,2LO.txt', 'PC1.txt', 'RC1.txt', 'TS1-cis.txt']
count:  4


In [7]:
# # file folder
# file_folder = "data\\isolate-sol"
# # target folder
# target_folder = os.path.join(base_dir, file_folder)

# # list files
# file_list = ListFiles(target_folder, "log")
# print("res:", file_list)

### Sort file list based on model


Read log file


In [6]:
def ReadFile(filePath):
    '''
    read the content of file and put the info in a matrix

    args:
        filePath: full name of file with directory and format

    return:
        res: matrix of info
    '''
    try:
        # file info
        fileDir, fileName, fileExtension = CheckFileFormat(filePath)

        # dict
        item_rows = {}
        column_names = ['Parameter', 'Value', 'Unit']

        # index
        k = 1

        # file open
        with open(filePath, "r") as f:
            # find main section
            fContent = f.readlines()

            for item in fContent:
                # to skip empty line
                if len(item) > 1:
                    # set id
                    _id = "data "+str(k)
                    # find "=" otherwise is description line
                    if item.find("=") > -1:
                        # split: data 1 = data 2
                        data1, data2 = item.strip().split("=", 1)

                        # key
                        _key = str(data1).replace("\n", "").strip()

                        # check "space" between number and string
                        _str_search = data2.strip()
                        if _str_search.find(" ") > -1:
                            # check begins with [-][+][number]
                            if _str_search.startswith("-") or _str_search.startswith("+") or _str_search[0].isdigit():
                                # sort data
                                _ext = re.search(
                                    r"([-+]?\d*\.?\d+([eE][-+]?\d+)?)\s*(.*)", _str_search, re.M)
                                # check
                                if _ext:
                                    numeric_value = float(_ext.group(1))
                                    unit = str(_ext.group(3))
                            else:
                                numeric_value = _str_search
                                unit = ''
                        else:
                            # only one part value (without unit)
                            # check numeric
                            _data2 = str(data2).replace(
                                "\n", "").replace(".", "").strip()
                            if _data2.isnumeric():
                                # number
                                numeric_value = float(data2)
                            else:
                                # string
                                numeric_value = data2
                            unit = ""

                    else:
                        # description line
                        _key = str(item).replace("\n", "")
                        numeric_value = ''
                        unit = ''

                    # store
                    item_rows[_id] = {"Parameter": str(
                        _key), "Value": numeric_value, "Unit": str(unit)}

                    # set
                    k += 1

        return fileName, item_rows, column_names
    except Exception as e:
        raise

In [7]:
def AnalyzeFiles(targetPath, fileList):
    '''
    analyze each file

    args:
        targetPath: target folder
        fileList: list of selected files

    output:
        res: dict
    '''
    try:
        # check
        if len(fileList) > 0:
            # res
            res = []
            for item in fileList:
                # file path
                _file_full_path = os.path.join(str(targetPath), str(item))
                # read file
                _res = ReadFile(_file_full_path)
                # save
                res.append(_res)

            # return
            return res
        else:
            raise Exception("file list is empty!")

    except Exception as e:
        raise

In [8]:
# target folder
print(target_folder)
# file list
print(file_list)
# count
print(len(file_list))

D:\OneDrive\Project Analysis\Computational Chemistry\analysis\terpens epoxidation\extract data\fructose-dioxirane-limonene-mechanism-3
['1,2LO.txt', 'PC1.txt', 'RC1.txt', 'TS1-cis.txt']
4


In [9]:
res = AnalyzeFiles(target_folder, file_list)
print(res[0][0])
print(res[0][1])
print(res[0][2])
print(len(res))

1,2LO
{'data 1': {'Parameter': 'Filename', 'Value': 'D:/OneDrive/Project Analysis/Computational Chemistry/models/terpens epoxidation/limonene-dmdo/mechanism 2/step 1/qst3/res 1/tight/res 1/irc/res 1/1,2 LO/res 1/link/res 1/solvent/res 2/g09_exp.log', 'Unit': ''}, 'data 2': {'Parameter': 'Overview Tab Data Section:', 'Value': '', 'Unit': ''}, 'data 3': {'Parameter': 'TS1 (Optimization completed)', 'Value': '', 'Unit': ''}, 'data 4': {'Parameter': 'D:/OneDrive/Project Analysis/Computational Chemistry/models/terpens epoxidation/limonene-dmdo/mechanism 2/step 1/qst3/res 1/tight/res 1/irc/res 1/1,2 LO/res 1/link/res 1/solvent/res 2/g09_exp.log', 'Value': '', 'Unit': ''}, 'data 5': {'Parameter': 'File Type', 'Value': ' .log', 'Unit': ''}, 'data 6': {'Parameter': 'Calculation Type', 'Value': ' FREQ', 'Unit': ''}, 'data 7': {'Parameter': 'Calculation Method', 'Value': ' RM062X', 'Unit': ''}, 'data 8': {'Parameter': 'Basis Set', 'Value': ' 6-311++G(d,p)', 'Unit': ''}, 'data 9': {'Parameter': 'C

In [10]:
# file name (sheet name)
dict_name = res[0][0]
print(dict_name)

dict_data = list(res[0][1].values())
print(dict_data)

# define columns
dict_column_name = res[0][2]
print(dict_column_name)

1,2LO
[{'Parameter': 'Filename', 'Value': 'D:/OneDrive/Project Analysis/Computational Chemistry/models/terpens epoxidation/limonene-dmdo/mechanism 2/step 1/qst3/res 1/tight/res 1/irc/res 1/1,2 LO/res 1/link/res 1/solvent/res 2/g09_exp.log', 'Unit': ''}, {'Parameter': 'Overview Tab Data Section:', 'Value': '', 'Unit': ''}, {'Parameter': 'TS1 (Optimization completed)', 'Value': '', 'Unit': ''}, {'Parameter': 'D:/OneDrive/Project Analysis/Computational Chemistry/models/terpens epoxidation/limonene-dmdo/mechanism 2/step 1/qst3/res 1/tight/res 1/irc/res 1/1,2 LO/res 1/link/res 1/solvent/res 2/g09_exp.log', 'Value': '', 'Unit': ''}, {'Parameter': 'File Type', 'Value': ' .log', 'Unit': ''}, {'Parameter': 'Calculation Type', 'Value': ' FREQ', 'Unit': ''}, {'Parameter': 'Calculation Method', 'Value': ' RM062X', 'Unit': ''}, {'Parameter': 'Basis Set', 'Value': ' 6-311++G(d,p)', 'Unit': ''}, {'Parameter': 'Charge', 'Value': 0.0, 'Unit': ''}, {'Parameter': 'Spin', 'Value': ' Singlet', 'Unit': ''},

# Save csv


In [5]:
def dict_to_csv(dict_data, dict_column_name, csv_name, path=''):
    with open(str(csv_name), 'w') as f:
        writer = csv.DictWriter(f, fieldnames=dict_column_name)
        writer.writeheader()
        writer.writerows(dict_data)

In [None]:
with open('test.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=dict_column_name)
    writer.writeheader()
    writer.writerows(dict_data)

### Display data in table


In [11]:
def save_data_to_excel(d, excel_file_dir='', excel_file_name='res', excel_engine='xlsxwriter'):
    '''
    save gaussian output.log to excel

    args:
        d: input data
            file name
    '''
    # file name
    excel_file_name = excel_file_name+'.xlsx'
    #  excel file path
    if len(excel_file_dir) > 0:
        # chosen path
        excel_file_path = os.path.join(
            excel_file_dir, excel_file_name)
    else:
        # current directory
        excel_file_path = os.getcwd()+'/'+excel_file_name

    # check file exist
    # if not os.path.isfile(excel_file_path):
    #     if not os.path.exists(excel_file_path):
    #         # create excel file
    #         writer = pd.ExcelWriter(excel_file_path,engine='xlsxwriter')
    #         # writer.close()

    writer = pd.ExcelWriter(excel_file_path, engine='xlsxwriter')

    # add each df to the excel sheet
    for item in d:
        # sheet name
        sheet_name = item[0]
        sheet_data = list(item[1].values())
        sheet_column_name = item[2]

        # df
        df = pd.DataFrame.from_dict(sheet_data)
        # size
        row, col = df.shape

        # check excel engine
        if excel_engine == 'xlsxwriter':
            # store
            df.to_excel(writer, sheet_name=str(sheet_name),
                        index=False, header=True, columns=sheet_column_name)

            # workbook and worksheet objects
            workbook = writer.book
            worksheet = writer.sheets[str(sheet_name)]

            # set column
            worksheet.set_column(0, 0, 110)
            worksheet.set_column(1, 2, 30)

            # set boarder
            border_format = workbook.add_format({'border': 1})
            worksheet.conditional_format(
                0, 0, row, col, {'type': 'no_blanks', 'format': border_format})
            worksheet.conditional_format(
                0, 0, row, col, {'type': 'blanks', 'format': border_format})

        else:
            # store
            with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists="new") as writer2:
                try:
                    df.to_excel(writer2, sheet_name=str(
                        sheet_name), index=False, header=True, columns=sheet_column_name)
                except:
                    df.to_excel(writer2, sheet_name=str(
                        sheet_name), index=False)

    # close
    writer.close()

Save to excel files


In [14]:
# test
excel_file_name = "fructose-dioxirane-m3"


# store data in excel


save_data_to_excel(res, excel_file_dir=target_folder,
                   excel_file_name=excel_file_name)