In [2]:
import numpy as np  
  
def generate_water_structure(num_molecules):  
    # Constants for water geometry  
    O_H_distance = 0.96  # angstroms  
    H_O_H_angle = 104.5  # degrees in angle  
      
    # Start position for the first oxygen atom  
    start_pos = np.array([0.0, 0.0, 0.0])  
    molecule_distance = 3.0  # Distance between molecules in the grid  
      
    # Convert H-O-H angle to radians for calculations  
    angle_rad = np.radians(H_O_H_angle / 2)  
      
    # Calculate initial positions for hydrogen atoms relative to oxygen  
    h1_rel_pos = np.array([O_H_distance * np.sin(angle_rad), O_H_distance * np.cos(angle_rad), 0.0])  
    h2_rel_pos = np.array([-O_H_distance * np.sin(angle_rad), O_H_distance * np.cos(angle_rad), 0.0])  
      
    print(f"{num_molecules * 3}\ncharge 0 multiplicity 1 unit angstrom")  
      
    # Generate and print positions  
    for i in range(num_molecules):  
        # Calculate grid positions  
        x = (i % 4) * molecule_distance  
        y = ((i // 4) % 4) * molecule_distance  
        z = (i // 16) * molecule_distance  
          
        o_pos = start_pos + np.array([x, y, z])  
        h1_pos = o_pos + h1_rel_pos  
        h2_pos = o_pos + h2_rel_pos  
          
        # Print molecule positions  
        print(f"O       {o_pos[0]:.6f}   {o_pos[1]:.6f}   {o_pos[2]:.6f}")  
        print(f"H       {h1_pos[0]:.6f}    {h1_pos[1]:.6f}   {h1_pos[2]:.6f}")  
        print(f"H       {h2_pos[0]:.6f}    {h2_pos[1]:.6f}   {h2_pos[2]:.6f}")


In [9]:
generate_water_structure(63)

189
charge 0 multiplicity 1 unit angstrom
O       0.000000   0.000000   0.000000
H       0.759062    0.587729   0.000000
H       -0.759062    0.587729   0.000000
O       3.000000   0.000000   0.000000
H       3.759062    0.587729   0.000000
H       2.240938    0.587729   0.000000
O       6.000000   0.000000   0.000000
H       6.759062    0.587729   0.000000
H       5.240938    0.587729   0.000000
O       9.000000   0.000000   0.000000
H       9.759062    0.587729   0.000000
H       8.240938    0.587729   0.000000
O       0.000000   3.000000   0.000000
H       0.759062    3.587729   0.000000
H       -0.759062    3.587729   0.000000
O       3.000000   3.000000   0.000000
H       3.759062    3.587729   0.000000
H       2.240938    3.587729   0.000000
O       6.000000   3.000000   0.000000
H       6.759062    3.587729   0.000000
H       5.240938    3.587729   0.000000
O       9.000000   3.000000   0.000000
H       9.759062    3.587729   0.000000
H       8.240938    3.587729   0.000000
O   

In [22]:
db_path = '/data/pubchem_pm6/data.db'

In [23]:
import apsw
connection = apsw.Connection(db_path)  
cursor = connection.cursor()  


In [25]:

query = "SELECT name FROM sqlite_master WHERE type='table';"  
    
selected_molecules = []  
for row in cursor.execute(query):
    print(row)


In [26]:
table_names = ['data', 'nuclear_charges', 'basisset', 'metadata']
for table_name in table_names:
    print(table_name) 
    query = f"SELECT COUNT(*) FROM {table_name}"
    cursor.execute(f"SELECT COUNT(*) FROM {table_name};")  
    count = cursor.fetchone()[0] # fetchone()返回一个元组，我们需要第一个元素 
    print(f"The table '{table_name}' has {count} records.")
    query = f"PRAGMA table_info({table_name});"     
    for row in cursor.execute(query):
        print(row)
    print()

data


SQLError: SQLError: no such table: data

In [6]:
type(row[1])

str

In [7]:
import pickle

In [8]:
import numpy as np
# def _deblob(buf, dtype=np.float32, shape=None):
#     """Convert blob/buffer object to numpy array. Warns and returns None if shape cannot be transformed.
    
#     Args:
#         buf: The buffer object to be converted.
#         dtype: Data type of the resulting array, defaults to np.float32.
#         shape: The desired shape of the array.
    
#     Returns:
#         numpy.ndarray or None: The transformed array with the desired shape, or None if the transformation is not possible.
        
#     Note:
#         This function includes a check for shape transformability because, in the dataset creation process,
#         non-existent keys are sometimes stored as np.array(1), which cannot be reshaped as intended.
#     """
#     if buf is None:
#         return np.zeros(shape)
    
#     array = np.frombuffer(buf, dtype=dtype)
#     if not np.little_endian:
#         array = array.byteswap()
    
#     if shape is not None and np.prod(shape) != array.size:
#         # warnings.warn("Buffer shape cannot be transformed to the desired shape, returning None.")
#         return None
    
#     array.shape = shape
#     return array

def _deblob(buf, dtype=np.float32):
    """Convert blob/buffer object to numpy array. Warns and returns None if shape cannot be transformed.
    
    Args:
        buf: The buffer object to be converted.
        dtype: Data type of the resulting array, defaults to np.float32.
        shape: The desired shape of the array.
    
    Returns:
        numpy.ndarray or None: The transformed array with the desired shape, or None if the transformation is not possible.
        
    Note:
        This function includes a check for shape transformability because, in the dataset creation process,
        non-existent keys are sometimes stored as np.array(1), which cannot be reshaped as intended.
    """
    # if buf is None:
    #     return np.zeros(shape)
    
    array = np.frombuffer(buf, dtype=dtype)
    if not np.little_endian:
        array = array.byteswap()
    
    # if shape is not None and np.prod(shape) != array.size:
    #     # warnings.warn("Buffer shape cannot be transformed to the desired shape, returning None.")
    #     return None
    
    # array.shape = shape
    return array


In [13]:
col_names = ['id', 'R', 'E', 'F', 'A', 'G', 'L']
col_names_str = ",".join(col_names)
query = f"SELECT {col_names_str} FROM data LIMIT 5"
for row in cursor.execute(query):
    print(row)
    for col_name, original_item in zip(col_names, row):
        if type(original_item) is bytes:
            # print(original_item)            
            try:
                # item = pickle.loads(original_item) 
                item = _deblob(original_item)
            except pickle.UnpicklingError:
                print("failed: ", col_name)
                continue            
        else:
            item = original_item
        print(col_name, item)
        try:
            print(item.shape)
        except:
            pass
        

(0, b'.\x17\xd5@\xc1\xdd\x82@\x87\xd7\x99?<I\xbc@\xc3w@@K\xe0\xe6>\x16e\xa5@\xdcF\x03@\xb0\x8a\xab?o\xb6\xbe@u\x1ay?Z\xb1\xf3?\r\xa1\xcd@\xe6o\x82\xbd\x11\xbd\xa4?\x05L\xc8@\xdb\x80\x9f\xbe\xf0\xba\x1e\xbeu\xe0\xf3@b(\xa7\xbepZP\xbf\xa9\xab\x01A\xe7\xf6Z?\x06|\xb2\xbf\x9c\x85\x15A\x13yb?\xfe&\x02\xc0\x94\xf1!A\xd4>\x8d\xbeH\xf9\x05\xc0\xba~\x1aA\xa5D\xba\xbf\x03\xe7\xc0\xbf\x07\x94\x06A\xb7\x01\xbf\xbf\x9b\x16\\\xbf\x81\xe2\xfe@\xc1\xdd1\xc0\x1d3P\xbe\x15\xd9\xdb@&\xa5Z\xc0^F\x81\xbfK\xa6\xeb@\x96\x1c}\xc0\x0c\xe0\x13\xc0v!\x03A\xd8\xd3\xa7\xc0\xb4\x1a\x0c\xc0\x18\x90\xed@\xc7)\xca\xc0!\xa6\xb4\xbf;\xbd\xdd@\x13\x05\xb9\xc0\xf0\xba~\xbd\x19\x86\xc4@\x1a\x9e\x8e\xc0\xd9\x8fT\xbe\xfbR\x98@\xb5\xf8\x95\xc0\x12,V\xbf\xfd`\xc1@\xbe\xbc\x8c?\xd8ZW@7J\xb9@\x07\xac\x1c\xbe\x8bg\x84@\xc6F\xdf@\xd3\xb7\x90\xbf\x9a\x94\x85@\xb2\x8c\x04A\xcbb\xe2\xbesF\x93@\rx\x08AmQV?q3r@\xa0\xbe\xe9@\xc8\x02\xe2?n\xd3s@\xd4\xa6\x03A\x10S\xe2\xbdt\xb0\xc2@nd\xf1@\x0f#,?OS\xd6@\xf4\x84\x12A=\x05h\xbf\x91?\xd7@?\x8

In [41]:
import lmdb  
import pickle  
import os  
  
# 定义一组我们感兴趣的元素（碳氮氧氢氯的原子序数）  
INTERESTED_ELEMENTS = {1, 6, 7, 8, 17}  
  
# 创建或打开一个 LMDB 数据库环境  
env = lmdb.open('/data/pubchem_pm6/', readonly=True, map_size=100*1024*1024)  # 数据库大小设为 100MB  
  
def is_valid_system(atoms):  
    """检查体系是否只包含特定元素，并且原子数量在1到100之间""" 
    print('mol size:', len(atoms))
    if 1 <= len(atoms) <= 100 and all(atom+1 in INTERESTED_ELEMENTS for atom in atoms):  
        return True  
    return False  
  
def write_xyz_file(data, file_path):  
    """将体系信息写入.xyz文件"""  
    # if not os.path.exists(file_path):
    #     os.makedirs(file_path)
    with open(file_path, 'w') as f:  
        f.write(f"{len(data['coords'])}\n")  
        f.write("charge 0 multiplicity 1 unit angstrom\n")  
        for atom, pos in zip(data['node_feat'][:,0], data['coords']):  
            f.write(f"{atom}   {'   '.join(map(str, pos))}\n")  
  
def get_atom_count_interval(atom_count):  
    """根据原子数量返回对应的区间"""  
    return (atom_count - 1) // 10  
  
# 用于跟踪每个原子数量区间内已选取的体系数量  
interval_count = {}  
  
# 开始只读事务  
with env.begin() as txn:  
    cursor = txn.cursor()  
    selected_systems = 0  
    
    for key, value in cursor:  
        # 解码和反序列化值  
        data = pickle.loads(value)  
          
        # 检查原子类型和数量是否符合条件  
        if is_valid_system(data['node_feat'][:,0].tolist()): 
        # if True==True: 
            atom_count = len(data['node_feat'][:,0])
            # if atom_count <= 200:
            #     continue
            interval = get_atom_count_interval(atom_count)  
              
            # 确保每个区间最多有两个体系  
            if interval_count.get(interval, 0) < 2:  
                # 符合条件，写入.xyz文件  
                file_path = os.path.join('/home/weixinran/MADFT-NN/local_files/pubchem_1_200/', f"pubchem{len(data['node_feat'])}.xyz")  
                write_xyz_file(data, file_path)  
                  
                # 更新区间内的体系数量  
                interval_count[interval] = interval_count.get(interval, 0) + 1  
                selected_systems += 1  
                  
                # 当选取足够的体系后退出循环  
                if selected_systems >= 50:  
                    break  
  
# 关闭环境  
env.close()

mol size: 57


FileNotFoundError: [Errno 2] No such file or directory: '/home/weixinran/MADFT-NN/local_files/pubchem_1_100/pubchem57.xyz'

In [14]:
from madftnn.dataset.sqlite_database.hamiltonian_database_qhnet import HamiltonianDatabase_qhnet

filepath = '/data/pubchem/pubchem_big_nolimit.db'
db = HamiltonianDatabase_qhnet(filepath)



/data/pubchem/pubchem_big_nolimit.db


In [16]:
db[0][-1]

array([6, 6, 7, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 7, 6,
       6, 6, 6, 6, 6, 8, 8, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int32)

In [21]:
from madftnn.dataset.sqlite_database.hamiltonian_database_qhnet import HamiltonianDatabase_qhnet
import os  
  
# 定义一组我们感兴趣的元素（碳氮氧氢氯的原子序数）  
INTERESTED_ELEMENTS = {1, 6, 7, 8, 17}  
  
# 创建或打开一个 LMDB 数据库环境  
filepath = '/data/pubchem/pubchem_huge.db'
db = HamiltonianDatabase_qhnet(filepath)
  
def is_valid_system(atoms):  
    """检查体系是否只包含特定元素，并且原子数量在1到100之间""" 
    # print(atoms)
    if 1 <= len(atoms) <= 100 and all(atom in INTERESTED_ELEMENTS for atom in atoms):  
        return True  
    return False  
  
def write_xyz_file(data, file_path):  
    """将体系信息写入.xyz文件"""  
    # if not os.path.exists(file_path):
    #     os.makedirs(file_path)
    with open(file_path, 'w') as f:  
        f.write(f"{len(data['pos'])}\n")  
        f.write("charge 0 multiplicity 1 unit angstrom\n")  
        for atom, pos in zip(data['node_feat'][:,0], data['pos']):  
            f.write(f"{atom}   {'   '.join(map(str, pos))}\n")  
  
def get_atom_count_interval(atom_count):  
    """根据原子数量返回对应的区间"""  
    return (atom_count - 1) // 10  
  
# 用于跟踪每个原子数量区间内已选取的体系数量  
interval_count = {}  
  
# 开始只读事务  

selected_systems = 0  

for idx in range(len(db)):  
    # 解码和反序列化值  
    data = db[idx]
    L = data[-1]
    R = data[0]
    # 检查原子类型和数量是否符合条件  
    if is_valid_system(L):  
        atom_count = len(L)  
        interval = get_atom_count_interval(atom_count)  
            
        # 确保每个区间最多有两个体系  
        if interval_count.get(interval, 0) < 2:  
            # 符合条件，写入.xyz文件  
            # file_path = os.path.join('./local_files/pubchem_1_100/', f"pubchem{len(L)}.xyz") 
            # write_xyz_file(data, file_path)  
            print('idx:', idx, ', size:',len(L))
                
            # 更新区间内的体系数量  
            interval_count[interval] = interval_count.get(interval, 0) + 1  
            selected_systems += 1  
                
            # 当选取足够的体系后退出循环  
            if selected_systems >= 20:  
                break  


/data/pubchem/pubchem_huge.db
idx: 3 , size: 49
idx: 4 , size: 54


idx: 16 , size: 53
idx: 60 , size: 48


KeyboardInterrupt: 

In [9]:
import os  
  
# 原子符号到序数的映射表  
ATOM_LIST = [  
    'h',  'he', 'li', 'be', 'b',  'c',  'n',  'o',  'f',  'ne',  
    'na', 'mg', 'al', 'si', 'p',  's',  'cl', 'ar',  
    'k',  'ca', 'sc', 'ti', 'v',  'cr', 'mn', 'fe', 'co', 'ni', 'cu',  
    'zn', 'ga', 'ge', 'as', 'se', 'br', 'kr',  
    'rb', 'sr', 'y',  'zr', 'nb', 'mo', 'tc', 'ru', 'rh', 'pd', 'ag',  
    'cd', 'in', 'sn', 'sb', 'te', 'i',  'xe',  
    'cs', 'ba', 'la', 'ce', 'pr', 'nd', 'pm', 'sm', 'eu', 'gd', 'tb', 'dy',  
    'ho', 'er', 'tm', 'yb', 'lu', 'hf', 'ta', 'w',  're', 'os', 'ir', 'pt',  
    'au', 'hg', 'tl', 'pb', 'bi', 'po', 'at', 'rn',  
    'fr', 'ra', 'ac', 'th', 'pa', 'u',  'np', 'pu'  
]  
  
def adjust_atomic_symbols(filepath):  
    # 读取原始文件内容  
    with open(filepath, 'r') as file:  
        lines = file.readlines()  
  
    # 调整原子符号  
    adjusted_lines = []  
    for idx, line in enumerate(lines):  
        parts = line.split()  
        if len(parts) > 1 and idx>=2:  
            # 找到原子符号对应的序数，并加1  
            index = int(parts[0])
            # 由于列表索引从0开始，而原子序数从1开始，这里需要加2  
            new_symbol = ATOM_LIST[(index) % len(ATOM_LIST)].upper()  # 使用模运算防止越界
            print(new_symbol)  
            parts[0] = new_symbol  
            adjusted_line = "    ".join(parts) + "\n" 
            adjusted_lines.append(adjusted_line)
            print(adjusted_lines)  
        else:  
            adjusted_lines.append(line)  
  
    # 将修改后的内容写回文件  
    with open(filepath, 'w') as file:  
        file.writelines(adjusted_lines)  
  
def main(directory):  
    # 遍历指定目录下的所有xyz文件  
    for root, dirs, files in os.walk(directory):  
        for file in files:  
            if file.endswith(".xyz"):  
                filepath = os.path.join(root, file)  
                print(f"正在调整文件：{filepath}")  
                adjust_atomic_symbols(filepath)  
    print("所有文件调整完成。")  
  
# 指定你要调整的文件夹路径  
directory = "/home/weixinran/MADFT-NN/local_files/pubchem_1_200/"  
main(directory)  



正在调整文件：/home/weixinran/MADFT-NN/local_files/pubchem_1_200/pubchem195.xyz
正在调整文件：/home/weixinran/MADFT-NN/local_files/pubchem_1_200/pubchem201.xyz
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n']
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n', 'C 9.16936 -3.70225 4.66289\n']
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n', 'C 9.16936 -3.70225 4.66289\n', 'C 8.78243 -4.4286 5.95775\n']
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n', 'C 9.16936 -3.70225 4.66289\n', 'C 8.78243 -4.4286 5.95775\n', 'C 9.0706 -4.63969 3.44017\n']
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n', 'C 9.16936 -3.70225 4.66289\n', 'C 8.78243 -4.4286 5.95775\n', 'C 9.0706 -4.63969 3.44017\n', 'C 10.18029 -5.69884 3.42745\n']
C
['201\n', 'charge 0 multiplicity 1 unit angstrom\n', 'C 8.27576 -2.47079 4.45612\n', 'C 9.16936 -3.70