In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# Load the four reaction tables in one pass; the names on the left line up
# positionally with the paths below (presumably one file per level of theory,
# going by the file names).
b97d3_df, wb97xd3_df, ccsdtf12_dz_df, ccsdtf12_tz_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tests/data/reaction/b97d3.csv',
        'tests/data/reaction/wb97xd3.csv',
        'tests/data/reaction/ccsdtf12_dz.csv',
        'tests/data/reaction/ccsdtf12_tz.csv',
    )
)

In [3]:
# Row count of each table, one per line, in the order the tables were loaded.
print(*map(len, (b97d3_df, wb97xd3_df, ccsdtf12_dz_df, ccsdtf12_tz_df)), sep='\n')

16302
11926
11926
15


In [4]:
# Column names of the b97d3 table as a plain Python list.
b97d3_df.columns.tolist()

['idx', 'rsmi', 'psmi', 'rinchi', 'pinchi', 'dE0', 'dHrxn298', 'rmg_family']

In [5]:
# Drop the 'idx' column from every table so that it is neither a merge key
# nor a leftover value column in the later merge.
# errors='ignore' keeps this cell re-runnable: each line rebinds the same
# name, so on a second execution 'idx' is already gone and a plain drop()
# would raise KeyError.
b97d3_df = b97d3_df.drop(columns=['idx'], errors='ignore')
wb97xd3_df = wb97xd3_df.drop(columns=['idx'], errors='ignore')
ccsdtf12_dz_df = ccsdtf12_dz_df.drop(columns=['idx'], errors='ignore')
ccsdtf12_tz_df = ccsdtf12_tz_df.drop(columns=['idx'], errors='ignore')

In [6]:
# Tag the two value columns ('dE0' and 'dHrxn298') of each table with the
# table's own name so they stay distinguishable once the tables are merged.
# Columns not listed in a mapping keep their names.
b97d3_df = b97d3_df.rename(
    {'dE0': 'dE0_b97d3', 'dHrxn298': 'dHrxn298_b97d3'},
    axis='columns',
)
wb97xd3_df = wb97xd3_df.rename(
    {'dE0': 'dE0_wb97xd3', 'dHrxn298': 'dHrxn298_wb97xd3'},
    axis='columns',
)
ccsdtf12_dz_df = ccsdtf12_dz_df.rename(
    {'dE0': 'dE0_ccsdtf12_dz', 'dHrxn298': 'dHrxn298_ccsdtf12_dz'},
    axis='columns',
)
ccsdtf12_tz_df = ccsdtf12_tz_df.rename(
    {'dE0': 'dE0_ccsdtf12_tz', 'dHrxn298': 'dHrxn298_ccsdtf12_tz'},
    axis='columns',
)

In [7]:
# Outer-join the four tables on the columns they share, so a reaction that is
# present in only some of the tables is kept, with NaN in the value columns
# of the tables that lack it.
#
# Things to keep in mind when reading the result:
# - pandas matches null keys against each other, so rows whose 'rmg_family'
#   is missing on both sides still join on the remaining key columns.
# - A key combination that occurs more than once within a table yields one
#   output row per matching pair, so non-null counts in the merged frame can
#   exceed the row counts of the individual tables.
MERGE_KEYS = ['rsmi', 'psmi', 'rinchi', 'pinchi', 'rmg_family']

# Fold the remaining tables into the b97d3 table, left to right; the order
# determines the column order of the result.
merged_df = b97d3_df
for other_df in (wb97xd3_df, ccsdtf12_dz_df, ccsdtf12_tz_df):
    merged_df = pd.merge(merged_df, other_df, on=MERGE_KEYS, how='outer')
merged_df

Unnamed: 0,rsmi,psmi,rinchi,pinchi,dE0_b97d3,dHrxn298_b97d3,rmg_family,dE0_wb97xd3,dHrxn298_wb97xd3,dE0_ccsdtf12_dz,dHrxn298_ccsdtf12_dz,dE0_ccsdtf12_tz,dHrxn298_ccsdtf12_tz
0,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1]1([H:7])([H:8])/[C:2](=[N:3]\[H:9])[N:6]1...,InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h1H3,"InChI=1S/C2H3N3O/c3-2-1-5(2)4-6/h3H,1H2/b3-2+",77.68929,39.67568,,,,,,,
1,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1]([C:2](=[N:3][O-:4])[N+:6]#[N:5])([H:7])(...,InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h1H3,InChI=1S/C2H3N3O/c1-2(4-3)5-6/h1H3,45.36772,10.92837,,,,,,,
2,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1](=[C:2]1[N-:3][O+:4]=[N:5][N:6]1[H:9])([H...,InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h1H3,"InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h3H,1H2",84.71304,50.33734,,,,,,,
3,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1]([C:2]#[N:3])([H:7])([H:8])[H:9].[O:4]=[N...,InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h1H3,InChI=1S/C2H3N.N2O/c2*1-2-3/h1H3;,24.03598,-15.08950,,,,,,,
4,[C:1]([c:2]1[n:3][o:4][n:5][n:6]1)([H:7])([H:8...,[C:1](=[C:2]1[N:3]([H:8])[O:4][N:5]=[N:6]1)([H...,InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h1H3,"InChI=1S/C2H3N3O/c1-2-3-5-6-4-2/h4H,1H2",77.67989,32.07768,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17823,[C:1]([C@@:2]1([H:11])[C@@:3]2([H:12])[C:4]([H...,[C:1]([C@:2]12[C:3]([H:11])([H:12])[C@@:7]1([H...,"InChI=1S/C7H10/c1-5-6-3-2-4-7(5)6/h2-3,5-7H,4H...","InChI=1S/C7H10/c1-3-5-6-4-7(5,6)2/h3,5-6H,1,4H...",,,,117.48318,31.76502,112.31594,33.00587,,
17824,[N:1]([c:2]1[c:3]([H:10])[n:4]([H:11])[c:5]([O...,[N+:1]1([H:8])([H:9])[C:2](=[C:3]([N:4]([C:5][...,"InChI=1S/C3H5N3O/c4-2-1-5-3(7)6-2/h1H,4H2,(H2,...","InChI=1S/C3H5N3O/c7-2-4-1-3-5-6-3/h1,4,7H,5H2",,,,129.66013,127.26319,126.96395,124.91065,,
17825,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]1([H:8])([H:10])[C@:2]([O:3][H:12])([H:11...,"InChI=1S/C4H8O3/c1-4(6)2-7-3-5/h3-4,6H,2H2,1H3...","InChI=1S/C4H8O3/c5-3-1-4(6)7-2-3/h3-6H,1-2H2/t...",,,,87.63014,9.24800,86.82397,8.28092,,
17826,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C@:2]1([H:11])[C:4]([H:13])([H:14])[O:5...,"InChI=1S/C4H8O3/c1-4(6)2-7-3-5/h3-4,6H,2H2,1H3...","InChI=1S/C4H6O2.H2O/c1-3-2-6-4(3)5;/h3H,2H2,1H...",,,,90.30378,7.00548,85.95415,6.62931,,


In [8]:
# Number of non-null entries per column of the merged table.
merged_df.notna().sum()

rsmi                    17828
psmi                    17828
rinchi                  17828
pinchi                  17828
dE0_b97d3               16302
dHrxn298_b97d3          16302
rmg_family               1877
dE0_wb97xd3             11932
dHrxn298_wb97xd3        11932
dE0_ccsdtf12_dz         11932
dHrxn298_ccsdtf12_dz    11932
dE0_ccsdtf12_tz            15
dHrxn298_ccsdtf12_tz       15
dtype: int64

In [10]:
# Show every merged row whose key columns occur more than once, i.e. rows
# that share the same rsmi/psmi/rinchi/pinchi/rmg_family but may carry
# different energy values.
# keep=False flags all members of a duplicate group, not only the repeats.
# Sorting by the complete key keeps the members of each group next to each
# other; sorting by 'rsmi' alone does not guarantee that when one reactant
# has several duplicated products.
dup_keys = ['rsmi', 'psmi', 'rinchi', 'pinchi', 'rmg_family']
merged_df[merged_df.duplicated(subset=dup_keys, keep=False)].sort_values(dup_keys)

Unnamed: 0,rsmi,psmi,rinchi,pinchi,dE0_b97d3,dHrxn298_b97d3,rmg_family,dE0_wb97xd3,dHrxn298_wb97xd3,dE0_ccsdtf12_dz,dHrxn298_ccsdtf12_dz,dE0_ccsdtf12_tz,dHrxn298_ccsdtf12_tz
7344,[C:1]([C:2]1([H:10])[O:3][C:4]([H:11])([H:12])...,[C:1]([C:2](=[O:6])[H:10])([H:7])([H:8])[H:9]....,"InChI=1S/C4H8O2/c1-4-5-2-3-6-4/h4H,2-3H2,1H3","InChI=1S/2C2H4O/c2*1-2-3/h2*2H,1H3",54.98014,0.96357,,69.42974,3.0123,67.79682,3.85493,,
7343,[C:1]([C:2]1([H:10])[O:3][C:4]([H:11])([H:12])...,[C:1]([C:2](=[O:6])[H:10])([H:7])([H:8])[H:9]....,"InChI=1S/C4H8O2/c1-4-5-2-3-6-4/h4H,2-3H2,1H3","InChI=1S/2C2H4O/c2*1-2-3/h2*2H,1H3",54.98547,0.95482,,69.42974,3.0123,67.79682,3.85493,,
8510,[C:1]([C@@:2]([C:3]([O:4][H:14])([H:12])[H:13]...,[C:1](/[C:2](=[N:5]\[H:15])[H:11])([H:8])([H:9...,"InChI=1S/C4H9NO2/c1-4(2-6)5-3-7/h3-4,6H,2H2,1H...","InChI=1S/C2H5N.2CH2O/c1-2-3;2*1-2/h2-3H,1H3;2*...",65.4202,57.51205,,,,,,,
8509,[C:1]([C@@:2]([C:3]([O:4][H:14])([H:12])[H:13]...,[C:1](/[C:2](=[N:5]\[H:15])[H:11])([H:8])([H:9...,"InChI=1S/C4H9NO2/c1-4(2-6)5-3-7/h3-4,6H,2H2,1H...","InChI=1S/C2H5N.2CH2O/c1-2-3;2*1-2/h2-3H,1H3;2*...",65.4234,57.51292,,,,,,,
16296,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C:2](=[O:3])[H:11])([H:8])([H:9])[H:10]...,"InChI=1S/C4H8O3/c1-4(6)2-7-3-5/h3-4,6H,2H2,1H3...","InChI=1S/C2H4O.2CH2O/c1-2-3;2*1-2/h2H,1H3;2*1H2",68.29274,49.88476,,95.1593,47.95042,92.59918,46.15094,,
16295,[C:1]([C@@:2]([O:3][H:12])([C:4]([O:5][C:6](=[...,[C:1]([C:2](=[O:3])[H:11])([H:8])([H:9])[H:10]...,"InChI=1S/C4H8O3/c1-4(6)2-7-3-5/h3-4,6H,2H2,1H3...","InChI=1S/C2H4O.2CH2O/c1-2-3;2*1-2/h2H,1H3;2*1H2",68.31444,49.89513,,95.1593,47.95042,92.59918,46.15094,,
5953,[C:1]([C@@:2]1([H:10])[C:3]([H:11])([H:12])[N:...,[C:1]([C:2](=[O:6])[H:10])([H:7])([H:8])[H:9]....,"InChI=1S/C4H7NO/c1-4-2-5-3-6-4/h3-4H,2H2,1H3/t...","InChI=1S/C2H3N.C2H4O/c1-3-2;1-2-3/h1H,2H2;2H,1H3",58.40385,56.92557,,74.86391,61.98602,70.99353,62.06074,,
5954,[C:1]([C@@:2]1([H:10])[C:3]([H:11])([H:12])[N:...,[C:1]([C:2](=[O:6])[H:10])([H:7])([H:8])[H:9]....,"InChI=1S/C4H7NO/c1-4-2-5-3-6-4/h3-4H,2H2,1H3/t...","InChI=1S/C2H3N.C2H4O/c1-3-2;1-2-3/h1H,2H2;2H,1H3",58.40221,56.93225,,74.86391,61.98602,70.99353,62.06074,,
9485,[C:1]([C@@:2]1([H:11])[C:3]([H:12])([H:13])[N:...,[C:1]([C:2](=[C:3]([H:12])[H:13])[H:11])([H:8]...,"InChI=1S/C5H8N2/c1-5-4-7(5)3-2-6/h5H,3-4H2,1H3...","InChI=1S/C3H6.C2H2N2/c1-3-2;1-4-2-3/h3H,1H2,2H...",78.23222,6.23573,,,,,,,
9486,[C:1]([C@@:2]1([H:11])[C:3]([H:12])([H:13])[N:...,[C:1]([C:2](=[C:3]([H:12])[H:13])[H:11])([H:8]...,"InChI=1S/C5H8N2/c1-5-4-7(5)3-2-6/h5H,3-4H2,1H3...","InChI=1S/C3H6.C2H2N2/c1-3-2;1-4-2-3/h3H,1H2,2H...",76.29768,6.24361,,,,,,,
