In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Dipole-moment listing found on the NIST CCCBDB database website.
# pd.read_html returns a list of DataFrames, one per table parsed from the page.
CCCBDB_DIPOLE_URL = "https://cccbdb.nist.gov/diplistx.asp"
chem = pd.read_html(CCCBDB_DIPOLE_URL)


In [2]:
# Peek at the first four parsed tables to see which one holds the data we need.
chem[0:4]

[                                                   0  \
 0                                                NaN   
 1  Home All data for one species Geometry Experim...   
 2                                                NaN   
 3  You are here: Experimental > Electrostatics > ...   
 
                                                    1  
 0  Computational Chemistry Comparison and Benchma...  
 1                                                NaN  
 2                                                NaN  
 3                                                NaN  ,
                   0                              1      2      3       4  \
 0          Molecule                           name      x      y       z   
 1                H2              Hydrogen diatomic  0.000  0.000   0.000   
 2                D2             Deuterium diatomic    NaN    NaN     NaN   
 3               LiH                Lithium Hydride    NaN    NaN     NaN   
 4               Li2               Lithium diat

### 由上表可得知，`chem` 是一個由多個 DataFrame 組成的 list（`pd.read_html` 會回傳網頁上所有的表格），其中多數與本次分析無關。
### 逐一檢查其內容後發現，`chem[1]` 才是真正與 dipole moment 相關的表格。

In [3]:
# chem[1] is the table we want; work on a copy so the parsed original in
# `chem` is never modified by later cells.
chem_table = chem[1].copy()
chem_table.head(n=20)


Unnamed: 0,0,1,2,3,4,5,6,7
0,Molecule,name,x,y,z,tot,squib,commment
1,H2,Hydrogen diatomic,0.000,0.000,0.000,0.000,NSRDS-NBS10,
2,D2,Deuterium diatomic,,,,0.000,NSRDS-NBS10,
3,LiH,Lithium Hydride,,,,5.880,NSRDS-NBS10,
4,Li2,Lithium diatomic,,,,0.000,NSRDS-NBS10,
5,Be2,Beryllium diatomic,,,,0.000,NSRDS-NBS10,
6,BH,Boron monohydride,,,,1.270,1969Tho/Dal:1155,+- 0.21
7,C2,Carbon diatomic,,,,0.000,NSRDS-NBS10,
8,CN,Cyano radical,0.000,0.000,-1.450,1.450,1968Tho/Dal:2815,
9,N2,Nitrogen diatomic,,,,0.000,NSRDS-NBS10,


### 另外，表格的欄位名稱並未被讀成標題，而是落在第一列資料（index 0）中，因此將該列取出設為欄位標題，並自資料中移除。

In [4]:
# The first row of chem_table carries the real column names.
colname = chem_table.iloc[0]
# Everything after that first row is data; copy it so chem_table stays intact.
chem_table_new = chem_table.iloc[1:].copy()
# Promote the extracted row to be the column header of the new frame.
chem_table_new.columns = colname


In [5]:
# Show the first 20 rows to confirm the header row was promoted correctly.
chem_table_new.head(n=20)

Unnamed: 0,Molecule,name,x,y,z,tot,squib,commment
1,H2,Hydrogen diatomic,0.0,0.0,0.0,0.0,NSRDS-NBS10,
2,D2,Deuterium diatomic,,,,0.0,NSRDS-NBS10,
3,LiH,Lithium Hydride,,,,5.88,NSRDS-NBS10,
4,Li2,Lithium diatomic,,,,0.0,NSRDS-NBS10,
5,Be2,Beryllium diatomic,,,,0.0,NSRDS-NBS10,
6,BH,Boron monohydride,,,,1.27,1969Tho/Dal:1155,+- 0.21
7,C2,Carbon diatomic,,,,0.0,NSRDS-NBS10,
8,CN,Cyano radical,0.0,0.0,-1.45,1.45,1968Tho/Dal:2815,
9,N2,Nitrogen diatomic,,,,0.0,NSRDS-NBS10,
10,OH,Hydroxyl radical,,,,1.66,NSRDS-NBS10,


* chem : 原始資料
* chem[1] : 為 molecule 部分(我們要的)資料，但多一列
* chem_table ：為 chem[1] 之複製檔，避免更改到原始檔。
* chem_table_new : 為已更改過標題之資料檔

然而因本資料中，許多分子為線性分子，無xyz軸之分，故chem_table僅有total dipole moment，無各軸分量；
也有一些分子因不明原因(或未測定、或未有確切值)而資料為空，接下來將所有無值空格一律填入-1，以免未來分析出現error。

In [6]:
# Replace every NaN in the table with the sentinel value -1.
#
# The previous loop iterated over `chem_table_new[2:-1]`. That expression is a
# *row* slice, and iterating a DataFrame yields its column labels, so the loop
# visited every column regardless of the 2:-1 bounds. It then called
# `chem_table_new[i].fillna(..., inplace=True)`, a chained in-place update that
# acts on a temporary Series when pandas Copy-on-Write is active and therefore
# leaves the frame unchanged.
#
# A single frame-level fillna with the result assigned back fills the same
# cells (all columns) without relying on chained mutation, and re-running the
# cell gives the same result.
chem_table_new = chem_table_new.fillna(value=-1)
chem_table_new.head(20)

Unnamed: 0,Molecule,name,x,y,z,tot,squib,commment
1,H2,Hydrogen diatomic,0.0,0.0,0.0,0.0,NSRDS-NBS10,-1
2,D2,Deuterium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1
3,LiH,Lithium Hydride,-1.0,-1.0,-1.0,5.88,NSRDS-NBS10,-1
4,Li2,Lithium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1
5,Be2,Beryllium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1
6,BH,Boron monohydride,-1.0,-1.0,-1.0,1.27,1969Tho/Dal:1155,+- 0.21
7,C2,Carbon diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1
8,CN,Cyano radical,0.0,0.0,-1.45,1.45,1968Tho/Dal:2815,-1
9,N2,Nitrogen diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1
10,OH,Hydroxyl radical,-1.0,-1.0,-1.0,1.66,NSRDS-NBS10,-1


接下來簡單的將資料進行分析，以 total dipole moment 大小之四捨五入分類。

此時在資料中會多出一直行稱為  dipole_cat  的分類。

In [7]:
# Add a category column: the total dipole moment ("tot") rounded to an integer.
# NOTE: np.around rounds exact halves to the nearest even value, so e.g. 0.5
# becomes 0.0 and 1.5 becomes 2.0.
dipole_rounded = np.around(chem_table_new["tot"].astype(float), decimals=0)

# Merge every category that is not below 8 into the single 8.0 bucket.
# The capped Series is assigned explicitly: the previous
# `chem_table_new.loc[:, "dipole_cat"].where(..., inplace=True)` mutated the
# temporary object returned by `.loc`, which does not write back to the frame
# when pandas Copy-on-Write is active.
chem_table_new["dipole_cat"] = dipole_rounded.where(dipole_rounded < 8, 8.0)
chem_table_new.head(20)

Unnamed: 0,Molecule,name,x,y,z,tot,squib,commment,dipole_cat
1,H2,Hydrogen diatomic,0.0,0.0,0.0,0.0,NSRDS-NBS10,-1,0.0
2,D2,Deuterium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1,0.0
3,LiH,Lithium Hydride,-1.0,-1.0,-1.0,5.88,NSRDS-NBS10,-1,6.0
4,Li2,Lithium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1,0.0
5,Be2,Beryllium diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1,0.0
6,BH,Boron monohydride,-1.0,-1.0,-1.0,1.27,1969Tho/Dal:1155,+- 0.21,1.0
7,C2,Carbon diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1,0.0
8,CN,Cyano radical,0.0,0.0,-1.45,1.45,1968Tho/Dal:2815,-1,1.0
9,N2,Nitrogen diatomic,-1.0,-1.0,-1.0,0.0,NSRDS-NBS10,-1,0.0
10,OH,Hydroxyl radical,-1.0,-1.0,-1.0,1.66,NSRDS-NBS10,-1,2.0


最後我們看一下各分類之資料共有幾筆：

In [8]:
# Count the rows in each dipole_cat bucket, then list them in bucket order.
category_counts = chem_table_new["dipole_cat"].value_counts()
category_counts.sort_index()

-1.0      4
 0.0    118
 1.0    121
 2.0    121
 3.0     36
 4.0     25
 5.0      4
 6.0      5
 7.0      3
 8.0      6
Name: dipole_cat, dtype: int64

如上列可得知，大部分分子之 total dipole moment 集中於 0～2 debye 之間，且分布有向右拖尾的狀況；另外 -1.0 類別應為先前以 -1 填補之缺值，並非實際的 dipole moment，解讀時需排除。

未來若要以此資料進行訓練，可能還需後續處理。