In [None]:
import pandas as pd
import numpy as np
import os

# Base directory used to locate the input data. Note that '__file__' is a
# quoted string here, not the module attribute: realpath() resolves it against
# the current working directory, so this is the directory the kernel runs in.
file_root_dir = os.path.dirname(os.path.realpath('__file__'))

# Build the path to the workbook inside the 'data' sub-directory
data_path = os.path.join(file_root_dir, 'data', 'Ecological risk assessment.xlsx')
# Read the Excel file, using the 'No.' column as the row index
data_file = pd.read_excel(data_path, index_col='No.')
print(data_file)

# Unique class labels, in order of first appearance.
# Rows whose 'Class' cell is empty are left out: a NaN label never compares
# equal to itself, so filtering the table by it would select zero rows.
class_column = data_file['Class']
n_unlabelled = int(class_column.isna().sum())
if n_unlabelled:
    print(f"Ignoring {n_unlabelled} row(s) without a 'Class' value")
class_ = class_column.dropna().unique().tolist()
print(class_)

            CAS                 Pollutants_name                Class  \
No.                                                                    
1     7664-41-7                Ammonia nitrogen  Inorganic compounds   
2    14797-55-8                Nitrate nitrogen  Inorganic compounds   
3    14797-65-0                         Nitrite  Inorganic compounds   
4    14808-79-8                         Sulfate  Inorganic compounds   
5    18496-25-8                         Sulfite  Inorganic compounds   
..          ...                             ...                  ...   
273    375-73-5   Perfluorobutane sulfonic acid                PFSAs   
274    355-46-4   Perfluorohexane sulfonic acid                PFSAs   
275    375-92-8  Perfluoroheptane sulfonic acid                PFSAs   
276   1763-23-1   Perfluorooctane sulfonic acid                PFSAs   
277    335-77-3   Perfluorodecane sulfonic acid                PFSAs   

     方差（σ）\nLOG  HU=MEC/x\n计算  
No.                            

In [None]:
# Seeded generator so that repeated runs reproduce the same random draws
rng = np.random.default_rng(42)

# Number of random subsamples per class, and the range from which the fraction
# of a class's rows used in each subsample is drawn
N = 10000
lower = 0.5
upper = 1.0

# One array of N results per class, appended in the order of `class_`
hu_set = []
sigma_avg_set = []

# Process every class independently
for cls in class_:
    # Rows belonging to the current class
    data_file_filter = data_file[data_file['Class'] == cls]
    # Values of the 'HU=MEC/x' column for this class
    hu = data_file_filter.loc[:, 'HU=MEC/x\n计算'].values
    # Values of the σ column for this class
    sigma = data_file_filter.loc[:, '方差（σ）\nLOG'].values

    # Fail with a clear message instead of an obscure error from rng.choice
    if hu.size == 0:
        raise ValueError(f"No rows found for class {cls!r}; cannot subsample an empty class")

    # Random fraction of the class to use in each of the N draws
    proportions = rng.uniform(lower, upper, size=N)
    # Subsample size per draw: the fraction times the class size, truncated to
    # an integer and never below 1. Because of the truncation a draw normally
    # uses fewer rows than the whole class, even with upper = 1.0.
    ks = np.maximum(1, (proportions * hu.size).astype(int))
    # Per-draw sum of the selected HU values
    sums = np.empty(N, dtype=hu.dtype)
    # Per-draw mean of the selected σ values. A mean is fractional even when
    # the column holds integers, so promote to at least float64 rather than
    # reusing the column dtype, which would truncate the stored means.
    sigmas_avg = np.empty(N, dtype=np.result_type(sigma.dtype, np.float64))

    # Perform the N random draws
    for i, k in enumerate(ks):
        # Pick k distinct row positions within the class
        indices = rng.choice(hu.size, size=k, replace=False)
        # Sum of HU over the selected rows
        sums[i] = hu[indices].sum()
        # Mean of σ over the same selected rows
        sigmas_avg[i] = sigma[indices].mean()

    # Keep this class's results
    hu_set.append(sums)
    sigma_avg_set.append(sigmas_avg)

# Transpose so that rows are draws and columns are classes
hu_set = np.array(hu_set).transpose()
sigma_avg_set = np.array(sigma_avg_set).transpose()

# Table of HU sums: rows labelled "1", "2", ... and one column per class
hu_pd = pd.DataFrame(
    data=hu_set,
    index=[str(i + 1) for i in range(hu_set.shape[0])],
    columns=[f"{cls}_HU" for cls in class_]
)

# Table of σ means with the same row labels and one column per class
sigma_avg_pd = pd.DataFrame(
    data=sigma_avg_set,
    index=[str(i + 1) for i in range(sigma_avg_set.shape[0])],
    columns=[f"{cls}_σ_avg" for cls in class_]
)

# Place the two tables side by side
result_pd = pd.concat([hu_pd, sigma_avg_pd], axis=1)

# Write the combined table to an Excel file in the working directory
result_pd.to_excel('./res_2.xlsx')

  result_pd.to_excel('./res_2.xlsx')
