# 1. 加载数据

In [5]:
import numpy as np

# Load the training-set feature matrix from its .npy file
X_train = np.load('../data_set/X_train.npy')

X_train  # display the training-set data

array([[57.,  5.,  0., ...,  0.,  0.,  0.],
       [71.,  2.,  2., ...,  0.,  0.,  1.],
       [64.,  2.,  0., ...,  0.,  0.,  0.],
       ...,
       [46.,  4.,  0., ...,  1.,  0.,  0.],
       [70.,  2.,  1., ...,  0.,  0.,  0.],
       [60.,  2.,  0., ...,  0.,  0.,  0.]])

In [6]:
X_train.shape  # display the training-set dimensions

(1000, 111)

# 2. 统计缺失值

In [7]:
# Fraction of NaN entries in each feature (one value per column)
missing_ratio = np.isnan(X_train).mean(axis=0)

missing_ratio


array([0.005, 0.   , 0.002, 0.067, 0.046, 0.034, 0.963, 0.005, 0.004,
       0.153, 0.034, 0.016, 0.016, 0.016, 0.016, 0.016, 0.016, 0.016,
       0.012, 0.012, 0.012, 0.012, 0.012, 0.012, 0.012, 0.002, 0.002,
       0.002, 0.003, 0.003, 0.003, 0.003, 0.003, 0.644, 0.644, 0.161,
       0.161, 0.007, 0.008, 0.009, 0.007, 0.007, 0.007, 0.051, 0.05 ,
       0.048, 0.043, 0.   , 0.096, 0.096, 0.096, 0.096, 0.096, 0.096,
       0.069, 0.069, 0.069, 0.069, 0.069, 0.069, 0.069, 0.069, 0.069,
       0.068, 0.068, 0.068, 0.068, 0.068, 0.068, 0.068, 0.068, 0.068,
       0.068, 0.068, 0.006, 0.006, 0.006, 0.006, 0.006, 0.006, 0.006,
       0.201, 0.201, 0.204, 0.204, 0.174, 0.175, 0.998, 0.074, 0.115,
       0.08 , 0.009, 0.069, 0.08 , 0.371, 0.392, 0.382, 0.003, 0.003,
       0.068, 0.082, 0.007, 0.069, 0.081, 0.005, 0.004, 0.005, 0.008,
       0.008, 0.007, 0.007])

先筛选出缺失率高于阈值（90%）的特征并将其删除，然后输出清理后数据集的大小。

In [15]:
# Features whose missing fraction exceeds this cutoff (90%) are discarded
high_missing_ratio_threshold = 0.9

# Column indices of the features to remove
features_to_drop_high_missing = np.flatnonzero(missing_ratio > high_missing_ratio_threshold)

# Feature matrix without the high-missing-rate columns
X_dropped_high_missing = np.delete(X_train, features_to_drop_high_missing, axis=1)

# Persist the reduced matrix so later steps can reload it
np.save('../data_set/processed_data_set/X_dropped_90.npy', X_dropped_high_missing)

features_to_drop_high_missing, X_dropped_high_missing.shape


(array([ 6, 87], dtype=int64), (1000, 109))

# 3. 特征类型分类
将特征划分为离散型和连续型两类。

In [16]:
# Reload the matrix that was saved after the high-missing-rate features were removed
X_train = np.load('../data_set/processed_data_set/X_dropped_90.npy')

# Count the distinct non-NaN values in every column.
# The counts are taken from the array loaded above, so this cell does not depend on
# X_dropped_high_missing still being present in the kernel from an earlier cell.
unique_counts = np.apply_along_axis(lambda col: len(np.unique(col[~np.isnan(col)])), 0, X_train)

# Infer each feature's type from the share of distinct values among all samples:
# a share below the threshold (5%) marks the feature as discrete, otherwise continuous
threshold_ratio = 0.05
unique_ratio = unique_counts / X_train.shape[0]
discrete_feature_indices = np.where(unique_ratio < threshold_ratio)[0]
continuous_feature_indices = np.where(unique_ratio >= threshold_ratio)[0]

discrete_feature_indices, continuous_feature_indices  # display the discrete and continuous feature indices


(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
         69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
         83,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
        101, 102, 103, 104, 105, 106, 107, 108], dtype=int64),
 array([ 0, 33, 34, 35, 82, 84, 85, 86, 87, 88], dtype=int64))

离散型特征：
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  36,  37,  38,  39,  40,  41,  42,
        43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
        69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
        83,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108], dtype=int64)
连续型特征：
array([ 0, 33, 34, 35, 82, 84, 85, 86, 87, 88], dtype=int64)


## 3.1 将连续型特征划分为整数型和浮点型

In [17]:
# Index lists for continuous features, split by whether their values are whole numbers
continuous_integer_indices = []
continuous_float_indices = []

# A feature is integer-valued when every non-NaN entry has no fractional part
for index in continuous_feature_indices:
    column = X_train[:, index]
    observed = column[~np.isnan(column)]
    is_whole = np.all(np.mod(observed, 1) == 0)
    target = continuous_integer_indices if is_whole else continuous_float_indices
    target.append(index)

continuous_integer_indices, continuous_float_indices

([0, 33, 34, 35], [82, 84, 85, 86, 87, 88])

整数型连续特征
[0, 33, 34, 35]
浮点型连续特征
[82, 84, 85, 86, 87, 88]

离散型特征暂时可能不需要进一步区分整数型和浮点型，因此下面的代码先保持注释状态。

In [ ]:
# Disabled for now: splitting the discrete features into integer/float may not be needed yet.
# # Initialise lists that hold the indices of integer-valued and float-valued discrete features
# discrete_integer_indices = []
# discrete_float_indices = []
# 
# # Walk over every discrete feature and check whether it is integer-valued or float-valued
# for index in discrete_feature_indices:
#     if np.all(np.mod(X_train[:, index][~np.isnan(X_train[:, index])], 1) == 0):
#         discrete_integer_indices.append(index)
#     else:
#         discrete_float_indices.append(index)
# 
# discrete_integer_indices, discrete_float_indices

按照这个分类，我们将对缺失值采取不同的填充策略：

- 对于**离散型特征**，使用众数进行填充。
- 对于**连续浮点型特征**，使用均值进行填充。
- 对于**连续整数型特征**，同样使用均值进行填充，再用 `np.round` 对结果取整，以确保填充值保持为整数。

In [18]:
from sklearn.impute import SimpleImputer

# Discrete features: fill NaN with each column's most frequent value
imputer_mode = SimpleImputer(strategy='most_frequent')
discrete_filled = imputer_mode.fit_transform(X_train[:, discrete_feature_indices])
X_train[:, discrete_feature_indices] = discrete_filled

# Continuous float features: fill NaN with each column's mean
imputer_mean = SimpleImputer(strategy='mean')
float_filled = imputer_mean.fit_transform(X_train[:, continuous_float_indices])
X_train[:, continuous_float_indices] = float_filled

# Continuous integer features: fill with the mean, then round so the values stay whole.
# imputer_mean is fitted again here, so after this line it holds the statistics of
# the integer columns rather than those of the float columns.
X_continuous_integer = imputer_mean.fit_transform(X_train[:, continuous_integer_indices])
X_train[:, continuous_integer_indices] = np.round(X_continuous_integer)

# Total number of NaN entries remaining after imputation
missing_after_imputation = np.sum(np.isnan(X_train))

missing_after_imputation


0

In [19]:
X_train  # display the processed data

array([[57.,  5.,  0., ...,  0.,  0.,  0.],
       [71.,  2.,  2., ...,  0.,  0.,  1.],
       [64.,  2.,  0., ...,  0.,  0.,  0.],
       ...,
       [46.,  4.,  0., ...,  1.,  0.,  0.],
       [70.,  2.,  1., ...,  0.,  0.,  0.],
       [60.,  2.,  0., ...,  0.,  0.,  0.]])