<a href="https://colab.research.google.com/github/sjmjdddsq2030/B1PB-Worker-Panel-fixed/blob/main/321%E6%B5%8B%E8%AF%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [226]:
pip install pytest pytest-cov  # 安装pytest及测试覆盖率插件



In [227]:
import unittest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

def preprocess_data(raw_data):
    # 步骤1：填充缺失值
    imputer = SimpleImputer(strategy='mean')
    data_imputed = imputer.fit_transform(raw_data)
    # 步骤2：标准化
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_imputed)
    return data_scaled

class TestDataPreprocessing(unittest.TestCase):
    def test_missing_value_handling(self):
        """独立测试缺失值填充（不含标准化）"""
        raw_data = [[1, np.nan], [np.nan, 3]]
        imputer = SimpleImputer(strategy='mean')
        data_imputed = imputer.fit_transform(raw_data)
        # 验证填充后的值
        self.assertTrue(np.array_equal(data_imputed, [[1, 3], [1, 3]]))

    def test_feature_scaling(self):
        """测试标准化（输入无缺失值）"""
        raw_data = [[10], [20], [30]]
        scaler = StandardScaler()
        scaled = scaler.fit_transform(raw_data)
        self.assertAlmostEqual(scaled.mean(), 0, delta=0.01)
        self.assertAlmostEqual(scaled.std(), 1, delta=0.01)

if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(TestDataPreprocessing)
    unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.006s

OK


In [249]:
import unittest
from unittest.mock import patch
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np

# 虚拟数据集函数
def load_dataset():
    X = np.array([[1,2], [3,4], [5,6], [7,8]])
    y = np.array([0, 1, 0, 1])
    return X, y

class TestModelTraining(unittest.TestCase):
    @patch('xgboost.XGBClassifier.fit')
    def test_model_initialization(self, mock_fit):
        """验证模型参数初始化"""
        model = XGBClassifier(max_depth=3, n_estimators=100)
        model.fit([[1,2], [3,4]], [0,1])
        self.assertEqual(model.max_depth, 3)

    def test_train_test_split(self):
        """数据划分完整性验证"""
        X, y = load_dataset()
        X_train, X_test, _, _ = train_test_split(X, y, test_size=0.25)
        self.assertEqual(len(X_train) + len(X_test), len(X))

# 运行所有测试
if __name__ == '__main__':
    # Jupyter 环境下需配置 argv 和 exit=False
    unittest.main(argv=['ignored', '-v'], exit=False)

test_feature_scaling (__main__.TestDataPreprocessing.test_feature_scaling)
测试标准化过程 ... ok
test_missing_value_handling (__main__.TestDataPreprocessing.test_missing_value_handling)
测试缺失值填充逻辑 ... ok
test_something (__main__.TestModel.test_something) ... ok
test_model_initialization (__main__.TestModelTraining.test_model_initialization)
验证模型参数初始化 ... ok
test_train_test_split (__main__.TestModelTraining.test_train_test_split)
数据划分完整性验证 ... ok
test_model_prediction (__main__.TestPredictionLogic.test_model_prediction)
验证输入4个特征 → Pipeline处理后为2个 → 预测成功 ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.023s

OK


被选中的特征索引: [1 3]


In [231]:
from sklearn.linear_model import LogisticRegression
import joblib

# 生成示例模型并保存
model = LogisticRegression()
model.fit([[1,2], [3,4]], [0,1])  # 使用虚拟数据训练
joblib.dump(model, 'pretrained_model.pkl')

['pretrained_model.pkl']

In [232]:
import os
print(os.path.exists('pretrained_model.pkl'))  # 应输出 True

True


In [233]:
loaded_model = joblib.load('pretrained_model.pkl')
print(loaded_model.predict([[1, 2]]))  # 应输出 [0]

[0]


In [234]:
# 检查包是否已安装
!pip list | grep "pytest\|joblib"

# 检查模型文件是否存在
import os
print("模型文件存在:", os.path.exists('pretrained_model.pkl'))

joblib                             1.4.2
pytest                             8.3.5
pytest-cov                         6.0.0
模型文件存在: True


In [235]:
import os

# 检查Colab本地模型文件
print("Colab本地模型存在:", os.path.exists("/content/pretrained_model.pkl"))

# 检查Drive中的模型文件
print("Drive模型存在:", os.path.exists("/content/drive/MyDrive/pretrained_model.pkl"))

# 检查测试脚本（以321测试.ipynb为例）
print("测试脚本存在:", os.path.exists("/content/drive/MyDrive/Colab Notebooks/321测试.ipynb"))

Colab本地模型存在: True
Drive模型存在: True
测试脚本存在: True


In [236]:
import pathlib
test_path = pathlib.Path('tests/test_model.py').absolute()
print("测试文件绝对路径:", test_path)

测试文件绝对路径: /content/tests/test_model.py


In [237]:
!pytest -v "/content/drive/MyDrive/Colab Notebooks/test_model.py"

platform linux -- Python 3.11.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: cov-6.0.0, anyio-4.9.0, typeguard-4.4.2, langsmith-0.3.15
[1mcollecting ... [0m[1mcollected 1 item                                                                                   [0m

drive/MyDrive/Colab Notebooks/test_model.py::TestModel::test_sample [32mPASSED[0m[32m                   [100%][0m



In [238]:
!python "/content/drive/MyDrive/Colab Notebooks/test_model.py"

.
----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


In [239]:
# 示例：训练时使用特征选择器（假设保留2个特征）
from sklearn.feature_selection import SelectKBest
import numpy as np  # 导入numpy

# 定义示例数据
X_train_raw = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
y_train = np.array([0, 1, 0])

print("Before feature selection: X_train_raw shape =", X_train_raw.shape)  # Added print statement

selector = SelectKBest(k=2)
X_train = selector.fit_transform(X_train_raw, y_train)

print("After feature selection: X_train shape =", X_train.shape)  # Added print statement

# 测试时需应用同一选择器
# 假设 X_test_raw 也已定义
X_test_raw = np.array([[10, 11, 12], [13, 14, 15]])
X_test = selector.transform(X_test_raw)  # 必须用transform，非fit_transform

print("Transformed X_test shape =", X_test.shape)  # Added print statement

Before feature selection: X_train_raw shape = (3, 3)
After feature selection: X_train shape = (3, 2)
Transformed X_test shape = (2, 2)


In [240]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    StandardScaler(),  # 预处理步骤
    SelectKBest(k=2),  # 特征选择步骤
    LogisticRegression()  # 模型
)

pipeline.fit(X_train_raw, y_train)  # 训练时自动处理数据
pipeline.predict(X_test_raw)  # 预测时自动复用相同预处理

array([0, 0])

In [241]:
# ✅ 正确加载并使用
loaded_pipeline = joblib.load('full_pipeline.pkl')

# 输入原始数据（4个特征），Pipeline会自动处理成2个特征
X_test_raw = [[1.5, 0.7, 10, 8]]  # 4个特征
result = loaded_pipeline.predict(X_test_raw)
print(result)  # 输出预测结果

[1]


In [242]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('selector', SelectKBest(k=2)),
    ('model', LogisticRegression())
])
pipeline.fit([[1.2, 0.8, 12, 5], [2.1, 1.5, 10, 8]], [0, 1])
joblib.dump(pipeline, 'full_pipeline.pkl')

  msw = sswn / float(dfwn)


['full_pipeline.pkl']

In [243]:
!python -m pytest /content/drive/MyDrive/Colab\ Notebooks/test_model.py -vv  # 替换为实际测试文件路径

platform linux -- Python 3.11.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: cov-6.0.0, anyio-4.9.0, typeguard-4.4.2, langsmith-0.3.15
[1mcollecting ... [0m[1mcollected 1 item                                                                                   [0m

drive/MyDrive/Colab Notebooks/test_model.py::TestModel::test_sample [32mPASSED[0m[32m                   [100%][0m



In [244]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
import joblib

# 定义Pipeline
pipeline = Pipeline([
    ('variance_filter', VarianceThreshold(threshold=0)),  # 过滤零方差特征
    ('scaler', StandardScaler()),                         # 标准化
    ('selector', SelectKBest(k=2)),                       # 选择2个特征
    ('model', LogisticRegression())                       # 模型
])

# 示例训练数据（4个特征）
X_train_raw = [[1.2, 0.8, 12, 5], [2.1, 1.5, 10, 8], [3.0, 2.0, 8, 6]]
y_train = [0, 1, 0]

# 训练并保存Pipeline
pipeline.fit(X_train_raw, y_train)
joblib.dump(pipeline, 'full_pipeline.pkl')

# 验证训练后特征数
selector = pipeline.named_steps['selector']
print("训练时选择的特征数:", sum(selector.get_support()))  # 应为2

训练时选择的特征数: 2


In [245]:
# 在Colab中运行
!python -m pytest test_model.py -vv

# 预期输出
# 在Colab中运行
!python -m pytest test_model.py -vv

platform linux -- Python 3.11.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: cov-6.0.0, anyio-4.9.0, typeguard-4.4.2, langsmith-0.3.15
[1mcollecting ... [0m[1mcollected 1 item                                                                                   [0m

test_model.py::TestModel::test_sample [32mPASSED[0m[32m                                                 [100%][0m

platform linux -- Python 3.11.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: cov-6.0.0, anyio-4.9.0, typeguard-4.4.2, langsmith-0.3.15
collected 1 item                                                                                   [0m

test_model.py::TestModel::test_sample [32mPASSED[0m[32m                                                 [100%][0m



In [168]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.linear_model import LogisticRegression
import joblib

# 定义Pipeline
pipeline = Pipeline([
    ('variance_filter', VarianceThreshold(threshold=0)),  # 过滤零方差特征
    ('scaler', StandardScaler()),                         # 标准化
    ('selector', SelectKBest(k=2)),                       # 选择2个特征
    ('model', LogisticRegression())                       # 模型
])

# 示例训练数据（4个特征）
X_train_raw = [[1.2, 0.8, 12, 5], [2.1, 1.5, 10, 8], [3.0, 2.0, 8, 6]]
y_train = [0, 1, 0]

# 训练并保存Pipeline
pipeline.fit(X_train_raw, y_train)
joblib.dump(pipeline, 'full_pipeline.pkl')

# 验证输出维度
test_sample = [[1.2, 0.8, 12, 5]]

# Instead of pipeline.transform, use pipeline.predict_proba or pipeline.decision_function
# if you want to see the intermediate output after the 'selector' step:

# Option 1: Get probabilities (for classification)
probabilities = pipeline.predict_proba(test_sample)
print("模型输出概率:", probabilities.shape)  # Output shape will depend on the number of classes

# Option 2: Get decision function values (for classification)
decision_values = pipeline.decision_function(test_sample)
print("模型输出决策值:", decision_values.shape)  # Output shape will be (1,) for binary classification

# Option 3: Access the transformed data directly (less common)
transformed_data = pipeline[:-1].transform(test_sample) # Access all steps except the last one
print("模型输入维度:", transformed_data.shape)  # This should print (1, 2)

模型输出概率: (1, 2)
模型输出决策值: (1,)
模型输入维度: (1, 2)


In [174]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
import joblib

# 定义Pipeline
pipeline = Pipeline([
    ('variance_filter', VarianceThreshold(threshold=0)),  # 过滤零方差特征
    ('scaler', StandardScaler()),                         # 标准化
    ('selector', SelectKBest(k=2)),                       # 选择2个特征
    ('model', LogisticRegression())                       # 模型
])

# 示例训练数据（4个特征）
X_train_raw = [[1.2, 0.8, 12, 5], [2.1, 1.5, 10, 8], [3.0, 2.0, 8, 6]]
y_train = [0, 1, 0]

# 训练并保存Pipeline
pipeline.fit(X_train_raw, y_train)
joblib.dump(pipeline, 'full_pipeline.pkl')

# 验证训练后特征数
selector = pipeline.named_steps['selector']
print("训练时选择的特征数:", sum(selector.get_support()))  # 应为2

训练时选择的特征数: 2


In [247]:
import unittest
import joblib
import numpy as np

class TestPredictionLogic(unittest.TestCase):
    def setUp(self):
        """加载Pipeline并验证特征处理流程"""
        try:
            self.pipeline = joblib.load('full_pipeline.pkl')
            # 检查关键步骤是否存在
            self.assertIn('selector', self.pipeline.named_steps, "Pipeline缺少selector步骤")
            # 打印被选中的特征索引
            selector = self.pipeline.named_steps['selector']
            selected_indices = np.where(selector.get_support())[0]
            print("被选中的特征索引:", selected_indices)  # 应为类似 [0, 2]
        except Exception as e:
            self.fail(f"Pipeline加载失败: {str(e)}")

    def test_model_prediction(self):
        """验证输入4个特征 → Pipeline处理后为2个 → 预测成功"""
        test_input = [[1.2, 0.8, 12, 5]]  # 输入4个特征
        result = self.pipeline.predict(test_input)
        self.assertIsInstance(result, np.ndarray)
        # Check for shape and data type instead of specific values
        self.assertEqual(result.shape, (1,)) # Assuming you expect a single prediction
        self.assertEqual(result.dtype, int) # or np.int64 if necessary
        # optionally, if you know the possible range of your prediction, add assertions:
        # self.assertTrue(np.all(result >= 0))
        # self.assertTrue(np.all(result < num_classes)) # Where num_classes is the number of classes

if __name__ == '__main__':
    unittest.main(argv=['ignored', '-v'], exit=False)

test_feature_scaling (__main__.TestDataPreprocessing.test_feature_scaling)
测试标准化过程 ... ok
test_missing_value_handling (__main__.TestDataPreprocessing.test_missing_value_handling)
测试缺失值填充逻辑 ... ok
test_something (__main__.TestModel.test_something) ... ok
test_model_initialization (__main__.TestModelTraining.test_model_initialization)
验证模型参数初始化 ... ok
test_train_test_split (__main__.TestModelTraining.test_train_test_split)
数据划分完整性验证 ... ok
test_model_prediction (__main__.TestPredictionLogic.test_model_prediction)
验证输入4个特征 → Pipeline处理后为2个 → 预测成功 ... ok

----------------------------------------------------------------------
Ran 6 tests in 0.021s

OK


被选中的特征索引: [1 3]
