## 注意：大量代码是冗余的

In [58]:
from datetime import timedelta
from sklearn.ensemble import IsolationForest
import pandas as pd
import os
import xgboost
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

kernel_log_data_path = r'memory_sample_kernel_log_round1_a_train.csv'
failure_tag_data_path = r'memory_sample_failure_tag_round1_a_train.csv'
PARENT_FOLDER = 'data'

# 计算每个agg_time区间的和
def etl(path, agg_time):
    data = pd.read_csv(os.path.join(PARENT_FOLDER, path))
    # 检查缺失值
    if data.isnull().sum().sum() > 0:
        data = data.fillna(method='ffill')  # 填充缺失值
    # 降低时间精度 向上取整
    data['collect_time'] = pd.to_datetime(data['collect_time']).dt.ceil(agg_time)
    group_data = data.groupby(['serial_number', 'collect_time'], as_index=False).agg('sum')
    return group_data

# 设置聚合时间粒度
AGG_VALUE = 5
AGG_UNIT = 'min'
AGG_TIME = str(AGG_VALUE) + AGG_UNIT

# 示例仅使用了kernel数据
group_min = etl(kernel_log_data_path, AGG_TIME)
failure_tag = pd.read_csv(os.path.join(PARENT_FOLDER, failure_tag_data_path))
failure_tag['failure_time'] = pd.to_datetime(failure_tag['failure_time'])

# 为数据打标
merged_data = pd.merge(group_min, failure_tag[['serial_number', 'failure_time']], how='left', on=['serial_number'])
merged_data['failure_tag'] = (merged_data['failure_time'].notnull()) & ((merged_data['failure_time']
                                                                         - merged_data['collect_time']).dt.seconds <= AGG_VALUE * 60)
merged_data['failure_tag'] = merged_data['failure_tag'] + 0
feature_data = merged_data.drop(['serial_number', 'collect_time', 'manufacturer', 'vendor', 'failure_time'], axis=1)

# 计算正负样本的数量
num_positive_samples = len(feature_data[feature_data['failure_tag'] == 1])
num_negative_samples = len(feature_data[feature_data['failure_tag'] == 0])

# 根据正样本的数量决定上采样倍数
upsampling_factor = num_positive_samples // num_negative_samples

# 负样本上采样
sampled_negative_data = feature_data[feature_data['failure_tag'] == 0].sample(n=upsampling_factor, replace=True)

# 合并正负样本
upsampled_data = pd.concat([sampled_negative_data, feature_data[feature_data['failure_tag'] == 1]])

# 重新采样后的数据集
sample = upsampled_data

# 将负样本的类别标签从1改为0
upsampled_data['failure_tag'] = upsampled_data['failure_tag'].replace({1: 0})

# 负样本下采样
sample_0 = feature_data[feature_data['failure_tag'] == 0].sample(frac=0.02)
sample = pd.concat([sample_0, feature_data[feature_data['failure_tag'] == 1]])

# 计算异常值得分，将异常值得分作为特征，注意异常值检测特征的位置
clf = IsolationForest(contamination=0.05)  # 设置异常值比例
merged_data['outlier_score'] = clf.fit_predict(feature_data.iloc[:, :-1])

# 时间特征：提取更多时间特征，如小时、分钟、周几等。
merged_data['hour'] = merged_data['collect_time'].dt.hour
merged_data['day_of_week'] = merged_data['collect_time'].dt.dayofweek

# 将异常值得分作为特征
feature_data['outlier_score'] = merged_data['outlier_score']

# 在计算特征后插入异常值检测特征工程
clf = IsolationForest(contamination=0.05)  # 设置异常值比例
merged_data['outlier_score'] = clf.fit_predict(feature_data.iloc[:, :-1])

# 将异常值得分作为特征
feature_data['outlier_score'] = merged_data['outlier_score']

# 过去故障次数
merged_data['past_failure_count'] = merged_data.groupby('serial_number')['failure_tag'].cumsum().shift()

# 负样本上采样   #添加一个网格搜索代码
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
grid_search.fit(sample.iloc[:, :-1], sample['failure_tag'])
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 使用最佳参数训练模型
clf = XGBClassifier(**best_params)
clf.fit(sample.iloc[:, :-1], sample['failure_tag'])

# 测试数据
group_data_test = etl(r'memory_sample_kernel_log_k12_round1_a_test.csv', AGG_TIME)
group_min_sn_test = pd.DataFrame(group_data_test[['serial_number', 'collect_time']])
group_min_test = group_data_test.drop(['serial_number', 'collect_time', 'manufacturer', 'vendor'], axis=1)

# 模型预测
res = clf.predict(group_min_test)
group_min_sn_test['predict'] = res

# 保存结果
group_min_sn_test = group_min_sn_test[group_min_sn_test['predict'] == 1]
group_min_sn_res = group_min_sn_test.drop('predict', axis=1)

output_path = os.path.join('./', 'memory_predit_res_svm.csv')
print(group_min_sn_res)

# 由于预测pti对分数影响不大，先直接末尾增加pti为1
pti = 5
with open(output_path, 'w') as result_fp:
    for _, _row in group_min_sn_res.iterrows():
        result_fp.write("{},{},{}\n".format(_row.serial_number, _row.collect_time, pti))

Best parameters found:  {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300}
      serial_number        collect_time
761    server_10807 2019-06-03 08:05:00
903    server_10961 2019-06-09 00:10:00
905    server_10961 2019-06-09 00:20:00
906    server_10961 2019-06-09 00:25:00
907    server_10961 2019-06-09 00:30:00
...             ...                 ...
54688   server_9924 2019-06-01 15:10:00
54689   server_9924 2019-06-04 14:30:00
54694   server_9924 2019-06-10 13:05:00
54695   server_9924 2019-06-11 12:50:00
54697   server_9924 2019-06-13 12:25:00

[520 rows x 2 columns]


## 分段说明

In [None]:
from datetime import timedelta
from sklearn.ensemble import IsolationForest
import pandas as pd
import os
import xgboost
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

kernel_log_data_path = r'memory_sample_kernel_log_round1_a_train.csv'
failure_tag_data_path = r'memory_sample_failure_tag_round1_a_train.csv'
PARENT_FOLDER = 'data'

**NOTE**：加入前向填充(这应该是整个代码文件中唯一一个有效的，你也可以尝试其他代码)

**一定要先进行前向填充再进行数据聚合**

In [None]:
# 计算每个agg_time区间的和
def etl(path, agg_time):
    data = pd.read_csv(os.path.join(PARENT_FOLDER, path))
    # 检查缺失值
    if data.isnull().sum().sum() > 0:
        data = data.fillna(method='ffill')  # 填充缺失值
    # 降低时间精度 向上取整
    data['collect_time'] = pd.to_datetime(data['collect_time']).dt.ceil(agg_time)
    group_data = data.groupby(['serial_number', 'collect_time'], as_index=False).agg('sum')
    return group_data

# 设置聚合时间粒度
AGG_VALUE = 5
AGG_UNIT = 'min'
AGG_TIME = str(AGG_VALUE) + AGG_UNIT

In [None]:
# 示例仅使用了kernel数据
group_min = etl(kernel_log_data_path, AGG_TIME)
failure_tag = pd.read_csv(os.path.join(PARENT_FOLDER, failure_tag_data_path))
failure_tag['failure_time'] = pd.to_datetime(failure_tag['failure_time'])

# 为数据打标
merged_data = pd.merge(group_min, failure_tag[['serial_number', 'failure_time']], how='left', on=['serial_number'])
merged_data['failure_tag'] = (merged_data['failure_time'].notnull()) & ((merged_data['failure_time']
                                                                         - merged_data['collect_time']).dt.seconds <= AGG_VALUE * 60)
merged_data['failure_tag'] = merged_data['failure_tag'] + 0
feature_data = merged_data.drop(['serial_number', 'collect_time', 'manufacturer', 'vendor', 'failure_time'], axis=1)

**NOTE：** 下面单元格中的采样方式并没有被使用。因为他被后面的sample变量取代了

In [None]:
# 计算正负样本的数量
num_positive_samples = len(feature_data[feature_data['failure_tag'] == 1])
num_negative_samples = len(feature_data[feature_data['failure_tag'] == 0])

# 根据正样本的数量决定上采样倍数
upsampling_factor = num_positive_samples // num_negative_samples

# 负样本上采样
sampled_negative_data = feature_data[feature_data['failure_tag'] == 0].sample(n=upsampling_factor, replace=True)

# 合并正负样本
upsampled_data = pd.concat([sampled_negative_data, feature_data[feature_data['failure_tag'] == 1]])

# 重新采样后的数据集
sample = upsampled_data

# 将负样本的类别标签从1改为0
upsampled_data['failure_tag'] = upsampled_data['failure_tag'].replace({1: 0})


**NOTE：** 下采样，比例为类别0的0.02.这里的sample最后被使用

In [None]:
# 负样本下采样
sample_0 = feature_data[feature_data['failure_tag'] == 0].sample(frac=0.02)
sample = pd.concat([sample_0, feature_data[feature_data['failure_tag'] == 1]])

**NOTE:** 下面所有构建的特征都没有使用，因为最后训练用的数据集使用的是sample，而这些特征都没有加入到sample中

In [None]:
# 计算异常值得分，将异常值得分作为特征，注意异常值检测特征的位置
clf = IsolationForest(contamination=0.05)  # 设置异常值比例
merged_data['outlier_score'] = clf.fit_predict(feature_data.iloc[:, :-1])

# 时间特征：提取更多时间特征，如小时、分钟、周几等。
merged_data['hour'] = merged_data['collect_time'].dt.hour
merged_data['day_of_week'] = merged_data['collect_time'].dt.dayofweek

# 将异常值得分作为特征
feature_data['outlier_score'] = merged_data['outlier_score']

# 在计算特征后插入异常值检测特征工程
clf = IsolationForest(contamination=0.05)  # 设置异常值比例
merged_data['outlier_score'] = clf.fit_predict(feature_data.iloc[:, :-1])

# 将异常值得分作为特征
feature_data['outlier_score'] = merged_data['outlier_score']

# 过去故障次数
merged_data['past_failure_count'] = merged_data.groupby('serial_number')['failure_tag'].cumsum().shift()

**NOTE:** 
- 这里加入了网格搜索，但是基本没用
- 因为本数据集是严重倾斜的，而这里使用的交叉验证却是常规的交叉验证，如果要使用，应该使用分层交叉验证；如果你的采样比例是0.005，使用常规交叉验证或许是有效的
- learning_rate默认为0.01
- 使用分层交叉划分数据集不应该随机划分，因为该数据集包含时间序列
- 尽管没什么用，但不会对最终结果造成太大影响
- 模型未控制随机性，每次运行可能会有不同的结果

In [None]:
# 负样本上采样   #添加一个网格搜索代码
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
grid_search.fit(sample.iloc[:, :-1], sample['failure_tag'])
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# 使用最佳参数训练模型
clf = XGBClassifier(**best_params)
clf.fit(sample.iloc[:, :-1], sample['failure_tag'])

**NOTE:** pti改成了5

In [None]:
# 测试数据
group_data_test = etl(r'memory_sample_kernel_log_k12_round1_a_test.csv', AGG_TIME)
group_min_sn_test = pd.DataFrame(group_data_test[['serial_number', 'collect_time']])
group_min_test = group_data_test.drop(['serial_number', 'collect_time', 'manufacturer', 'vendor'], axis=1)

# 模型预测
res = clf.predict(group_min_test)
group_min_sn_test['predict'] = res

# 保存结果
group_min_sn_test = group_min_sn_test[group_min_sn_test['predict'] == 1]
group_min_sn_res = group_min_sn_test.drop('predict', axis=1)

output_path = os.path.join('./', 'memory_predit_res_svm.csv')
print(group_min_sn_res)

# 由于预测pti对分数影响不大，先直接末尾增加pti为1
pti = 5
with open(output_path, 'w') as result_fp:
    for _, _row in group_min_sn_res.iterrows():
        result_fp.write("{},{},{}\n".format(_row.serial_number, _row.collect_time, pti))

## 特征重要性

In [None]:
import matplotlib.pyplot as plt

# 特征名称
feature_names = sample.columns[:-1]  

# 获取特征重要性并创建 DataFrame
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# 排序特征重要性
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# 绘制特征重要性条形图
plt.figure(figsize=(12, 8))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # 反转Y轴，让重要性最高的特征显示在顶部
plt.show()

In [None]:
feature_importance_df