In [14]:
import numpy as np
import pandas as pd

# Load the tab-separated measurements. Spreadsheet error cells ('#DIV/0!') are
# parsed as missing values at read time; replacing them after parsing would
# leave the affected columns as object dtype (strings), which breaks the
# numeric binning done later.
data = pd.read_csv('../data/切割液数据.csv', sep='\t', na_values=['#DIV/0!'])

# Columns used as model inputs. Commented-out names are candidate columns that
# are currently excluded.
features = [
    '成品电导率',
    '成品PH值',
    '成品浓度',
    '成品浊度',
    '成品温度',
    '成品表面张力',
    '成品COD',
    # '喷淋流量',
    # '在线流量'
    # '柠檬酸',
    # '置换量',
    # '离子液添加量',
    # '原液添加比例',
    # '清液池电导率',
    # '清液池PH',
    # '清液池浓度',
    # '清液池温度',
    # '清液池COD',
]

# Binary target: 1 when the anomaly type is one of the listed categories,
# 0 otherwise (including missing values).
data['异常类型'] = (
    data['异常类型']
    .isin(['断上下线', '断缝', '跳线', '断上线', '断下线'])
    .astype('int64')
)
# Encode the citric-acid setting as a number. NOTE: '柠檬酸' is not listed in
# `features`, so this column is dropped by the selection below.
data['柠檬酸'] = data['柠檬酸'].map({'柠檬酸80g': 80, '柠檬酸100g': 100, '转换阶段': 20, '0': 0})

# Keep only the model columns and the target, drop incomplete rows, and take an
# explicit copy so later cells never operate on a view of the raw frame.
data = data[features + ['异常类型']].dropna(how='any').copy()
data['异常类型'].value_counts()

异常类型
0    3065
1    1062
Name: count, dtype: int64

## 分箱

In [15]:
import pandas as pd
from optbinning import OptimalBinning

# Module-level registry filled by apply_optimal_binning: feature name -> bin table.
bs = {}

# X holds the feature columns, y the target variable.
y = data['异常类型']
X = data.drop(columns=['异常类型'])

def apply_optimal_binning(data, feature, target):
    """Bin one numeric feature with OptimalBinning and return bin midpoints.

    Fits an ``OptimalBinning`` model for ``data[feature]`` against ``target``,
    stores the resulting bin table in the module-level ``bs`` dict, and returns
    the feature with every value replaced by the midpoint of its bin. Nothing
    is displayed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``feature``.
    feature : str
        Name of the numeric column to bin.
    target : array-like
        Target values passed to ``OptimalBinning.fit``.

    Returns
    -------
    pandas.Series
        Indexed like ``data[feature]``; each value is ``(low + high) / 2`` of
        the bin it falls in, or ``-1`` for a missing value.

    Side effects
    ------------
    ``bs[feature]`` is set to a DataFrame with the columns ``'区间'`` (the bin
    written as ``str([low, high])``, or ``'[]'`` for a missing value) and
    ``'分箱后数据'`` (the output of ``OptimalBinning.transform``), one row per
    distinct pair, ordered by bin position.
    """
    import numpy as np  # local import: the function does not rely on another cell

    def _plain(value):
        # Convert NumPy scalars to built-in numbers so that str([low, high])
        # yields the same text regardless of the installed NumPy version.
        return value.item() if hasattr(value, "item") else value

    values = data[feature]

    opt_bin = OptimalBinning(name=feature, dtype="numerical", solver="cp")
    opt_bin.fit(values, target)
    splits = [_plain(s) for s in opt_bin.splits]

    # Bin edges: observed minimum, fitted split points, observed maximum.
    edges = [_plain(values.min())] + splits + [_plain(values.max())]
    interval_pairs = list(zip(edges[:-1], edges[1:]))

    # Locate each value's bin from the split points. side="right" sends a value
    # equal to a split into the upper bin, matching the right-open bins
    # OptimalBinning uses, so the interval label and the transformed value
    # stored in `bs` always describe the same bin.
    missing = values.isna().to_numpy()
    bin_index = np.searchsorted(
        np.asarray(splits, dtype=float),
        values.to_numpy(dtype=float),
        side="right",
    )

    labels = np.array([str(list(pair)) for pair in interval_pairs], dtype=object)
    midpoints = np.array([(low + high) / 2 for low, high in interval_pairs], dtype=float)

    interval_data = pd.Series(
        np.where(missing, str([]), labels[bin_index]), index=values.index
    )

    # Bin table: one row per distinct (interval, transformed value) pair. The
    # helper column orders rows by bin position rather than by the
    # lexicographic order of the interval strings.
    dd = pd.DataFrame({
        '区间': interval_data,
        '分箱后数据': opt_bin.transform(values),
        '_bin': np.where(missing, -1, bin_index),
    })
    dd = dd.drop_duplicates().sort_values(by='_bin', kind='stable').drop(columns='_bin')
    bs[feature] = dd

    # Replace every value by the midpoint of its bin (-1 marks a missing value).
    return pd.Series(
        np.where(missing, -1, midpoints[bin_index]),
        index=values.index,
        name=feature,
    )

 
# Bin every feature and overwrite the raw column with its binned version.
for feature_name in list(X.columns):
    X[feature_name] = apply_optimal_binning(data=data, feature=feature_name, target=y)



In [16]:

import pwlf # type: ignore
from plotly import express as px
# model maps each feature to a fitted piecewise-linear curve from the binned
# feature value to the mean of '异常类型' observed in that bin.
model = dict()
dataxy = pd.concat([X, y], axis=1)
for col in features:
    # Mean target per distinct binned value, ordered by that value.
    dataxyg = dataxy.groupby(col)['异常类型'].mean().reset_index().sort_values(col)
    fig = px.scatter(dataxyg, x=col, y='异常类型', title=col)

    curve = pwlf.PiecewiseLinFit(dataxyg[col], dataxyg['异常类型'])
    if len(dataxyg) >= 2:
        # Put one breakpoint on every observed value. Asking fit() for as many
        # free segments as there are points is over-parameterised and runs an
        # unseeded stochastic search, so results changed from run to run;
        # fixed breakpoints give a deterministic least-squares fit.
        curve.fit_with_breaks(dataxyg[col].to_numpy())
    else:
        # A single distinct value leaves nothing to break on; keep the
        # one-segment fit the original call reduced to in this case.
        curve.fit(1)
    model[col] = curve

    fig.add_scatter(x=dataxyg[col], y=curve.predict(dataxyg[col]), mode='lines', name='拟合曲线')
    fig.show()

In [17]:

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# One row per observed combination of binned feature values, carrying the mean
# of '异常类型' for that combination.
dataxy_cov = dataxy.groupby(features).mean().reset_index()

# Replace every binned feature value with the prediction of its fitted curve.
for name in features:
    column_values = dataxy_cov[name].values
    dataxy_cov[name] = model[name].predict(column_values)

display(dataxy_cov.head(2))

# Ridge regression without intercept on the curve outputs.
r = Ridge(fit_intercept=False)
r.fit(dataxy_cov[features], dataxy_cov['异常类型'])

# Report the RMSE relative to the mean target, followed by the mean target.
target_mean = dataxy_cov['异常类型'].mean()
fitted = r.predict(dataxy_cov[features])
rmse = np.sqrt(mean_squared_error(dataxy_cov['异常类型'], fitted))
display(rmse / target_mean, target_mean)

# Fixed-point float display: turn off scientific notation, two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
np.set_printoptions(suppress=True, precision=2)
r.coef_, r.intercept_

Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,异常类型
0,0.24,0.26,0.26,0.16,0.19,0.2,0.26,0.09
1,0.24,0.26,0.26,0.16,0.3,0.26,0.26,0.13


0.45788618122722496

0.2560564077851739

(array([0.2 , 0.13, 0.06, 0.18, 0.11, 0.09, 0.22]), 0.0)

In [18]:
import copy
# Rebuild the table of observed feature-value combinations.
dataxyn = dataxy.groupby(features).mean().reset_index()

# Keep an independent copy of the untransformed feature values for reporting.
dataxyng = dataxyn[features].copy(deep=True)

# Map each feature value through its fitted curve, then score the rows with
# the Ridge model.
for name in features:
    dataxyn[name] = model[name].predict(dataxyn[name].values)
dataxyn['异常类型'] = r.predict(dataxyn[features])

# Attach the predicted rate to the readable table and list the lowest first.
dataxyng['异常率'] = dataxyn['异常类型']
dataxyng = dataxyng.sort_values('异常率', ascending=True)
dataxyng.head(10)

Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,异常率
163,14.35,5.43,0.01,53.93,24.7,28.15,23806.25,0.17
167,14.35,5.43,0.01,66.47,24.7,28.15,23806.25,0.18
164,14.35,5.43,0.01,53.93,25.05,28.15,23806.25,0.18
162,14.35,5.43,0.01,53.93,23.78,28.0,23806.25,0.18
166,14.35,5.43,0.01,53.93,25.65,28.15,23806.25,0.18
170,14.35,5.43,0.01,84.47,25.4,27.87,23806.25,0.19
97,11.94,5.43,0.01,36.25,25.05,28.15,23806.25,0.19
98,11.94,5.43,0.01,43.07,23.78,28.15,23806.25,0.21
161,14.35,5.43,0.01,43.07,23.78,27.87,23704.38,0.21
165,14.35,5.43,0.01,53.93,25.05,28.15,24014.38,0.21


In [19]:
# bsn maps feature -> {bin midpoint: [low, high] rounded to two decimals}.
# bsn2 gets one empty dict per feature and is not filled in this cell.
bsn = dict()
bsn2 = dict()
for k, vs in bs.items():
    bsn[k] = dict()
    bsn2[k] = dict()
    for v in vs['区间'].values:
        # The interval strings are produced by apply_optimal_binning in this
        # notebook (not external input); evaluate each one only once.
        bounds = eval(v)
        if len(bounds) != 2:
            # '[]' is the label apply_optimal_binning gives to values that fall
            # in no bin; their binned value is the sentinel -1. A real bin
            # whose midpoint is -1 takes precedence over this entry.
            bsn[k].setdefault(-1, [])
            continue
        low, high = bounds
        bsn[k][(low + high) / 2] = [round(low, 2), round(high, 2)]

for col in features:
    lookup = bsn[col]
    # Values that are already lists were converted by an earlier run of this
    # cell; passing them through keeps the cell safe to re-run.
    dataxyng[col] = dataxyng[col].apply(
        lambda x, lookup=lookup: x if isinstance(x, list) else lookup[x]
    )

dataxyng[:10]

Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,异常率
163,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[45.95, 61.9]","[24.55, 24.85]","[28.05, 28.25]","[23768.75, 23843.75]",0.17
167,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[61.9, 71.05]","[24.55, 24.85]","[28.05, 28.25]","[23768.75, 23843.75]",0.18
164,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[45.95, 61.9]","[24.85, 25.25]","[28.05, 28.25]","[23768.75, 23843.75]",0.18
162,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[45.95, 61.9]","[23.0, 24.55]","[27.95, 28.05]","[23768.75, 23843.75]",0.18
166,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[45.95, 61.9]","[25.45, 25.85]","[28.05, 28.25]","[23768.75, 23843.75]",0.18
170,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[71.05, 97.9]","[25.35, 25.45]","[27.8, 27.95]","[23768.75, 23843.75]",0.19
97,"[9.95, 13.93]","[5.36, 5.51]","[0.01, 0.01]","[32.3, 40.2]","[24.85, 25.25]","[28.05, 28.25]","[23768.75, 23843.75]",0.19
98,"[9.95, 13.93]","[5.36, 5.51]","[0.01, 0.01]","[40.2, 45.95]","[23.0, 24.55]","[28.05, 28.25]","[23768.75, 23843.75]",0.21
161,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[40.2, 45.95]","[23.0, 24.55]","[27.8, 27.95]","[23640.0, 23768.75]",0.21
165,"[13.93, 14.77]","[5.36, 5.51]","[0.01, 0.01]","[45.95, 61.9]","[24.85, 25.25]","[28.05, 28.25]","[23843.75, 24185.0]",0.21


In [24]:
# Mode: the most frequent interval of each feature within the first five rows
# of dataxyng.
for col in features:
    top_rows = dataxyng[col].head(5)
    mm = top_rows.mode().values[0]
    print(col, mm)

成品电导率 [13.93, 14.77]
成品PH值 [5.36, 5.51]
成品浊度 [45.95, 61.9]
成品温度 [24.55, 24.85]
成品表面张力 [28.05, 28.25]
成品COD [23768.75, 23843.75]


In [20]:
# data = pd.read_csv('../data/切割液数据.csv', sep='\t')

# features = [ 
#             '成品电导率', 
#             '成品PH值',  
#             '成品浊度',
#             '成品温度', 
#             '成品表面张力', 
#             '成品COD', 
#             # '喷淋流量',
#             # '在线流量'
#             # '柠檬酸', 
#             # '置换量', 
#             # '离子液添加量', 
#             # '原液添加比例', 
#             # '清液池电导率',
#             # '清液池PH', 
#             # '清液池浓度', 
#             # '清液池温度', 
#             # '清液池COD', 

#             ]

# features2 = [
#     '柠檬酸', 
#     '置换量', 
#     '离子液添加量', 
#     '原液添加比例',
# ]

# data['异常类型'] = data['异常类型'].apply(lambda x: 1 if x in ['断上下线', '断缝', '跳线', '断上线', '断下线'] else 0)
# data['柠檬酸'] = data['柠檬酸'].map({'柠檬酸80g': 80, '柠檬酸100g': 100, '转换阶段': 20, '0': 0})

# data = data[features + features2].copy()



# for col in features:
#     mpp = bsn[col]
#     def get_data(x):
#         for k, v in mpp.items():
#             if v[0] <= x <= v[1]:
#                 return str(v)
#         return -1
#     data[col] = data[col].apply(get_data)

# c1 = (data['成品电导率']=='[13.93, 14.77]')
# c2 = (data['成品PH值']=='[5.36, 5.51]')
# c3 = (data['成品表面张力']=='[28.05, 28.25]')
# c4 = (data['成品COD']=='[23768.75, 23843.75]')

# datasel = data[c1 & c2 & c3 & c4]
# datasel[features2].describe()