In [1]:
import pandas as pd

import numpy as np

# Columns read from the raw export (the wire-type column is added on top of
# these only so that the rows can be filtered by it).
ffs = [
    '成品电导率', '成品PH值', '成品浓度', '成品浊度',
    '成品温度', '成品表面张力', '成品COD',
    '柠檬酸', '置换量', '离子液添加量', '原液添加比例',
    '异常类型', '单耗',
]

# Keep only the rows of one wire type, then discard the wire-type column.
df = pd.read_csv('../data/切割液数据09132.csv', sep='\t', usecols=ffs + ['钢线类型'])
df = df.loc[df['钢线类型'] == 'JU-C-32'].drop(columns=['钢线类型'])

print(f'初始数据量：{len(df)}')

# Columns that are binned and modelled by the later cells.
# Currently left out: '柠檬酸', '置换量', '离子液添加量', '原液添加比例',
# '清液池电导率', '清液池PH', '清液池浓度', '清液池温度', '清液池COD'.
features = [
    '成品电导率', '成品PH值', '成品浓度', '成品浊度',
    '成品温度', '成品表面张力', '成品COD', '单耗',
]

# Rows whose anomaly label is one of these values, or missing, are kept.
error_type = ['断缝', '跳线', '']
keep_rows = df['异常类型'].isin(error_type) | df['异常类型'].isna()
df = df.loc[keep_rows].copy()
print(f'断缝/正常/跳线数据量：{len(df)}')

# '异常类型2' keeps the textual label (anything that is not one of the two
# anomaly names becomes '正常'); '异常类型' becomes a 0/1 flag.
is_abnormal = df['异常类型'].isin(['断缝', '跳线'])
df['异常类型2'] = df['异常类型'].where(is_abnormal, '正常')
df['异常类型'] = is_abnormal.astype('int64')

# Labels missing from this mapping become NaN and are removed by the dropna below.
citric_acid_codes = {'柠檬酸80g': 80, '柠檬酸100g': 100, '转换阶段': 40, '0': 0}
df['柠檬酸'] = df['柠檬酸'].map(citric_acid_codes)

# Treat spreadsheet division errors as missing and drop every incomplete row.
df = df.replace('#DIV/0!', np.nan).dropna(how='any').copy()


def _percent_to_fraction(text):
    """Convert a string such as '1.28%' into the fraction 0.0128."""
    return float(text.replace('%', '')) / 100


# Cast everything except the textual label to float. When the cast fails the
# error and the column name are printed and the column is parsed as percent
# strings instead.
# NOTE(review): the fallback runs for any exception and divides every value in
# the column by 100 - confirm that such columns contain only 'NN%' strings.
for column in df.columns:
    if column in ('异常类型2', '钢线类型'):
        continue
    try:
        df[column] = df[column].astype(float)
    except Exception as err:
        print(err)
        print(column)
        df[column] = df[column].apply(_percent_to_fraction)

初始数据量：11552
断缝/正常/跳线数据量：10597
could not convert string to float: '1.28%'
成品浓度
could not convert string to float: '6.00%'
置换量
could not convert string to float: '0.00%'
离子液添加量


## 分箱

In [12]:
import pandas as pd
from optbinning import OptimalBinning

# Module-level registry filled by remark(): feature name -> DataFrame listing
# the observed intervals and the matching binned values.
bs = {}


def remark(data, name):
    """Bin the features, model the anomaly rate and report the best setting.

    Steps, in order:
      1. Every column listed in the notebook-level ``features`` is binned with
         ``OptimalBinning`` against the '异常类型' column, and each value is
         replaced by the midpoint of the interval it falls into.
      2. Per feature, the mean of '异常类型' per midpoint is fitted with a
         ``pwlf.PiecewiseLinFit`` model.
      3. The per-feature models transform every observed combination of
         midpoints, and a ``Ridge`` regression is fitted on the result.
      4. The combination with the lowest predicted rate is displayed, and the
         rows of the notebook-level ``df`` that fall into that combination are
         shown with their '柠檬酸', '置换量', '离子液添加量', '原液添加比例' values.

    Side effects: prints and displays intermediate tables, fills the
    module-level ``bs`` dict, and changes the pandas/numpy display options.

    Args:
        data: DataFrame holding the ``features`` columns plus '异常类型'.
        name: Label used in plot titles and printed headings.

    Returns:
        Tuple ``(dataxy, model)``: the midpoint-binned frame (features plus
        '异常类型') and a dict mapping each feature to its fitted piecewise model.
    """
    import ast

    import pwlf  # type: ignore
    from plotly import express as px
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error

    # Set the display format before the first table is shown so that every
    # table produced by this call is formatted the same way (no scientific
    # notation, two decimals).
    pd.set_option('display.float_format', '{:.2f}'.format)
    np.set_printoptions(suppress=True, precision=2)

    # X holds the features, y the target flag.
    X = data.drop(columns=['异常类型'])
    y = data['异常类型']

    def apply_optimal_binning(data, feature, target):
        """Bin one feature and return its values replaced by interval midpoints."""
        opt_bin = OptimalBinning(name=feature, dtype="numerical", min_bin_size=0.05, min_bin_n_event=10)
        opt_bin.fit(data[feature], target)

        # Full list of edges: column minimum, the fitted split points, column
        # maximum. Plain Python floats are used so that str([a, b]) below is
        # always a parseable literal, however numpy prints its scalars.
        edges = [float(v) for v in (data[feature].min(), *opt_bin.splits, data[feature].max())]
        interval_pairs = list(zip(edges[:-1], edges[1:]))

        def find_interval(x):
            """Return the first (a, b) with a <= x <= b, or None."""
            for a, b in interval_pairs:
                if a <= x <= b:
                    return a, b
            return None

        def get_interval(x):
            hit = find_interval(x)
            return str(list(hit)) if hit is not None else str([])

        def get_data(x):
            hit = find_interval(x)
            return (hit[0] + hit[1]) / 2 if hit is not None else -1

        # Remember which interval labels occur together with which transformed
        # (binned) values for this feature.
        dd = pd.DataFrame({
            '区间': data[feature].apply(get_interval),
            '分箱后数据': opt_bin.transform(data[feature]),
        })
        bs[feature] = dd.drop_duplicates().sort_values(by='区间')

        return data[feature].apply(get_data)

    # Replace every feature column by its interval midpoints.
    for col in features:
        X[col] = apply_optimal_binning(data, col, y)

    # Per-feature piecewise-linear fit of the mean target per midpoint.
    model = dict()
    dataxy = pd.concat([X, y], axis=1)
    for col in features:
        # The original discarded the result of sort_values(inplace=False).
        dataxyg = dataxy.groupby(col).mean().reset_index().sort_values(col)
        fig = px.scatter(dataxyg, x=col, y='异常类型', title=f'{name}的{col}作用')
        fit = pwlf.PiecewiseLinFit(dataxyg[col], dataxyg['异常类型'])
        fit.fit(len(dataxyg))
        model[col] = fit
        fig.add_scatter(x=dataxyg[col], y=fit.predict(dataxyg[col]), mode='lines', name='拟合曲线')
        # fig.show()

    # One row per observed combination of midpoints, with the mean target.
    grouped = dataxy.groupby(features).mean().reset_index()

    # Same rows with every feature passed through its piecewise model.
    dataxy_cov = grouped.copy()
    for col in features:
        dataxy_cov[col] = model[col].predict(grouped[col].values)

    display(dataxy_cov.head(2))

    ridge = Ridge(fit_intercept=True)
    ridge.fit(dataxy_cov[features], dataxy_cov['异常类型'])
    predicted = ridge.predict(dataxy_cov[features])
    # ess is the square root of the mean squared error.
    ess = np.sqrt(mean_squared_error(dataxy_cov['异常类型'], predicted))
    print(f'''均方误差：{ess}''')
    print(f'''误差百分比：{ess/dataxy_cov['异常类型'].mean()}''')

    # Midpoint combinations with the historical and the predicted rate,
    # lowest predicted rate first.
    dataxyng = grouped[features].copy()
    dataxyng['异常率历史'] = dataxy_cov['异常类型']
    dataxyng['异常率'] = predicted
    dataxyng = dataxyng.sort_values('异常率', ascending=True)

    # feature -> {midpoint: [lower, upper] rounded to 4 decimals}
    bsn = dict()
    for k, vs in bs.items():
        bsn[k] = dict()
        for v in vs['区间'].values:
            bounds = ast.literal_eval(v)  # safe literal parsing instead of eval()
            if not bounds:
                # '[]' marks values that matched no interval; nothing to map.
                continue
            lower, upper = bounds
            bsn[k][(lower + upper) / 2] = [round(lower, 4), round(upper, 4)]

    # Show the intervals instead of their midpoints.
    for col in features:
        dataxyng[col] = dataxyng[col].apply(lambda x: bsn[col][x])

    display(dataxyng[:1])

    # Best setting = the intervals of the row with the lowest predicted rate.
    best_sel = dict()
    print(f'{name}最佳配方:')
    for col in features:
        mm = dataxyng[col].iloc[0]
        print(col, mm)
        best_sel[col] = mm

    features2 = [
        '柠檬酸',
        '置换量',
        '离子液添加量',
        '原液添加比例',
    ]
    # NOTE(review): this reads the notebook-level df, not the `data` argument,
    # so rows outside the subset passed in can also match - confirm intended.
    data2 = df[features + features2].copy()
    for col in features:
        mpp = bsn[col]

        def get_data(x, bins=mpp):
            """Return the label of the first rounded interval containing x, else -1."""
            for bounds in bins.values():
                if bounds[0] <= x <= bounds[1]:
                    return str(bounds)
            return -1

        data2[col] = data2[col].apply(get_data)

    # Keep only the rows that fall into the best interval of every feature.
    datasel = data2.copy()
    for k, v in best_sel.items():
        datasel = datasel[datasel[k] == str(v)].copy()

    display(datasel[features2].drop_duplicates(), len(datasel))

    return dataxy, model

# data2 = df[df['异常类型2'].isin(['跳线', '正常'])].copy()
# data2 = data2[features + ['异常类型']]
# dataxy2 = remark(data2, '跳线')

In [13]:
# Rows labelled '断缝' followed by the rows labelled '正常'.
# (Down-sampling of the normal rows is disabled:
#  data11.sample(n=len(data1)*3, random_state=666))
data11 = df.loc[df['异常类型2'] == '正常'].copy()
data1 = pd.concat(
    [df.loc[df['异常类型2'] == '断缝'], data11],
    axis=0,
).loc[:, features + ['异常类型']]
print(len(data1))
dataxy1, model1 = remark(data1, '断缝')

8178


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常类型
0,0.14,0.09,0.09,0.1,0.09,0.1,0.11,0.11,0.14
1,0.14,0.1,0.09,0.1,0.12,0.1,0.12,0.1,0.1


均方误差：0.11171801891679123
误差百分比：1.1298723864943105


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常率历史,异常率
545,"[8.705, 13.465]","[5.985, 6.095]","[0.0122, 0.0129]","[51.05, 53.45]","[25.65, 25.75]","[27.85, 27.95]","[22972.5, 23241.25]","[3.15, 3.595]",0.05,0.08


断缝最佳配方:
成品电导率 [8.705, 13.465]
成品PH值 [5.985, 6.095]
成品浓度 [0.0122, 0.0129]
成品浊度 [51.05, 53.45]
成品温度 [25.65, 25.75]
成品表面张力 [27.85, 27.95]
成品COD [22972.5, 23241.25]
单耗 [3.15, 3.595]


Unnamed: 0,柠檬酸,置换量,离子液添加量,原液添加比例
11263,0.0,0.04,0.06,0.61


24

In [14]:
# Rows labelled '跳线' followed by the rows labelled '正常'.
# (Down-sampling of the normal rows is disabled:
#  data11.sample(n=len(data1)*3, random_state=666))
data1 = df.loc[df['异常类型2'] == '跳线'].copy()
data11 = df.loc[df['异常类型2'] == '正常'].copy()
data2 = pd.concat([data1, data11], axis=0).loc[:, features + ['异常类型']]
print(len(data2))
dataxy2, model2 = remark(data2, '跳线')

8479


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常类型
0,0.13,0.14,0.13,0.12,0.13,0.13,0.13,0.11,0.0
1,0.13,0.14,0.13,0.12,0.11,0.13,0.13,0.11,0.0


均方误差：0.12035173028795346
误差百分比：0.9810167976553351


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常率历史,异常率
340,"[7.415, 9.195]","[6.115, 6.135]","[0.013, 0.0132]","[30.725, 35.6]","[25.75, 26.15]","[27.7, 27.85]","[23858.75, 25875.0]","[4.565, 5.69]",0.0,0.11


跳线最佳配方:
成品电导率 [7.415, 9.195]
成品PH值 [6.115, 6.135]
成品浓度 [0.013, 0.0132]
成品浊度 [30.725, 35.6]
成品温度 [25.75, 26.15]
成品表面张力 [27.7, 27.85]
成品COD [23858.75, 25875.0]
单耗 [4.565, 5.69]


Unnamed: 0,柠檬酸,置换量,离子液添加量,原液添加比例
9953,0.0,0.05,0.0,0.67


6

In [15]:
# Rows labelled '跳线' or '断缝' followed by the rows labelled '正常'.
# (Down-sampling of the normal rows is disabled:
#  data11.sample(n=len(data1)*3, random_state=666))
abnormal_mask = df['异常类型2'].isin(['跳线', '断缝'])
normal_mask = df['异常类型2'].isin(['正常'])
data1 = df.loc[abnormal_mask].copy()
data11 = df.loc[normal_mask].copy()
data3 = pd.concat([data1, data11], axis=0).loc[:, features + ['异常类型']]
print(len(data3))
dataxy3, model3 = remark(data3, '跳线+断缝')

9274


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常类型
0,0.24,0.2,0.21,0.16,0.22,0.2,0.2,0.21,0.18
1,0.24,0.2,0.21,0.2,0.22,0.2,0.2,0.2,0.17


均方误差：0.13769970696793243
误差百分比：0.69424004466637


Unnamed: 0,成品电导率,成品PH值,成品浓度,成品浊度,成品温度,成品表面张力,成品COD,单耗,异常率历史,异常率
412,"[9.055, 9.155]","[5.545, 6.135]","[0.0126, 0.013]","[30.725, 35.6]","[26.25, 27.9]","[27.85, 27.95]","[23668.75, 25875.0]","[4.565, 5.69]",0.0,0.17


跳线+断缝最佳配方:
成品电导率 [9.055, 9.155]
成品PH值 [5.545, 6.135]
成品浓度 [0.0126, 0.013]
成品浊度 [30.725, 35.6]
成品温度 [26.25, 27.9]
成品表面张力 [27.85, 27.95]
成品COD [23668.75, 25875.0]
单耗 [4.565, 5.69]


Unnamed: 0,柠檬酸,置换量,离子液添加量,原液添加比例
10297,100.0,0.05,0.0,0.67


5

In [16]:
import plotly.graph_objs as go
import pwlf

# Binned frames returned by remark(), keyed by the label shown in the legend.
dd = {
    '跳线': dataxy2,
    '断缝': dataxy1,
    '跳线+断缝': dataxy3,
}

# One figure per feature: for each data set, the mean of '异常类型' per binned
# value as markers plus a piecewise-linear fit through those means as a line.
for col in features:
    fig = go.Figure()
    for label, frame in dd.items():
        means = frame.groupby(col).mean().reset_index().sort_values(col)
        xs, ys = means[col], means['异常类型']
        fig.add_scatter(x=xs, y=ys, mode='markers', name=label)

        fitted = pwlf.PiecewiseLinFit(xs, ys)
        fitted.fit(len(means))
        fig.add_scatter(x=xs, y=fitted.predict(xs), mode='lines', name=f'「{label}」拟合')

    # The layout does not depend on the data set, so it is applied once per figure.
    fig.update_layout(title=f'{col}作用', xaxis_title=col, yaxis_title='异常率')
    fig.show()
        
