In [1]:
import pandas as pd
import numpy as np

# Columns read from the raw export: six measurement columns, four formula
# columns and the anomaly label.
ffs = [
    '清液池电导率',
    '清液池PH',
    '清液池浓度',
    '清液池温度',
    '清液池COD',
    '单刀供液量',
    '柠檬酸添加比例',
    '置换量',
    '离子液添加量',
    '原液添加比例',
    '异常类型',
]

# Load the tab-separated export, keep only the 'JU-C-32' wire type and then
# discard the wire-type column, which is only needed for this filter.
df = (
    pd.read_csv('../data/切割液数据0918.csv', sep='\t', usecols=ffs + ['钢线类型'], low_memory=False)
    .loc[lambda frame: frame['钢线类型'] == 'JU-C-32']
    .drop(columns='钢线类型')
    .copy()
)

print(f'初始数据量：{len(df)}')

# Measurement columns; used as the default `features` argument of remark().
features = [
    '清液池电导率',
    '清液池PH',
    '清液池浓度',
    '清液池温度',
    '清液池COD',
    '单刀供液量',
]

# Keep rows labelled 断缝 / 跳线, rows with an empty label, and rows with no label.
error_type = ['断缝', '跳线', '']
df = df[df['异常类型'].isin(error_type) | df['异常类型'].isna()].copy()
print(f'断缝/正常/跳线数据量：{len(df)}')
# 异常类型2 keeps the textual class (anything that is not 断缝/跳线 becomes 正常);
# 异常类型 becomes the binary target (1 = 断缝 or 跳线, 0 = otherwise).
df['异常类型2'] = df['异常类型'].apply(lambda x: x if x in ['断缝', '跳线'] else '正常')
df['异常类型'] = df['异常类型'].apply(lambda x: 1 if x in ['断缝', '跳线'] else 0)
df['柠檬酸添加比例'] = df['柠檬酸添加比例'].replace('#VALUE!', np.nan)


# Treat the remaining error marker as missing and drop every incomplete row.
df.replace('#DIV/0!', np.nan, inplace=True)
df.dropna(how='any', inplace=True)


def _percent_to_float(value):
    """Convert one cell to float, scaling by 1/100 only when it ends with '%'.

    '6.000%' -> 0.06, while '0.06' or 0.06 -> 0.06. Scaling per value keeps
    plain numbers in a mixed column from being divided by 100 as well.
    """
    text = str(value).strip()
    if text.endswith('%'):
        return float(text[:-1]) / 100
    return float(text)


for c in df.columns:
    if c not in ['异常类型2', '钢线类型']:
        try:
            df[c] = df[c].astype(float)
        except (ValueError, TypeError) as e:
            # The column contains non-numeric text such as '6.000%'; report it
            # and fall back to per-value parsing.
            print(c, e)
            df[c] = df[c].apply(_percent_to_float)

初始数据量：11412
断缝/正常/跳线数据量：10458
置换量 could not convert string to float: '6.000%'
离子液添加量 could not convert string to float: '0.00%'
清液池浓度 could not convert string to float: '1.23%'


## 分箱

需要刀数，统计异常率，拟合异常率


In [2]:
import pandas as pd
from optbinning import OptimalBinning
import numpy as np
import pwlf # type: ignore
from sklearn.linear_model import Ridge
import copy
import itertools

# global bs
# bs = dict()

def apply_optimal_binning(data, feature, target):
    '''
    Bin one numeric feature with OptimalBinning and describe each row's bin.

    The fitted split points are extended with the observed minimum and maximum
    of the feature to form closed intervals. A value that equals a split point
    is assigned to the lower of the two adjacent intervals.
    NOTE(review): OptimalBinning itself is believed to treat bins as
    right-open; confirm that lower-bin assignment at a split is intended.

    :data: DataFrame containing the column `feature`
    :feature: name of the column to bin
    :target: target values passed to OptimalBinning.fit
    :return: (binned_data_mean, binned_data_interval), two Series sharing the
        index of `data[feature]`: the midpoint of each row's interval (-1 for
        missing values) and its label "[low, high]" rounded to 4 decimals
        ("[]" for missing values)
    '''

    opt_bin = OptimalBinning(name=feature, dtype="numerical", min_bin_size=0.005, min_bin_n_event=10)
    opt_bin.fit(data[feature], target)

    values = data[feature]
    splits = np.asarray(opt_bin.splits, dtype=float)

    # Interval edges: observed minimum, fitted splits, observed maximum.
    edges = np.concatenate(([float(values.min())], splits, [float(values.max())]))
    lower, upper = edges[:-1], edges[1:]
    midpoints = (lower + upper) / 2
    # Cast to plain Python floats before formatting so the label text does not
    # depend on how the installed NumPy version prints its scalar types.
    labels = np.array(
        [str([round(float(a), 4), round(float(b), 4)]) for a, b in zip(lower, upper)],
        dtype=object,
    )

    raw = values.to_numpy(dtype=float)
    missing = np.isnan(raw)
    # side='left' sends a value equal to a split into the lower interval,
    # which is what a first-match scan over closed intervals yields.
    bin_index = np.searchsorted(splits, raw, side='left')

    mean_values = midpoints[bin_index]
    mean_values[missing] = -1
    interval_values = labels[bin_index]
    interval_values[missing] = str([])

    binned_data_mean = pd.Series(mean_values, index=values.index, name=values.name)
    binned_data_interval = pd.Series(interval_values, index=values.index, name=values.name, dtype=object)

    return binned_data_mean, binned_data_interval


def remark(dataall, features=features, features_formul=['柠檬酸添加比例', '置换量', '离子液添加量', '原液添加比例']):
    '''
    Bin the measurement features, estimate an anomaly rate per bin combination
    and attach one formula to each combination.

    Steps:
    1. Every column in `features` is binned with apply_optimal_binning.
    2. For each feature, a piecewise-linear model (pwlf) is fitted to the mean
       of '异常类型' per bin midpoint.
    3. Rows are grouped by bin combination to get the observed anomaly rate
       ('异常率统计') and the row count ('刀数').
    4. A Ridge regression without intercept maps the per-feature piecewise
       predictions to the observed rate, giving '异常率估计'.
    5. For each bin combination, the matching source rows are sorted ascending
       by all formula columns except the last, and the first row's formula
       values are stored as a dict in '配比'.

    :dataall: DataFrame containing `features`, `features_formul` and '异常类型'
    :features: measurement columns to bin
    :features_formul: formula columns
    :return: DataFrame with one '<feature>区间' column per feature plus
        '异常率统计', '异常率估计', '刀数' and '配比', sorted ascending by
        '异常率估计'
    '''
    data = dataall[features + ['异常类型']].copy()
    # X1 will hold bin midpoints, X2 the interval labels; y is the target.
    X1 = data.drop(columns=['异常类型'])
    X2 = copy.deepcopy(X1)
    y = data['异常类型']

    # Replace every raw feature value by its bin midpoint / interval label.
    for col in X1.columns:
        X1[col], X2[col] = apply_optimal_binning(data, col, y)

    # Interval-label columns get the suffix '区间'.
    features2 = [x + '区间' for x in features]
    X2 = X2[features]
    X2.columns = features2

    featuresplus = features + features2

    # One piecewise-linear model per feature: bin midpoint -> mean anomaly rate.
    modelpwlfs = dict()
    dataxy = pd.concat([X1, X2, y], axis=1)
    for col in features:
        dataxyg = dataxy[[col, '异常类型']].groupby(col).mean().reset_index()
        dataxyg = dataxyg.sort_values(col)
        r = pwlf.PiecewiseLinFit(dataxyg[col], dataxyg['异常类型'])
        # The number of segments equals the number of distinct bin midpoints.
        r.fit(len(dataxyg))
        modelpwlfs[col] = r

    # Observed anomaly rate and row count for every combination of bins.
    dataxy_cov = dataxy.groupby(featuresplus).agg(
        异常率统计=('异常类型', 'mean'),
        刀数=('异常类型', 'count')
        ).reset_index()
    dataxy_cov_pwlf = copy.deepcopy(dataxy_cov)

    # Replace each midpoint by the single-feature piecewise prediction.
    for col in features:
        dataxy_cov_pwlf[col] = modelpwlfs[col].predict(dataxy_cov_pwlf[col].values)

    r = Ridge(fit_intercept=False)
    r.fit(dataxy_cov_pwlf[features], dataxy_cov_pwlf['异常率统计'])

    # Rank the bin combinations by estimated anomaly rate, lowest first.
    dataxy_cov['异常率估计'] = r.predict(dataxy_cov_pwlf[features])
    dataxy_cov.sort_values('异常率估计', ascending=True, inplace=True)

    # Interval labels joined with the formula columns on the shared row index.
    # Built once here because it is identical for every bin combination.
    formula_table = pd.concat([X2, dataall[features_formul]], axis=1)

    def get_best_formula(x, features_formul):
        '''
        Return the formula of the first matching source row for one bin combination.

        :x: Series mapping each '<feature>区间' column to the wanted label
        :features_formul: formula columns
        :return: dict of formula column -> value
        '''
        candidates = formula_table
        for k, v in x.to_dict().items():
            candidates = candidates[candidates[k] == v]

        # Sorting returns a new frame, so the filtered slice is never mutated.
        candidates = candidates.sort_values(by=features_formul[:-1], ascending=True)
        return candidates[features_formul].iloc[0].to_dict()

    dataxy_cov['配比'] = dataxy_cov.apply(lambda x: get_best_formula(x[features2], features_formul), axis=1)
    return dataxy_cov[features2 + ['异常率统计', '异常率估计', '刀数', '配比']]



In [3]:
# Prefix of the CSV file names written under ../model/ by the aggregation cells.
name = '0918清液池'

In [4]:
# Training set: 断缝 rows followed by 正常 rows, restricted to the `ffs`
# columns and then shuffled.
data11 = df.loc[df['异常类型2'] == '正常'].copy()
data1 = pd.concat([df.loc[df['异常类型2'] == '断缝'].copy(), data11], axis=0)[ffs]
data1 = data1.sample(frac=1).reset_index(drop=True)
print('断缝最优比例')
data = remark(data1)

# Turn the per-combination rates into counts so they can be summed per formula.
for rate_col in ('异常率统计', '异常率估计'):
    data[rate_col] = data[rate_col] * data['刀数']
# The formula dicts are not hashable; use their string form as the group key.
data['配比'] = data['配比'].map(str)

datag = (
    data.groupby('配比')
    .agg(
        刀数=('刀数', 'sum'),
        异常率统计=('异常率统计', 'sum'),
        异常率估计=('异常率估计', 'sum'),
    )
    .reset_index()
)

# Back from summed counts to rates at formula level.
for rate_col in ('异常率统计', '异常率估计'):
    datag[rate_col] = datag[rate_col] / datag['刀数']
datag = datag.sort_values('异常率统计').rename(
    columns={'异常率统计': '配比异常率统计', '异常率估计': '配比异常率估计', '刀数': '配比刀数'}
)

# Attach the formula-level figures to every bin combination and export.
df2 = data.merge(datag, on='配比', how='inner')
df2.to_csv(f'../model/{name}_断缝聚合.csv', index=False)
df2.head(5)

断缝最优比例


Unnamed: 0,清液池电导率区间,清液池PH区间,清液池浓度区间,清液池温度区间,清液池COD区间,单刀供液量区间,异常率统计,异常率估计,刀数,配比,配比刀数,配比异常率统计,配比异常率估计
0,"[7.76, 8.365]","[5.32, 5.445]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22972.5]","[4.935, 5.065]",8.0,12.898713,193,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1011,0.053412,0.076844
1,"[7.525, 7.76]","[5.585, 5.625]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22972.5]","[4.935, 5.065]",1.0,2.714552,40,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1011,0.053412,0.076844
2,"[8.365, 9.145]","[5.32, 5.445]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22972.5]","[4.935, 5.065]",2.0,3.101536,45,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1011,0.053412,0.076844
3,"[7.76, 8.365]","[5.32, 5.445]","[0.0119, 0.0121]","[23.65, 24.55]","[22862.5, 22972.5]","[4.935, 5.065]",4.0,3.03211,43,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1011,0.053412,0.076844
4,"[7.76, 8.365]","[5.445, 5.515]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22972.5]","[4.935, 5.065]",2.0,3.107898,44,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1011,0.053412,0.076844


In [5]:
# Training set: 跳线 rows followed by 正常 rows, restricted to the `ffs`
# columns and then shuffled.
data1 = df[df['异常类型2'].isin(['跳线'])].copy()
data11 = df[df['异常类型2'].isin(['正常'])].copy()
data1 = pd.concat([data1, data11], axis=0)
data1 = data1[ffs]
data1 = data1.sample(frac=1).reset_index(drop=True)
# This cell analyses 跳线, so the printed heading names 跳线 (it used to
# repeat the 断缝 heading).
print('跳线最优比例')
data = remark(data1)
# Turn the per-combination rates into counts so they can be summed per formula.
data['异常率统计'] = data['异常率统计'] * data['刀数']
data['异常率估计'] = data['异常率估计'] * data['刀数']
# The formula dicts are not hashable; use their string form as the group key.
data['配比'] = data['配比'].apply(lambda x: str(x))
datag = data.groupby('配比').agg(
    刀数=('刀数', 'sum'),
    异常率统计=('异常率统计', 'sum'),
    异常率估计=('异常率估计', 'sum')
    ).reset_index()

# Back from summed counts to rates at formula level.
datag['异常率统计'] = datag['异常率统计'] / datag['刀数']
datag['异常率估计'] = datag['异常率估计'] / datag['刀数']
datag = datag.sort_values('异常率统计')
datag = datag.rename(columns={'异常率统计': '配比异常率统计', '异常率估计': '配比异常率估计', '刀数': '配比刀数'})
# Attach the formula-level figures to every bin combination and export.
df2 = pd.merge(data, datag, on='配比', how='inner')
df2.to_csv(f'../model/{name}_跳线聚合.csv', index=False)
df2.head(5)

断缝最优比例


Unnamed: 0,清液池电导率区间,清液池PH区间,清液池浓度区间,清液池温度区间,清液池COD区间,单刀供液量区间,异常率统计,异常率估计,刀数,配比,配比刀数,配比异常率统计,配比异常率估计
0,"[8.045, 8.16]","[5.32, 5.415]","[0.0119, 0.0121]","[22.1, 23.85]","[22862.5, 22928.75]","[4.915, 5.145]",4.0,5.430492,49,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1025,0.099512,0.117461
1,"[8.045, 8.16]","[5.415, 5.475]","[0.0119, 0.0121]","[24.55, 25.15]","[23381.25, 24377.5]","[4.915, 5.145]",1.0,3.327271,30,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1025,0.099512,0.117461
2,"[7.81, 8.045]","[5.415, 5.475]","[0.0121, 0.0122]","[22.1, 23.85]","[22862.5, 22928.75]","[4.915, 5.145]",4.0,5.070165,45,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1025,0.099512,0.117461
3,"[7.81, 8.045]","[5.415, 5.475]","[0.0119, 0.0121]","[22.1, 23.85]","[22928.75, 23092.5]","[4.915, 5.145]",4.0,4.850956,43,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1025,0.099512,0.117461
4,"[8.16, 9.035]","[5.415, 5.475]","[0.0119, 0.0121]","[22.1, 23.85]","[22862.5, 22928.75]","[4.915, 5.145]",7.0,5.670576,50,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1025,0.099512,0.117461


In [6]:
# Training set: 断缝 and 跳线 rows followed by 正常 rows, restricted to the
# `ffs` columns and then shuffled.
data1 = df[df['异常类型2'].isin(['断缝', '跳线'])].copy()
data11 = df[df['异常类型2'].isin(['正常'])].copy()
data1 = pd.concat([data1, data11], axis=0)
data1 = data1[ffs]
data1 = data1.sample(frac=1).reset_index(drop=True)
# This cell analyses both anomaly classes together, so the printed heading
# names both (it used to repeat the 断缝 heading).
print('断缝+跳线最优比例')
data = remark(data1)
# Turn the per-combination rates into counts so they can be summed per formula.
data['异常率统计'] = data['异常率统计'] * data['刀数']
data['异常率估计'] = data['异常率估计'] * data['刀数']
# The formula dicts are not hashable; use their string form as the group key.
data['配比'] = data['配比'].apply(lambda x: str(x))
datag = data.groupby('配比').agg(
    刀数=('刀数', 'sum'),
    异常率统计=('异常率统计', 'sum'),
    异常率估计=('异常率估计', 'sum')
    ).reset_index()

# Back from summed counts to rates at formula level.
datag['异常率统计'] = datag['异常率统计'] / datag['刀数']
datag['异常率估计'] = datag['异常率估计'] / datag['刀数']
datag = datag.sort_values('异常率统计')
datag = datag.rename(columns={'异常率统计': '配比异常率统计', '异常率估计': '配比异常率估计', '刀数': '配比刀数'})
# Attach the formula-level figures to every bin combination and export.
df2 = pd.merge(data, datag, on='配比', how='inner')
df2.to_csv(f'../model/{name}_断缝+跳线聚合.csv', index=False)
df2.head(5)

断缝最优比例


Unnamed: 0,清液池电导率区间,清液池PH区间,清液池浓度区间,清液池温度区间,清液池COD区间,单刀供液量区间,异常率统计,异常率估计,刀数,配比,配比刀数,配比异常率统计,配比异常率估计
0,"[7.585, 7.735]","[5.475, 5.95]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22928.75]","[4.935, 5.065]",4.0,7.200034,43,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1113,0.143756,0.183942
1,"[7.585, 7.735]","[5.475, 5.95]","[0.0119, 0.0121]","[23.35, 23.95]","[22862.5, 22928.75]","[4.935, 5.065]",8.0,9.537087,56,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1113,0.143756,0.183942
2,"[7.735, 8.16]","[5.32, 5.425]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22928.75]","[4.935, 5.065]",7.0,8.857272,52,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1113,0.143756,0.183942
3,"[7.585, 7.735]","[5.475, 5.95]","[0.0121, 0.0122]","[22.1, 23.35]","[22862.5, 22928.75]","[4.935, 5.065]",4.0,6.618988,38,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1113,0.143756,0.183942
4,"[8.16, 8.45]","[5.32, 5.425]","[0.0119, 0.0121]","[22.1, 23.35]","[22862.5, 22928.75]","[4.935, 5.065]",7.0,10.316056,59,"{'柠檬酸添加比例': 0.0, '置换量': 0.04, '离子液添加量': 0.06, ...",1113,0.143756,0.183942


## 聚类

In [7]:
# mdf = pd.DataFrame(data['配比'].tolist())

# # 聚类
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import silhouette_score

# scaler = StandardScaler()
# X = mdf.values
# X = scaler.fit_transform(X)
# n_clusters = range(2, 15)
# silhouette = []
# for n in n_clusters:
#     kmeans = KMeans(n_clusters=n, random_state=0)
#     kmeans.fit(X)
#     silhouette.append(silhouette_score(X, kmeans.labels_))

# import plotly.express as px
# fig = px.line(x=n_clusters, y=silhouette, title='最佳聚类数')
# fig.show()
