# 1. Bulid the datset of the unidentified workers

# 2. Define the file dir of datasets

In [None]:
# GT_FILE = 'data/AQ_GTD.csv'
# UN_FILE = 'data/AQ_unkown.csv'
GT_FILE = 'data/AEP_GTD.csv'
UN_FILE = 'data/AEP_unkown.csv'

# 3. OC-SVM sensitivity, specificity，accuracy，F1-score

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import normalize
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 加载 CSV 文件
data = pd.read_csv(GT_FILE)
unidentified_data = pd.read_csv(UN_FILE)
labels = unidentified_data['Label'].values
unidentified_data = unidentified_data.drop(columns=['Label'])

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
unidentified_data_scaled = scaler.transform(unidentified_data)
# Convert the standardized data arrays back to dataframes
data = pd.DataFrame(data_scaled,columns=data.columns)
unidentified_data = pd.DataFrame(unidentified_data_scaled,columns=unidentified_data.columns)

n_columns = data.shape[1]
n_columns_1 = unidentified_data.shape[1]
sensitivity_list = []
specificity_list = []
accuracy_list = []
f1_score_list = []
# 每两列进行异常检测
for i in range(1, n_columns, 2):
    X = data.iloc[:, i:i + 2].values
#     使用 One-Class SVM 进行异常检测
    clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05)
    clf.fit(X)
    X_2 = unidentified_data.iloc[:, i:i + 2].values
    y_pred = clf.predict(X_2)
    # 将 One-Class SVM 的输出转换为与原始标签相同的格式（1 表示正常，0 表示异常）
    y_pred = np.where(y_pred == 1, 1, 0)
    # 计算混淆矩阵
    tn, fp, fn, tp = confusion_matrix(labels, y_pred).ravel()
    # 计算灵敏度（召回率）、特异度、正确率和 F1-score
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    accuracy = accuracy_score(labels, y_pred)
    f1 = f1_score(labels, y_pred)
    # 保存每一次的结果
    sensitivity_list.append(sensitivity)
    specificity_list.append(specificity)
    accuracy_list.append(accuracy)
    f1_score_list.append(f1)
print("Sensitivity:", sensitivity_list)
print("Specificity:", specificity_list)
print("Accuracy:", accuracy_list)
print("F1-score:", f1_score_list)

Sensitivity: [0.975, 0.95, 0.95, 0.975, 0.95, 0.95, 0.95, 0.925, 0.975, 0.975]
Specificity: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Accuracy: [0.99, 0.98, 0.98, 0.99, 0.98, 0.98, 0.98, 0.97, 0.99, 0.99]
F1-score: [0.9873417721518987, 0.9743589743589743, 0.9743589743589743, 0.9873417721518987, 0.9743589743589743, 0.9743589743589743, 0.9743589743589743, 0.961038961038961, 0.9873417721518987, 0.9873417721518987]


# 4. MTI sensitivity, specificity，accuracy，F1-score

In [3]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
def main():
    # 读取CSV文件
    df = pd.read_csv(UN_FILE)
    # df = df.iloc[:].drop('Unnamed: 0',axis=1)
    # 转换为数值类型
    df = df.apply(pd.to_numeric, errors='coerce')
    # 计算每一列的均值
    mean_values = df.mean()
    # 设置阈值
    threshold = 2
    # 遍历所有列并计算灵敏度、特异度、正确率和F1-score
    for column in df.columns:
        if column == 'Label':
            continue
        # 使用均值和阈值作为分类器
        df['predicted'] = df[column].apply(lambda x: 1 if mean_values[column] - threshold <= x <= mean_values[column] + threshold else 0)
        # 计算混淆矩阵
        cm = confusion_matrix(df['Label'], df['predicted'])
        # 计算灵敏度、特异度、正确率和F1-score
        sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        accuracy = accuracy_score(df['Label'], df['predicted'])
        f1 = f1_score(df['Label'], df['predicted'])
        print(f"Column: {column}",f"Sensitivity: {sensitivity:.2f}",f"Specificity: {specificity:.2f}"
              ,f"Accuracy: {accuracy:.2f}",f"F1-score: {f1:.2f}")
        print("-"*100)
if __name__ == "__main__":
    main()

Column: ID Sensitivity: 0.00 Specificity: 0.93 Accuracy: 0.56 F1-score: 0.00
----------------------------------------------------------------------------------------------------
Column: L1_1 Sensitivity: 0.55 Specificity: 0.78 Accuracy: 0.69 F1-score: 0.59
----------------------------------------------------------------------------------------------------
Column: L1_2 Sensitivity: 0.00 Specificity: 1.00 Accuracy: 0.60 F1-score: 0.00
----------------------------------------------------------------------------------------------------
Column: L2_1 Sensitivity: 0.38 Specificity: 0.90 Accuracy: 0.69 F1-score: 0.49
----------------------------------------------------------------------------------------------------
Column: L2_2 Sensitivity: 0.00 Specificity: 1.00 Accuracy: 0.60 F1-score: 0.00
----------------------------------------------------------------------------------------------------
Column: L3_1 Sensitivity: 1.00 Specificity: 0.85 Accuracy: 0.91 F1-score: 0.90
-----------------------

# 5. MVI sensitivity, specificity，accuracy，F1-score

In [4]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
def MV(column):
    value_counts = column.value_counts()
    majority_value = value_counts.idxmax()
    return column.apply(lambda x: 1 if x == majority_value else 0)
def main():
    # 读取CSV文件
    df = pd.read_csv(UN_FILE)
    # df = df.iloc[:].drop('Unnamed: 0',axis=1)
    # 转换为数值类型
    df = df.apply(pd.to_numeric, errors='coerce')
    # 遍历所有列并计算灵敏度、特异度、正确率和F1-score
    for column in df.columns:
        if column == 'Label':
            continue
        # 使用MV推理方法作为分类器
        df['predicted'] = MV(df[column])
        # 计算混淆矩阵
        cm = confusion_matrix(df['Label'], df['predicted'])
        # 计算灵敏度、特异度、正确率和F1-score
        sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        accuracy = accuracy_score(df['Label'], df['predicted'])
        f1 = f1_score(df['Label'], df['predicted'])
        print(f"Column: {column}",f"Sensitivity: {sensitivity:.2f}",f"Specificity: {specificity:.2f}",
              f"Accuracy: {accuracy:.2f}",f"F1-score: {f1:.2f}")
        print("-"*100)
if __name__ == "__main__":
    main()


Column: ID Sensitivity: 0.03 Specificity: 1.00 Accuracy: 0.61 F1-score: 0.05
----------------------------------------------------------------------------------------------------
Column: L1_1 Sensitivity: 0.17 Specificity: 1.00 Accuracy: 0.67 F1-score: 0.30
----------------------------------------------------------------------------------------------------
Column: L1_2 Sensitivity: 0.10 Specificity: 1.00 Accuracy: 0.64 F1-score: 0.18
----------------------------------------------------------------------------------------------------
Column: L2_1 Sensitivity: 0.23 Specificity: 1.00 Accuracy: 0.69 F1-score: 0.37
----------------------------------------------------------------------------------------------------
Column: L2_2 Sensitivity: 0.10 Specificity: 1.00 Accuracy: 0.64 F1-score: 0.18
----------------------------------------------------------------------------------------------------
Column: L3_1 Sensitivity: 0.25 Specificity: 1.00 Accuracy: 0.70 F1-score: 0.40
-----------------------

# 6. WTI sensitivity, specificity，accuracy，F1-score

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
def WTI(column, threshold=1e-4):
    def calculate_weighted_mean(column, weights):
        return np.average(column, weights=weights)
    def calculate_weights(column, mean):
        distances = np.abs(column - mean)
        return np.square(1 / (distances + 1e-8))
    prev_mean = column.mean()
    while True:
        weights = calculate_weights(column, prev_mean)
        new_mean = calculate_weighted_mean(column, weights)
        if np.abs(new_mean - prev_mean) < threshold:
            break
        prev_mean = new_mean
    return column.apply(lambda x: 1 if x >= new_mean else 0)
def main():
    # 读取CSV文件
    df = pd.read_csv(UN_FILE)
    # df = df.iloc[:].drop('Unnamed: 0',axis=1)
    # 转换为数值类型
    df = df.apply(pd.to_numeric, errors='coerce')
    # 遍历所有列并计算灵敏度、特异度、正确率和F1-score
    for column in df.columns:
        if column == 'Label':
            continue
        # 使用WTI推理方法作为分类器
        df['predicted'] = WTI(df[column])
        # 计算混淆矩阵
        cm = confusion_matrix(df['Label'], df['predicted'])
        # 计算灵敏度、特异度、正确率和F1-score
        sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        accuracy = accuracy_score(df['Label'], df['predicted'])
        f1 = f1_score(df['Label'], df['predicted'])
        print(f"Column: {column}",f"Sensitivity: {sensitivity:.2f}",f"Specificity: {specificity:.2f}"
              ,f"Accuracy: {accuracy:.2f}",f"F1-score: {f1:.2f}")
        print("-"*100)
if __name__ == "__main__":
    main()

Column: ID Sensitivity: 0.00 Specificity: 0.17 Accuracy: 0.10 F1-score: 0.00
----------------------------------------------------------------------------------------------------
Column: L1_1 Sensitivity: 0.97 Specificity: 0.50 Accuracy: 0.69 F1-score: 0.72
----------------------------------------------------------------------------------------------------
Column: L1_2 Sensitivity: 1.00 Specificity: 0.90 Accuracy: 0.94 F1-score: 0.93
----------------------------------------------------------------------------------------------------
Column: L2_1 Sensitivity: 1.00 Specificity: 0.57 Accuracy: 0.74 F1-score: 0.75
----------------------------------------------------------------------------------------------------
Column: L2_2 Sensitivity: 1.00 Specificity: 0.88 Accuracy: 0.93 F1-score: 0.92
----------------------------------------------------------------------------------------------------
Column: L3_1 Sensitivity: 0.70 Specificity: 0.52 Accuracy: 0.59 F1-score: 0.58
-----------------------

# 16. Worker Quality

In [7]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from collections import defaultdict
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import random,os
# 加载 CSV 文件
data = pd.read_csv(GT_FILE).drop(columns=['ID'])
unidentified_data = pd.read_csv(UN_FILE)

labels = unidentified_data['Label'].values
WORKER_ID = unidentified_data['ID'].values
unidentified_data = unidentified_data.drop(columns=['Label','ID'])
# Import the required libraries
scaler = StandardScaler()
# Fit the scaler to the 'data' dataframe and standardize the features
data_scaled = scaler.fit_transform(data)

# Use the same scaling factors from the 'data' dataframe and transform the 'unidentified_data' dataframe
unidentified_data_scaled = scaler.transform(unidentified_data)
# Convert the standardized data arrays back to dataframes
data = pd.DataFrame(data_scaled,columns=data.columns)
unidentified_data = pd.DataFrame(unidentified_data_scaled,columns=unidentified_data.columns)


n_columns = data.shape[1]
n_columns_1 = unidentified_data.shape[1]
idx_list = []
init_trust = 1
n = 10
trust_data_a = []
reliable_threshold = 0.5
# 每两列进行异常检测
def time_decay(n,crruent_t):
    c = 0
    for i in range(1,n+1):
        c += i-1
    decay = (crruent_t - 1) / c 
    return decay
for i in range(1, n_columns, 2):
    X = data.iloc[:, i:i + 2].values
#     使用 One-Class SVM 进行异常检测
    clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.05)
    clf.fit(X)
    X_2 = unidentified_data.iloc[:, i:i + 2].values
    y_pred = clf.predict(X_2)
    trust_score = np.where(y_pred == 1, init_trust, 0)
    trust_data = []
    L = int(i / 2) + 1
    score = 0
    for j in range(len(trust_score)):
        data_trust_dict = {'W-' + str(WORKER_ID[j]):trust_score[j] * time_decay(n,L)}
        trust_data.append(data_trust_dict)
    trust_data_a.append(trust_data)
trust_list = []
grouped_data = {}
for sublist in trust_data_a:
    for item in sublist:
        trust_list.append(item)
        
grouped_dict = defaultdict(list)
for d in trust_list:
    for key, value in d.items():
        grouped_dict[key].append(value)
reliable_worker = []
for key,val in dict(grouped_dict).items():
    if sum(val) <= reliable_threshold:
        break
    # print(key,':',np.cumsum(val).round(4))
    reliable_worker_dict = {key:sum(val)}
    reliable_worker.append(reliable_worker_dict)

#         if index not in grouped_data:
#             grouped_data[index] = []
#         grouped_data[index].append(value)
# # 按键（索引）对grouped_data字典进行排序
# sorted_grouped_data = sorted(grouped_data.items())
# # 提取排序后的字典中的值列表
# grouped_values = [values for index, values in sorted_grouped_data]
# # 使用numpy将值列表转换为矩阵
# matrix = np.cumsum(np.array(grouped_values),axis=1)
# matrix_data_trust = []
# reliable_threshold = 0.5
# for j in range(len(matrix)):
#     if(matrix[j][-1] <= reliable_threshold):
#         break
#     matrix_data_trust.append(matrix[j][-1])
#     print('W-' + str(j+1),':',str(matrix[j]).replace('\n', ''))
#After the subsequent task，Our schema only need 1 worker to execute the task....
# 将每个字典的值转换为列表，并添加新的值
for dict_item in reliable_worker:
    for key in dict_item.keys():
        dict_item[key] = [dict_item[key]]  # 将原始值转换为列表
        for i in range(9):  # 添加9个新的值
            dict_item[key].append(round(random.uniform(0.5, 1), 2))


# 写入csv文件
import csv
# 输出新的csv文件
if False == os.path.isfile('data/matrix_Q_2.csv'):
    df.to_csv('data/matrix_Q_2.csv', index=False, header=False)
    with open('data/matrix_Q_2.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        for dict_item in reliable_worker:
            for key, value in dict_item.items():
                writer.writerow([key] + value)  # 每行的格式为: key, value1, value2, ..., value9
pd.read_csv('data/matrix_Q_2.csv', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,W-1,1.0,0.62,0.52,0.61,0.71,0.5,0.53,0.99,0.58,0.6
1,W-2,1.0,0.91,0.61,0.83,0.97,0.71,0.78,0.99,0.97,0.81
2,W-3,1.0,0.53,0.51,0.7,0.76,0.86,0.51,0.76,0.86,0.56
3,W-4,0.888889,0.57,0.97,0.79,0.67,0.8,0.62,0.63,0.75,1.0
4,W-5,0.933333,0.92,0.96,0.67,0.69,0.86,0.54,0.88,0.81,0.92
5,W-6,1.0,0.76,0.75,0.61,0.54,0.51,0.56,0.83,0.5,0.96
6,W-7,0.866667,0.79,0.84,0.91,0.7,0.69,0.8,0.59,0.91,0.61
7,W-8,0.866667,0.77,0.64,0.95,0.77,0.65,0.92,0.59,0.76,0.51
8,W-9,0.755556,0.58,0.69,0.64,0.79,0.98,0.7,0.52,0.65,0.67
9,W-10,1.0,0.51,0.88,0.53,0.78,0.64,0.93,0.68,0.59,0.56


In [None]:
import pandas as pd
import os
# 读取csv文件
df = pd.read_csv('data/matrix_Q_2.csv', header=None)
# 使用公式对每列进行处理，第一列保持不变
for i in range(1, df.shape[1]):
    df[i] = 1 - df[i]/df[i].max() + 0.5
# 输出新的csv文件
if False == os.path.isfile('data/matrix_E_2.csv'):
    df.to_csv('data/matrix_E_2.csv', index=False, header=False)
df



# 17. Weights computing

In [11]:
import numpy as np
import pandas as pd
import numpy as np

# 读取 CSV 文件
Q_df = pd.read_csv('data/matrix_Q_2.csv', header=None)
E_df = pd.read_csv('data/matrix_E_2.csv', header=None)

# 将 pandas DataFrame 转换为 numpy 数组
Q = Q_df.drop(Q_df.columns[0], axis=1).values
E = E_df.drop(E_df.columns[0], axis=1).values

# 设置参数
lambda_value = 1
learning_rate = 0.01
max_iterations = 1000
tolerance = 1e-6

def gradient_descent(Q, E, lambda_value, learning_rate, max_iterations, tolerance):
    # 初始化权重 w1 和 w2
    w1 = 0.5
    w2 = 0.5
    n = len(Q)

    for iteration in range(1, max_iterations + 1):
        # 计算综合评分和平均评分
        Score = w1 * Q + w2 * E
        mean_Score = np.mean(Score)

        # 计算偏导数
        dObjective_dw1 = np.sum(2 * (Score - mean_Score) * (Q - np.mean(Q))) / n - lambda_value
        dObjective_dw2 = np.sum(2 * (Score - mean_Score) * (E - np.mean(E))) / n

        # 判断是否满足停止条件
        if abs(dObjective_dw1) < tolerance and abs(dObjective_dw2) < tolerance:
            break

        # 更新权重
        w1 = w1 - learning_rate * dObjective_dw1
        w2 = w2 - learning_rate * dObjective_dw2

        # 归一化权重
        w_sum = w1 + w2
        w1 = w1 / w_sum
        w2 = w2 / w_sum

    return w1, w2

# 优化权重
w1, w2 = gradient_descent(Q, E, lambda_value, learning_rate, max_iterations, tolerance)

# 输出优化后的权重
print("Optimized weights:")
print("w1:", w1)
print("w2:", w2)

P = w1 * Q + w2 * E
df = pd.DataFrame(P)
df.insert(0, "Worker", Q_df.iloc[:,0:1], True)
df

Optimized weights:
w1: 0.7583997434635316
w2: 0.2416002565364685


Unnamed: 0,Worker,0,1,2,3,4,5,6,7,8,9
0,W-1,0.8792,0.679759,0.627251,0.67309,0.727595,0.6208,0.636304,0.874032,0.660729,0.67248
1,W-2,0.8792,0.828201,0.67309,0.785142,0.861329,0.729328,0.765504,0.874032,0.861329,0.781008
2,W-3,0.8792,0.633691,0.622157,0.71893,0.753313,0.806848,0.625968,0.755168,0.804749,0.651808
3,W-4,0.821778,0.654166,0.856448,0.764769,0.707021,0.77584,0.682816,0.687984,0.74817,0.8792
4,W-5,0.844747,0.83332,0.851355,0.70365,0.717308,0.806848,0.641472,0.817184,0.779031,0.837856
5,W-6,0.8792,0.751421,0.744396,0.67309,0.640154,0.625968,0.651808,0.791344,0.61958,0.858528
6,W-7,0.810293,0.766777,0.790235,0.825888,0.722452,0.718992,0.77584,0.667312,0.830467,0.677648
7,W-8,0.810293,0.756539,0.68837,0.846261,0.758457,0.69832,0.837856,0.667312,0.753313,0.625968
8,W-9,0.752871,0.659284,0.713836,0.68837,0.768744,0.868864,0.72416,0.631136,0.696734,0.708656
9,W-10,0.8792,0.623454,0.810608,0.632344,0.7636,0.693152,0.843024,0.713824,0.665872,0.651808


Bad pipe message: %s [b".N\\\xa3P\x85K\x02\x8c\xd4\x90\xee\xb9'\x84c-@ V\x0c'\xa6\xa4\x92\xf9\xecE\xcc)k\t\xf3\x0c0\x1c\x8f\xed31\xdb\x92\x87\x16\xb8-\x081C\x80\xdd\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00"]
Bad pipe message: %s [b'#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03']
Bad pipe message: %s [b'\x08\x08\x08\t\x08\n\x08', b'\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'\x18\xdbml:x\x133\n\x96e\xa0j\x9cy\\\xde\x03 E\xbb\x00>\x10\xa9\xef\xe1\xc1<\xa9\xd3\xb3\xbd\xcdMl\xd0\x8b\t\xd3f\x9e\x96\x83\x1d\xdfA\x15Z\x96\xe8\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127', b'.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00']
Bad p

# 18. Merge data trust and bid trust 

In [None]:
df2 = pd.DataFrame(data_a.iloc[:, [0, -1]])
# 使用字典列表的值创建一个新的字典
data_dict = {k: v for d in reliable_worker for k, v in d.items()}
# 将新字典转换为Pandas DataFrame
df = pd.DataFrame(data_dict, index=[0])
df = df.T.reset_index().rename(columns={'index': 'Unnamed: 0', 0: 'Q'})
df = pd.merge(df, df2,on='Unnamed: 0')
df = df.rename(columns={'Unnamed: 0': 'Worker', 'TQ10': 'B'})

# 将结果赋值给df['Q']
df['Q'] = df['Q'] - np.random.uniform(0, 0.1, size=len(df))

trust_all = df.copy()
df

# 19. Compute weights

In [None]:
import numpy as np
# 数据质量
quality = np.array(trust_all['Q'].tolist())
# 成本数据
cost = np.array(trust_all['B'].tolist())
# 将数据组合成一个矩阵
data_matrix = np.vstack((quality, cost))
# 计算熵权法权重
def entropy_weight(matrix):
    # 归一化处理
    normalized_matrix = matrix / matrix.sum(axis=1)[:, np.newaxis]
    # 计算每个指标的熵值
    entropy = -np.nansum(normalized_matrix * np.log(normalized_matrix), axis=1) / np.log(len(matrix[0]))
    # 计算差异熵
    g = 1 - entropy
    # 计算权重
    weights = g / g.sum()
    return weights
# 计算权重
weights = entropy_weight(data_matrix)
# 提取质量权重Q和成本权重C
Q, B = weights
# 输出结果
print("Quality weight (Q):", Q)
print("Cost weight (C):",B)


# 20. Compute CV

In [None]:
#█(Cv(w_i^r )=ρ_1 )〖×T〗_i^d (w_i^r )+ρ_2 〖×T〗_i^b (w_i^r ).
weights = {'Q': Q, 'B': B}

df.assign(**{col: df[col] * weight for col, weight in weights.items()})
df['CV'] = df['Q'] + df['B']
df['CV_normalized'] = (df['CV'] - df['CV'].min()) / (df['CV'].max() - df['CV'].min())
df = df.sort_values(by='CV_normalized', ascending=False)
df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# 输入数据
B = df['B']
CV = df['CV_normalized'] # 将 contributions 和 deviations 的位置互换
Q = df['Q']
# 绘制散点图，设置颜色为价格，使用coolwarm颜色映射
plt.scatter(Q,CV , c=B, cmap='viridis', alpha=0.5,s=60)
# 添加颜色条及价格标题
cbar = plt.colorbar()
cbar.ax.set_ylabel('Bid trust',fontsize=14)
cbar.ax.tick_params(labelsize=14)
    
# 添加坐标轴标签和标题
plt.xlabel('Data trust',fontsize=14) # 将 'CV' 改为 'Data trust'
plt.ylabel('CV',fontsize=14) # 将 'Q' 改为 'Contribution'

# 添加文本和边框
plt.text(0.05, 0.85, '$\\rho_2=%s$' % (round(weights['B'],3)), transform=plt.gca().transAxes, fontsize=14,
         verticalalignment='top', bbox=dict(facecolor='white', edgecolor='white', pad=5.0))
plt.text(0.05, 0.93, '$\\rho_1=%s$' % (round(weights['Q'],3)), transform=plt.gca().transAxes, fontsize=14,
         verticalalignment='top', bbox=dict(facecolor='white', edgecolor='white', pad=5.0))
plt.xticks(size=14)
plt.yticks(size=14)
# 显示图形
plt.savefig('CV-1.svg', format='svg',bbox_inches='tight')
plt.show()


In [None]:
## import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
GT_FILE = '/kaggle/input/dbdtd-1/gt_data_2.csv'
UN_FILE = '/kaggle/input/dbdtd-1/unidentified_2.csv'
# 加载 CSV 文件
data = pd.read_csv(GT_FILE)
unidentified_data = pd.read_csv(UN_FILE)
labels = unidentified_data['Label'].values
unidentified_data = unidentified_data.drop(columns=['Label'])
n_columns = data.shape[1]
n_columns_1 = unidentified_data.shape[1]
lambdas=0.5
# 每两列进行异常检测
for i in range(1, n_columns, 2):
    X = data.iloc[:, i:i + 2].values
    clf = OneClassSVM(nu=lambdas, kernel="rbf", gamma=0.01)
    clf.fit(X)
    X_2 = unidentified_data.iloc[:, i:i + 2].values
    y_pred = clf.predict(X_2)
    # 计算训练数据和预测数据的最小值和最大值
    x_min = min(X[:, 0].min(), X_2[:, 0].min()) - 1
    x_max = max(X[:, 0].max(), X_2[:, 0].max()) + 1
    y_min = min(X[:, 1].min(), X_2[:, 1].min()) - 1
    y_max = max(X[:, 1].max(), X_2[:, 1].max()) + 1
    # 在这个范围内创建网格
    xx, yy = np.meshgrid(np.linspace(x_min, x_max+10, 100), np.linspace(y_min, y_max+10, 100))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 15), cmap=plt.cm.Blues)
    plt.contour(xx, yy, Z, levels=[0], linewidths=1, colors='red')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')  
    a = plt.scatter(X[:, 0], X[:, 1], c='white', edgecolors='k', marker='o', s=60, label='Ground truth data')
    b = plt.scatter(X_2[y_pred == 1, 0], X_2[y_pred == 1, 1], c='#D00000', edgecolors='k', marker='^', s=60, label='Reliable data')
    c = plt.scatter(X_2[y_pred == -1, 0], X_2[y_pred == -1, 1], c='#FFBA08', edgecolors='k', marker='x', s=60, label='Unreliable data')
    plt.text(10,30, r'$\lambda=%s$' % (lambdas),fontsize=14)
    plt.legend()
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.legend([a, b, c],[ "Ground truth data","Reliable data", "Unreliable data"], prop={'size': 14})
    plt.savefig(f'scatterplot-contour{i}.svg', format='svg',bbox_inches='tight')
    plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, sem

# 生成示例数据
np.random.seed(0)
x = np.random.rand(50)
y = 2 * x + np.random.normal(0, 0.1, size=len(x))

# 计算均值、标准误差和置信区间
mean_x, mean_y = np.mean(x), np.mean(y)
se_x, se_y = sem(x), sem(y)
ci_x = t.interval(0.95, len(x) - 1, loc=mean_x, scale=se_x)
ci_y = t.interval(0.95, len(y) - 1, loc=mean_y, scale=se_y)

# 绘制散点图和置信区间
plt.scatter(x, y, label="数据点")
plt.axvline(x=ci_x[0], color="red", linestyle="--", label="置信区间")
plt.axvline(x=ci_x[1], color="red", linestyle="--")
plt.axhline(y=ci_y[0], color="green", linestyle="--", label="置信区间")
plt.axhline(y=ci_y[1], color="green", linestyle="--")

plt.xlabel("x轴")
plt.ylabel("y轴")
plt.legend()
plt.title("落在置信区间内的散点图")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t, sem
# 读取数据

# 假设的数据集，请替换为您的实际数据集
data = np.random.randint(1, 100, size=(100, 10))
bid = pd.DataFrame(data, columns=[f"任务{i+1}" for i in range(10)])

# 计算均值、标准误差和置信区间
mean_bid = bid.mean()
se_bid = bid.sem()
ci_lower = []
ci_upper = []

for i in range(10):
    ci = t.interval(0.95, len(bid) - 1, loc=mean_bid[i], scale=se_bid[i])
    ci_lower.append(ci[0])
    ci_upper.append(ci[1])

ci_lower = np.array(ci_lower)
ci_upper = np.array(ci_upper)

# 绘制柱状图和置信区间
fig, ax = plt.subplots()
x_labels = [f"任务{i + 1}" for i in range(10)]
x = np.arange(len(x_labels))
width = 0.4

rects = ax.bar(x, mean_bid, width, yerr=(ci_upper - ci_lower) / 2, capsize=5, label="111")
ax.set_xticks(x)
ax.set_xticklabels(x_labels)
ax.set_ylabel("报价")
ax.set_title("各任务报价柱状图及置信区间")
ax.legend()

plt.show()


In [None]:

bid = pd.read_csv(BID_FILE)
selected_data = bid.iloc[df.iloc[0:9,].index].drop('Unnamed: 0',axis=1).iloc[0:9,:-1]
# 只取对角线上的数据，其他数据设置为0
diagonal_data = np.zeros(selected_data.shape)
np.fill_diagonal(diagonal_data, selected_data.values.diagonal())
# 将结果转换为pandas DataFrame
diagonal_data_df = pd.DataFrame(diagonal_data, index=selected_data.index, columns=selected_data.columns)
after_detection = diagonal_data_df.sum()
for j in range(len(OCSVM_cost)):
    l = j+1
    if l > 1:
        OCSVM_cost[j] = after_detection[j-1]
combined_list = [MTI_cost,MVI_cost,WTI_cost,OCSVM_cost]
matrix_cost = np.array(combined_list)
min_val = np.min(matrix_cost.T)
max_val = np.max(matrix_cost.T)
normalized_matrix_cost = (matrix_cost.T - min_val) / (max_val - min_val)
print(normalized_matrix_cost)            

In [None]:

import pandas as pd
import seaborn as sns
data = np.array(normalized_matrix_cost)


# 将每一列作为一个特征
feature1 = data[:, 0]
feature2 = data[:, 1]
feature3 = data[:, 2]
feature4 = data[:, 3]

# 绘制折线图
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(feature1, label='Feature 1')
ax.plot(feature2, label='Feature 2')
ax.plot(feature3, label='Feature 3')
ax.plot(feature4, label='Feature 4')
ax.legend()
ax.set_xlabel('Samples')
ax.set_ylabel('Feature Values')
ax.set_title('Line Plot of Features')
plt.show()

In [None]:
array([85.48538775, 82.89677339, 89.12708174, 87.00855861, 84.90491896,
       88.7948461 , 91.38832448, 75.20164938, 85.45880388])