# PCS

In [None]:
import os
import numpy as np
import pandas as pd

def PCS(save_path, meta_file, feature_file, alpha=0.7):
    """Subtract each plate's scaled mean control profile and save the result.

    For every plate, the mean feature vector of that plate's control wells
    (metadata rows whose ``Compound`` equals ``'DMSO'``) is multiplied by
    ``alpha`` and subtracted from every row belonging to the plate. Rows on
    plates that have no control well are left unchanged. The corrected
    matrix is written as float64 to ``save_path + str(alpha) + '.npy'``.

    Args:
        save_path: Output path prefix; ``str(alpha)`` and ``'.npy'`` are
            appended to it.
        meta_file: CSV file with one row per feature row. Must contain the
            columns ``Metadata_Plate`` and ``Compound``.
        feature_file: ``.npy`` file holding a 2-D numeric array whose rows
            are in the same order as the rows of ``meta_file``.
        alpha: Fraction of the per-plate control mean to subtract.

    Raises:
        ValueError: If the feature array is not 2-D or its row count
            differs from the number of metadata rows.
    """
    meta = pd.read_csv(meta_file)
    features = np.load(feature_file)
    print(features.shape)

    if features.ndim != 2:
        raise ValueError(
            f"expected a 2-D feature array, got shape {features.shape}"
        )
    if len(meta) != features.shape[0]:
        raise ValueError(
            f"metadata has {len(meta)} rows but features has "
            f"{features.shape[0]} rows"
        )

    # Plate IDs stay in their own array instead of being stacked into the
    # feature matrix, so non-numeric plate names cannot corrupt the dtype.
    plates = meta['Metadata_Plate'].to_numpy()

    # Control wells are selected with a boolean mask over the rows.
    is_control = (meta['Compound'] == 'DMSO').to_numpy(dtype=bool)
    # Alternative control selectors used for other datasets:
    # is_control = (meta['Treatment'] == 'NA@NA').to_numpy(dtype=bool)  # for 36
    # is_control = (meta['pert_name'] == 'EMPTY_').to_numpy(dtype=bool)  # for 37
    print(int(is_control.sum()))

    # astype() returns a fresh float64 copy, so the loaded array is not
    # modified and the saved dtype does not depend on the input dtype.
    adjusted = features.astype(np.float64)

    if is_control.any():
        # One mean control profile per plate (index = plate ID).
        control_means = (
            pd.DataFrame(adjusted[is_control])
            .groupby(plates[is_control])
            .mean()
        )

        # Position of each row's plate in control_means; -1 marks plates
        # without any control well, whose rows must stay untouched.
        row_to_control = control_means.index.get_indexer(plates)
        has_control = row_to_control >= 0

        per_row_control = control_means.to_numpy()[row_to_control[has_control]]
        adjusted[has_control] -= per_row_control * alpha

    # Save as a .npy file.
    np.save(save_path + str(alpha) + '.npy', adjusted)
    print(adjusted.shape)

# Call the function; arguments are listed in the same order as the signature.
PCS(
    save_path="PhenoProfiler_Alltrain_22test_",
    meta_file="/data/boom/bbbc022/profiling.csv",
    feature_file="Fig5/BBBC022/PhenoProfiler_Alltrain_22test.npy",  # your file path
    alpha=0.7,
)

(66558, 672)
11519 [99, 100, 101, 102, 103, 104, 105, 106, 107, 108]
(66558, 673)
(11519, 673)
(66558, 672)
