# 查找最小数据集   
### version 1.0   
利用已有的分类数据分别计算各类别的质心，然后计算样本到质心的距离，删除特定距离的样本，查找最小数据集

In [1]:
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import *

## 1. Initialization

In [2]:
min_dataset_threshold = 0.80        # minimum minimum dataset threshold for the dataset
min_dataset_method = 'proportion'         # minimum dataset method:        'first' , 'last' or 'proportion'
taxonomy = 'Berg'                   # choose which taxonomy:        'Berg', 'SEA_AD' or 'Mouse'
cell_subset = 'Superficial'         # choose which cells:           'All', 'Superficial' or 'Deep'
modality = 'Ephys'                  # choose which modality:        'Both', 'Ephys' or 'Morph'
rowfilte = True                     # choose if you want to row filter: True or False
scaler_method = None                # choose your data format:      'standard','minmax','robust','normalizer','quantile'
impute_method = 'knn'                # choose which imputer:         None or 'knn' or 'liner' or 'Polynomial' or 'decisiontree':
oversample_method = 'random'            # choose which oversampler:      None or 'random' or 'smote' or 'smoten' or 'smotenc' or 'borderline' or 'adasyn' or 'svmsmote' or 'kmeans'
# 设置缺省参数
min_cells_per_type = 5              # minimum cells necessary in the training data per group/label/cell-type (e.g. 5 or 10)
null_threshold = 0.2                       # row filter null value threshold
test_size =0.20                     # test size for cross validation (e.g. 0.2)
teacher_model_path = 'models\\Teacher\\'
student_model_path = 'models\\Student\\'
train_dataset = DATA_PATH / Path(f'train_data_{cell_subset}_{modality}_{taxonomy}.csv')
test_dataset = DATA_PATH / Path(f'test_data_{cell_subset}_{modality}_{taxonomy}.csv')

In [3]:
#计算质心
def get_centroid(df):
    '''
    计算质心
    参数:
        df: dataframe,数据样本
    返回值:
        centroid: series,数据样本质心
    '''
    centroid = df.mean()
    return centroid

In [4]:
def get_distance(x, y, method=2):
    '''
    计算两个样本的距离

    参数:
      x: series,第一个样本的坐标
      y: series,第二个样本的坐标
      method: int,1:曼哈顿距离,2:欧氏距离
    返回值:
      distance: float,x到y的距离      
    '''   
    distance = np.power((x-y).abs().pow(method).sum(),1/method)
    return distance

In [5]:
def get_min_dataset(df,min_dataset_threshold,method='first'):
    """
    计算数据集的质心,然后计算各样本到质心的距离,按距离排序,取min_dataset_threshold%的数据
    
    参数:
    df: 数据框，包含要处理的数据
    min_dataset_threshold: 最小数据集的阈值，表示要提取的数据占总数据的百分比
    method: 计算质心的方法，默认值为 'first'，表示取前 min_dataset_threshold% 的数据
                                   'last' 表示取后 min_dataset_threshold% 的数据
                                   'proportion' 表示按比例取数据，min_dataset_threshold 表示要提取的数据占总数据的比例

    返回值:
    df: 处理后的数据框
    """
    # 计算数据集和最小数据集的大小
    df_size=df.shape[0]
    min_dataset_size = round(df_size*min_dataset_threshold)
    # 计算数据的质心
    centroid=get_centroid(df)
    # 计算各样本到质心的距离
    df['distance']=df.apply(lambda x: get_distance(x,centroid),axis=1)
    # 按样本到质心的距离排序
    df.sort_values(by='distance',ascending=True,inplace=True)
    # 删除distance列
    df.drop(columns=['distance'],inplace=True)
    if method=='first':
        # 取前 min_dataset_threshold% 的数据
        end_location=min_dataset_size
        df = df.iloc[:end_location,:]
    elif method=='last':
        # 取后 min_dataset_threshold% 的数据
        start_location=df_size-min_dataset_size
        df=df.iloc[start_location:,:]
    elif method=='proportion':
        # 按比例取数据
        step=round(df_size/(df_size-min_dataset_size))
        drop_list = df.iloc[::step].index
        df.drop(index=drop_list,inplace=True)
    
    return df

## 2. Load dataset
加载数据文件合并数据集,并分离标注数据和非标注数据,如果数据目录中已存在train集和test集数据则直接加载

In [6]:
if train_dataset.is_file() and test_dataset.is_file():
    train_data = pd.read_csv(train_dataset, index_col=0)
    test_data = pd.read_csv(test_dataset, index_col=0)
    print('--- load Train & Test dataset ---')
    print('Train_data:    ', train_data.shape)
    print('Test_data:     ', test_data.shape)

else:
    # Define the path to the files and read in the data
    meta_data_path = Path("..\data\meta_data_withVU.csv")        # read the meta data file
    ephys_path = Path("..\data\ephys_data_withVU.csv")           # read the ephys data file
    morph_path = Path("..\data\morph_data_withVU.csv")           # read the morph data file
    # Read in the data files
    meta_data = pd.read_csv(meta_data_path, index_col=0) 
    ephys_data = pd.read_csv(ephys_path, index_col=0)
    morph_data = pd.read_csv(morph_path, index_col=0)
    print('原始数据文件：')
    print('ephys_dataset:', ephys_data.shape)
    print('morph_dataset:', morph_data.shape)
    labeled_data, unlabeled_data =  load_data_l23_depth_normalized(meta_data, ephys_data, morph_data, taxonomy = taxonomy, modality = modality, cell_subset = cell_subset)
    print('--- Init Setup ---')
    print('Cell_subset: ',cell_subset)
    print('Modality:    ', modality)
    print('Taxonomy:    ', taxonomy)
    print('Labeled_data:    ', labeled_data.shape)
    print('Unlabeled_data:  ', unlabeled_data.shape)

原始数据文件：
ephys_dataset: (522, 35)
morph_dataset: (170, 25)
--- Init Setup ---
Cell_subset:  Superficial
Modality:     Ephys
Taxonomy:     Berg
Labeled_data:     (230, 37)
Unlabeled_data:   (209, 37)


## 3. Data preprocessing   
如果不存在train_data,则进行数据预处理(行过滤、补插缺失值、标准化、数据拆分未训练集和测试集)

In [7]:
if 'train_data' not in locals():

    if rowfilte:
        print('-------------------------------            Dataset Row Filte            -------------------------------')    
        labeled_data = filter(labeled_data,min_cells_per_type=min_cells_per_type, threshold=null_threshold)

    if impute_method:
        print('-------------------------------            Imputing Dataset            -------------------------------')
        print('1. Imputing Labeled Dataset')
        # 不包含标签列进行补插
        print('补插前标记数据集信息:')
        t_type_labels = labeled_data.iloc[:, -1].values
        print(labeled_data.info())
        labeled_data = impute(labeled_data.iloc[:,0:labeled_data.columns.size-1],method=impute_method)
        labeled_data[LABEL_COLUMN] = t_type_labels
        print('补插后标记数据集信息:')
        print(labeled_data.info())
        print('2. Imputing Unlabeled Dataset')
        # 对unlabel_data数据集补插 
        print('补插前未标记数据集信息:')
        print(unlabeled_data.info())
        unlabeled_data = impute(unlabeled_data.iloc[:,0:unlabeled_data.columns.size-1],method=impute_method)
        unlabeled_data[LABEL_COLUMN] = np.nan
        print('补插后未标记数据集信息:')
        print(unlabeled_data.info())

    # 转换为numpy数组
    data_array = labeled_data.iloc[:,0:labeled_data.columns.size-1].values              # transform the data to an array
    # Check if data needs to be scoring
    if scaler_method:
        print('-------------------------------             Scaling Dataset            -------------------------------')
        data_array = scale(data_array, scalemethod = scaler_method)                     # apply scoring method 
    cell_ids_subset = labeled_data.index                                                # extract the cell IDs of the subset
    # Create a list of features
    feature_list = labeled_data.iloc[:,0:labeled_data.columns.size-1].keys().values
    # normalized_depths = labeled_data['L23_depth_normalized'].values                     # extract normalized depth values
    t_type_labels = labeled_data.iloc[:, -1].values                                     # extract t-type labels of the data subset
    t_types_updated = np.unique(t_type_labels)
    print('dataset rownum:',data_array.shape[0])
    print('feature num:',len(feature_list))
    print('feature list:',feature_list)
    print('cell type num:',len(t_types_updated))
    print('cell type list:',t_types_updated)
    # Split the data into a training and test dataset
    print('-------------------------------    Train & Test Dataset Split    -------------------------------')
    X_train, X_test, y_train, y_test = train_test_split(data_array, t_type_labels, test_size=test_size, stratify=t_type_labels, random_state=RANDOM_STATE)

    # 组合X_train数据为dataframe，未包含label列
    train_data= pd.DataFrame(data = X_train, columns = feature_list)
    train_data[LABEL_COLUMN] = y_train
    print('train:',train_data.shape)

    # 测试集
    test_data_nolabel = pd.DataFrame(data = X_test, columns = feature_list)
    test_data = test_data_nolabel.copy()
    test_data[LABEL_COLUMN] = pd.DataFrame(y_test, columns = [LABEL_COLUMN])
    print('test:',test_data.shape)

    # 保存训练集和测试集
    train_data.to_csv(f'..\data\\train_data_{cell_subset}_{modality}_{taxonomy}.csv')
    test_data.to_csv(f'..\data\\test_data_{cell_subset}_{modality}_{taxonomy}.csv')

-------------------------------            Dataset Row Filte            -------------------------------
-------------------------------            Imputing Dataset            -------------------------------
1. Imputing Labeled Dataset
补插前标记数据集信息:
<class 'pandas.core.frame.DataFrame'>
Index: 230 entries, 541549258 to 811939096
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   input_resistance      230 non-null    float64
 1   sag                   230 non-null    float64
 2   VmatSag               230 non-null    float64
 3   vmbaseM               230 non-null    float64
 4   tau                   230 non-null    float64
 5   FAP_rheobase          230 non-null    float64
 6   FAP_num_APs           230 non-null    float64
 7   TS1_rheobase          230 non-null    float64
 8   TS1_num_APs           230 non-null    float64
 9   TS2_rheobase          230 non-null    float64
 10  TS2_num_APs           

## 4. Reduce Dataset   
按类计算各类的质心和每个样本到质心的距离，按照从小到大的顺序排列，按min_dataset_threshold比例过滤每个类样本，并合并未新的训练集。  

In [8]:
new_train_data=None
for group in train_data.groupby(by=[LABEL_COLUMN]):
    df=group[1].iloc[:,0:group[1].columns.size-1]
    print(f'process group:{group[0]}:{df.shape[0]}')
    df=get_min_dataset(df,min_dataset_threshold,method=min_dataset_method)
    df[LABEL_COLUMN]=group[1].iloc[:,-1]
    if new_train_data is None:
        new_train_data=df
    else:
        new_train_data=pd.concat([new_train_data,df])

print(new_train_data[LABEL_COLUMN].value_counts())
print('new_train_data:',new_train_data.shape)
train_data = new_train_data
# 按索引排序,否则训练结果会与原数据集训练结果不一致
train_data.sort_index(inplace=True)

process group:('Exc L2 LAMP5 LTK',):61
process group:('Exc L2-3 LINC00507 FREM3 superficial',):103
process group:('Exc L2-4 LINC00507 GLP2R',):20
label
Exc L2-3 LINC00507 FREM3 superficial    82
Exc L2 LAMP5 LTK                        48
Exc L2-4 LINC00507 GLP2R                16
Name: count, dtype: int64
new_train_data: (146, 37)


## 5. oversample
过采样

In [9]:
y_train = train_data.iloc[:,-1]
train_data = train_data.iloc[:,:train_data.columns.size-1]
feature_list = train_data.columns.tolist()
if oversample_method:
    print('-------------------------------           Dataset Oversample          -------------------------------')
    # 过采样的数据集不能有空值
    # 当程序未设置插值方法,这里使用默认的knn方法,否则使用设定的插值方法
    if impute_method:
        train_data = impute(train_data, method = impute_method, random_state = RANDOM_STATE)
    else:
        train_data = impute(train_data, random_state = RANDOM_STATE)
    # 进行过采样
    X_train, y_train = oversampler(train_data.values, y_train, method = oversample_method)    
    train_data= pd.DataFrame(data = X_train, columns = feature_list)
train_data[LABEL_COLUMN] = y_train
print('oversample train:',train_data.shape)

-------------------------------           Dataset Oversample          -------------------------------
oversample train: (246, 37)


## 6. train model   

In [10]:
#设置训练精度
presets='medium_quality'
eval_metric = 'accuracy'
verbosity = 2
time_limit = 3600
auto_stack=False
# bagging的折数,缺省好像未8
num_bag_folds=5
# stacking的级别
num_stack_levels=2
num_bag_sets=1
# n折交叉验证,设置auto_stack为True和dynamic_stacking为True
dynamic_stacking = False
# n_fold为交叉验证折数,n_repeats为交叉验证重复次数
ds_args = {
    'n_folds': 5,
    'n_repeats': 1,
}
# 模型的保存路径，详细的模型命名规则见：https://auto.gluon.ai/stable/api/autogluon.tabular.models.html
save_path = teacher_model_path + presets + datetime.datetime.now().strftime("-%Y%m%d-%H%M%S")

# 开始训练
predictor = TabularPredictor(label=LABEL_COLUMN, verbosity=verbosity,eval_metric=eval_metric, path=save_path, log_to_file=True,log_file_path='auto')
if auto_stack and dynamic_stacking:
    predictor.fit(train_data, presets=presets, time_limit=time_limit, auto_stack=auto_stack, num_bag_folds=num_bag_folds, 
                        num_stack_levels=num_stack_levels, num_bag_sets=num_bag_sets,dynamic_stacking=dynamic_stacking, ds_args=ds_args)
elif auto_stack and not dynamic_stacking:
    predictor.fit(train_data, presets=presets, time_limit=time_limit, auto_stack=auto_stack, num_bag_folds=num_bag_folds,
                        num_stack_levels=num_stack_levels, num_bag_sets=num_bag_sets)
elif not auto_stack and dynamic_stacking:
    predictor.fit(train_data, presets=presets, time_limit=time_limit, auto_stack=auto_stack, dynamic_stacking=dynamic_stacking, ds_args=ds_args)
else:
    predictor.fit(train_data, presets=presets, time_limit=time_limit)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          8
Memory Avail:       9.16 GB / 15.92 GB (57.5%)
Disk Space Avail:   307.31 GB / 931.51 GB (33.0%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "models\Teacher\medium_quality-20241106-204955"
Train Data Rows:    246
Train Data Columns: 36
Label Column:       label
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	3 unique label values:  ['Exc L2-3 LINC00507 FREM3 superficial', 'Exc L2 LAMP5 LTK', 'Exc L2-4 LINC00507 GLP2R']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       multiclass
Preproce

In [11]:
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)
print("Best Model:", predictor.model_best)
result = predictor.fit_summary(show_plot=True)

AutoGluon infers problem type is:  multiclass
AutoGluon identified the following types of features:
('float', []) : 36 | ['input_resistance', 'sag', 'VmatSag', 'vmbaseM', 'tau', ...]
Best Model: WeightedEnsemble_L2
*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2       0.90    accuracy       0.013357  10.543807                0.000000           0.219320            2       True         14
1        NeuralNetTorch       0.90    accuracy       0.013357  10.324487                0.013357          10.324487            1       True         12
2       NeuralNetFastAI       0.86    accuracy       0.013275   2.551274                0.013275           2.551274            1       True          3
3              LightGBM       0.84    accuracy       0.000000   0.397178                0.000000           0.397178    



## 7. test

In [12]:
from sklearn.metrics import classification_report
y_pred = predictor.predict(test_data.iloc[:,:test_data.columns.size-1])#,model=predictor.model_best)
print(predictor.model_best)
results = pd.concat([y_pred, test_data.iloc[:,-1]], axis=1)
results.columns=['predicted', 'actual']
# 打印每个类的精确度，召回率，F1值, 由于样本量较少,会出现被0除, 计算结果为0，但会出现警告错误：
# UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
# Use `zero_division` parameter to control this behavior._warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
print(classification_report(results['actual'], results['predicted'], target_names=np.unique(test_data.iloc[:,-1])))
# 输出预测值与实际值对比
#results

WeightedEnsemble_L2
                                      precision    recall  f1-score   support

                    Exc L2 LAMP5 LTK       0.44      0.47      0.45        15
Exc L2-3 LINC00507 FREM3 superficial       0.69      0.69      0.69        26
            Exc L2-4 LINC00507 GLP2R       0.75      0.60      0.67         5

                            accuracy                           0.61        46
                           macro avg       0.63      0.59      0.60        46
                        weighted avg       0.62      0.61      0.61        46



In [13]:
leaderboard = predictor.leaderboard(test_data,extra_info=False, silent=True)
print(leaderboard)
pst = presets.split('_')[0]
cv = ''
if auto_stack and dynamic_stacking:
    validation_procedure=ds_args['validation_procedure']
    folds=ds_args['n_folds']
    repeats=ds_args['n_repeats']
    cv =f"-{validation_procedure}{folds}{repeats}"
leaderboard.to_csv(f'mindataset-{min_dataset_method}-{int(min_dataset_threshold*100)}-{modality}-{cell_subset}-{pst}{cv}.csv')

                  model  score_test  score_val eval_metric  pred_time_test  \
0            LightGBMXT    0.739130       0.82    accuracy        0.005002   
1              LightGBM    0.717391       0.84    accuracy        0.002997   
2       NeuralNetFastAI    0.673913       0.86    accuracy        0.025736   
3         LightGBMLarge    0.673913       0.84    accuracy        0.033640   
4      RandomForestGini    0.673913       0.82    accuracy        0.094247   
5        ExtraTreesEntr    0.673913       0.84    accuracy        0.102520   
6      RandomForestEntr    0.673913       0.82    accuracy        0.105177   
7              CatBoost    0.652174       0.82    accuracy        0.007412   
8        ExtraTreesGini    0.630435       0.84    accuracy        0.112040   
9   WeightedEnsemble_L2    0.608696       0.90    accuracy        0.025383   
10       NeuralNetTorch    0.608696       0.90    accuracy        0.025383   
11              XGBoost    0.543478       0.84    accuracy      