In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib
import warnings 
warnings.filterwarnings ('ignore') 

In [2]:
# Read the raw CSV into memory. low_memory=False makes pandas parse the file
# in one pass instead of in chunks.
df_train_result = pd.read_csv(
    'PE_original_data.csv',
    encoding="utf-8",
    low_memory=False,
)

In [3]:
def _as_column_list(cols):
    """Return ``cols`` as a list: ``None`` -> ``[]``, a single name -> ``[name]``."""
    if cols is None:
        return []
    if isinstance(cols, str):
        return [cols]
    return list(cols)


def _dense_one_hot_encoder():
    """Build a dense-output ``OneHotEncoder`` on old and new scikit-learn alike.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the current keyword is tried first.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # scikit-learn < 1.2 only understands the legacy keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def preprocess_data(data, numeric_cols, excluded_cols):
    """Impute, scale and encode a raw DataFrame.

    The columns of ``data`` are split into three groups:

    * ``numeric_cols`` -- missing values are replaced by the column mean and
      the result is min-max scaled.
    * ``excluded_cols`` -- copied to the output unchanged.
    * every remaining column -- treated as categorical. Missing values are
      replaced by ``-1``; a column with more than two distinct values is
      one-hot encoded into ``"<column>_<category>"`` columns, while a column
      with at most two distinct values is kept as a single column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame. It is not modified; its index is discarded.
    numeric_cols : list of str
        Names of the columns to impute with the mean and scale.
    excluded_cols : list of str or None
        Names of the columns to pass through untouched. ``None`` or an empty
        list means no column is excluded.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns, then the categorical columns (encoded or
        passed through, in their original order), then the excluded columns.

    Raises
    ------
    ValueError
        If a numeric column contains only missing values. The mean imputer
        would drop such a column, which previously surfaced as an opaque
        shape-mismatch error when rebuilding the frame.

    Notes
    -----
    Every imputer, scaler and encoder is fitted on ``data`` itself and then
    discarded, so the learned statistics cannot be re-applied to other data.
    """
    numeric_cols = _as_column_list(numeric_cols)
    excluded_cols = _as_column_list(excluded_cols)

    # Anything that is neither numeric nor excluded is handled as categorical.
    handled = set(numeric_cols) | set(excluded_cols)
    categorical_cols = [col for col in data.columns if col not in handled]

    # Every piece built below carries a fresh 0..n-1 index; resetting the
    # input index keeps the final column-wise concat row-aligned.
    data = data.reset_index(drop=True)

    # Pieces are collected here and concatenated once at the end; growing a
    # DataFrame with concat inside the loop re-copies it on every iteration.
    frames = []

    # Numeric columns: mean imputation followed by min-max scaling.
    if numeric_cols:
        if len(data) > 0:
            all_missing = data[numeric_cols].isna().all(axis=0)
            empty_cols = all_missing[all_missing].index.tolist()
            if empty_cols:
                raise ValueError(
                    "Numeric columns contain only missing values and cannot "
                    f"be mean-imputed: {empty_cols}"
                )
        imputed = SimpleImputer(strategy='mean').fit_transform(data[numeric_cols])
        scaled = MinMaxScaler().fit_transform(imputed)
        frames.append(pd.DataFrame(scaled, columns=numeric_cols))

    # Categorical columns: constant imputation, then conditional encoding.
    if categorical_cols:
        filled = SimpleImputer(strategy='constant', fill_value=-1).fit_transform(
            data[categorical_cols]
        )
        categorical_df = pd.DataFrame(filled, columns=categorical_cols)

        encoder = _dense_one_hot_encoder()
        for col in categorical_cols:
            column = categorical_df[[col]]
            if len(categorical_df[col].unique()) > 2:
                # The encoder is refitted per column, so categories_[0] always
                # describes the column currently being encoded.
                one_hot = encoder.fit_transform(column)
                names = [f"{col}_{category}" for category in encoder.categories_[0]]
                frames.append(pd.DataFrame(one_hot, columns=names))
            else:
                # Columns with at most two distinct values are kept as-is.
                frames.append(column)

    if excluded_cols:
        frames.append(data[excluded_cols])

    if not frames:
        # No columns at all: return an empty frame with the same rows.
        return pd.DataFrame(index=data.index)

    return pd.concat(frames, axis=1)

In [4]:
# Columns treated as numeric features. preprocess_data fills their missing
# values with the column mean and min-max scales them; every column of the
# input frame that is listed neither here nor in excluded_cols is handled as
# a categorical variable instead.
numeric_cols = [
    'Vmax_mean', 'Vmin_mean', '就诊年龄', '孕', '月经周期', 'PI_mean', 'SD_mean', 'PT国际标准化比值', 'RBC体积分布宽度CV', 'RBC体积分布宽度SD', 
    'γ-谷氨酰基转移酶', '丙型肝炎抗体', '丙氨酸氨基转移酶', '中核酸浓度网织红细胞百分率', '低密度脂蛋白胆固醇', '低成熟度网织红细胞比率(高)', 
    '低核酸浓度网织红细胞百分率', '促甲状腺激素', '凝血酶时间', '前白蛋白', '单核细胞比率', '单核细胞计数', '单胺氧化酶', '嗜碱性粒细胞比率', 
    '嗜碱性粒细胞计数', '嗜酸性粒细胞比率', '嗜酸性粒细胞计数', '大血小板比率', '尿_酸碱度', '尿比重', '尿素', '尿酸', '平均RBC血红蛋白含量', 
    '平均RBC血红蛋白浓度', '平均红细胞体积', '平均血小板体积', '总胆固醇', '总胆汁酸', '总胆红素', '总蛋白', '活化的部分凝血活酶时间', 
    '淋巴细胞比率', '淋巴细胞计数', '游离三碘甲状原氨酸', '游离甲状腺素', '球蛋白', '甘油三酯', '白蛋白', '白蛋白/球蛋白', '直接胆红素', 
    '碱性磷酸酶', '碳酸氢根(总二氧化碳)', '粒细胞比率', '粒细胞计数', '红细胞压积', '网织红细胞百分率', '网织红细胞绝对值', '胆碱酯酶', 
    '腺苷脱氨酶', '血_白细胞计数', '血_红细胞计数', '血_肌酐', '血_葡萄糖', '血小板体积分布宽度', '血小板比积', '血小板计数', '血浆D—二聚体', 
    '血浆凝血酶原时间', '血浆纤维蛋白原', '血清α-L-岩藻糖苷酶', '血红蛋白', '载脂蛋白AⅠ', '载脂蛋白AⅠ/B', '载脂蛋白B', '钙', '铁蛋白', 
    '门冬氨酸氨基转移酶', '间接胆红素', '高密度脂蛋白胆固醇', '高核酸浓度网织红细胞百分率', '产', '初潮年龄'
]

# Columns that bypass preprocessing and are copied to the output unchanged.
# 'LABEL' is later read as the target vector, and both columns are dropped
# from the feature matrix before the train/test split.
excluded_cols = ['LABEL', 'medical_record_number']

In [5]:
# Run the imputation / scaling / encoding pipeline over the loaded frame.
# NOTE(review): the transformers are fitted on the full dataset here, i.e.
# before the train/test split performed in a later cell.
df_train_result = preprocess_data(
    data=df_train_result,
    numeric_cols=numeric_cols,
    excluded_cols=excluded_cols,
)

In [6]:
# Target vector and feature matrix: the excluded columns (label and record
# identifier) are removed from the features.
y = df_train_result['LABEL']
X = df_train_result.drop(columns=excluded_cols)

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [7]:
from sklearn.linear_model import LassoCV

# Fit a Lasso whose regularisation strength is chosen by 5-fold
# cross-validation on the training split.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

# Column positions whose fitted coefficient is not exactly zero.
important_features = (lasso.coef_ != 0).nonzero()[0]

# Restrict both splits to the selected columns (positional selection).
X_train_important, X_test_important = (
    split.iloc[:, important_features] for split in (X_train, X_test)
)

In [8]:
# Persist each split as a (features, target) tuple in its own pickle file.
joblib.dump(value=(X_train_important, y_train), filename='train_data.pkl')
joblib.dump(value=(X_test_important, y_test), filename='test_data.pkl')

['test_data.pkl']