In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from tqdm import tqdm


# Read only the first 5,000 rows of the merged workbook.
data_path = 'merged_data.xlsx'
df = pd.read_excel(data_path, nrows=5000)

# Preview the first rows, then report the number of missing values per column.
print(df.head())
print(df.isna().sum())

# Drop every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# Regression target and the code columns used as model inputs.
target = 'FI_score'
features = [
    'icd_code_diagnoses',
    'ndc_prescriptions',
    'icd_code_procedures',
    'itemid',
]

# Cast each feature column to str so the downstream encoder treats all of them
# as categorical labels (some are read from Excel as numeric).
for feature in tqdm(features, desc="Converting data types"):
    df[feature] = df[feature].astype(str)


# One-hot encode the categorical code columns into a dense array.
# handle_unknown='ignore' lets the fitted encoder transform categories it has
# not seen during fitting instead of raising (see scikit-learn docs).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[features])

# Reuse df's index for the encoded frame. After dropna() the labels of df are
# no longer 0..n-1, so a default RangeIndex here would leave the features and
# df[target] aligned only by position; sharing the index keeps them aligned by
# label as well, which makes any later join/concat/comparison safe.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(features),
    index=df.index,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_df,
    df[target],
    test_size=0.2,
    random_state=42,
)

# Define the model: a support vector regressor with a linear kernel.
model = SVR(kernel='linear', C=0.1, cache_size=1000)

# Train on the training split, then score the held-out test split.
y_pred = model.fit(X_train, y_train).predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Support Vector Regression - MSE: {mse}, R^2: {r2}")


# Destination files for the fitted regressor and the fitted one-hot encoder.
# Both are needed together at inference time: the encoder defines the feature
# columns the regressor was trained on.
model_path = 'best_model_SVR.pkl'
encoder_path = 'saved_encoder.pkl'

# Serialize the regressor.
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# Serialize the encoder.
joblib.dump(encoder, encoder_path)
print(f"Encoder saved to {encoder_path}")


    hadm_id icd_code_diagnoses  ndc_prescriptions icd_code_procedures  itemid  \
0  25860671               4240       5.539001e+10                3893   51006   
1  26133978            Z6826         9.046573e+08                 NaN   51678   
2  26184834            T45515A       9.003928e+06             0BH17EZ   51275   
3  23581541            V8538         4.091762e+08                3612   51301   
4  20345487               7850       1.820508e+08                3491   50831   

   FI_score  
0  0.222222  
1  0.444444  
2  0.277778  
3  0.388889  
4  0.361111  
hadm_id                  0
icd_code_diagnoses       1
ndc_prescriptions       18
icd_code_procedures    530
itemid                   0
FI_score                 0
dtype: int64


Converting data types: 100%|██████████| 4/4 [00:00<00:00, 801.82it/s]


Support Vector Regression - MSE: 0.009861109419326131, R^2: -0.025012940655109617
Model saved to best_model_SVR.pkl
Encoder saved to saved_encoder.pkl


In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.svm import SVR


# Locations of the previously persisted regressor and one-hot encoder.
# NOTE(review): the training cell in this notebook writes 'best_model_SVR.pkl',
# not 'best_model_SVR_rbf.pkl' - confirm the file loaded here is produced
# elsewhere and was trained with the same encoder.
model_path = 'best_model_SVR_rbf.pkl'
encoder_path = 'saved_encoder.pkl'

# joblib.load unpickles the files, so only load artifacts from a trusted source.
model = joblib.load(model_path)
encoder = joblib.load(encoder_path)

# Regression target and the code columns used as model inputs.
target = 'FI_score'
features = [
    'icd_code_diagnoses',
    'ndc_prescriptions',
    'icd_code_procedures',
    'itemid',
]

# Read the first 10,000 rows and drop every row that has a missing value.
data_path = 'merged_data.xlsx'
df = pd.read_excel(data_path, nrows=10000).dropna()

# Cast the feature columns to str before they are passed to the encoder.
df = df.astype(dict.fromkeys(features, str))

# Draw a reproducible 1,000-row sample, then build the target vector and the
# encoded feature matrix from it.
sample_df = df.sample(n=1000, random_state=42)
y = sample_df[target].values
encoded_features = encoder.transform(sample_df[features])

# 5-fold cross-validation over the sampled rows (shuffled, fixed seed).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics, collected column-wise so they convert directly to a DataFrame.
cv_results = {'Fold': [], 'MSE': [], 'RMSE': [], 'MAE': []}

for fold, (train_index, val_index) in enumerate(kf.split(encoded_features), start=1):
    X_train, X_val = encoded_features[train_index], encoded_features[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Fit an unfitted copy that shares the loaded model's hyper-parameters.
    # Calling model.fit() here would overwrite the loaded estimator, and the
    # predictions exported below would then come from whichever fold ran last.
    fold_model = clone(model)
    fold_model.fit(X_train, y_train)

    y_val_pred = fold_model.predict(X_val)

    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_val_pred)

    cv_results['Fold'].append(fold)
    cv_results['MSE'].append(mse)
    cv_results['RMSE'].append(rmse)
    cv_results['MAE'].append(mae)

cv_results_df = pd.DataFrame(cv_results)
print(cv_results_df)

# Average each metric over the folds.
avg_mse = np.mean(cv_results['MSE'])
avg_rmse = np.mean(cv_results['RMSE'])
avg_mae = np.mean(cv_results['MAE'])

print("\nAverage performance metrics across all folds:")
print(f"Average MSE: {avg_mse:.4f}")
print(f"Average RMSE: {avg_rmse:.4f}")
print(f"Average MAE: {avg_mae:.4f}")

# Score every sampled row with the loaded model, which the cross-validation
# above left untouched, and attach the predictions to the sample.
fi_scores_pred = model.predict(encoded_features)
sample_df['Predicted_FI_Score'] = fi_scores_pred

# Create the output folder if needed; exist_ok avoids the check-then-create race.
folder_path = 'Untitled Folder'
os.makedirs(folder_path, exist_ok=True)

# Write the sampled rows together with their predictions.
output_path = os.path.join(folder_path, 'predicted_FI_scores.csv')
sample_df.to_csv(output_path, index=False)

print(f"Predicted FI scores saved to: {output_path}")


   Fold       MSE      RMSE       MAE
0     1  0.009693  0.098453  0.078356
1     2  0.008464  0.092001  0.072450
2     3  0.008930  0.094498  0.076373
3     4  0.010632  0.103112  0.081650
4     5  0.010109  0.100543  0.080617

Average performance metrics across all folds:
Average MSE: 0.0096
Average RMSE: 0.0977
Average MAE: 0.0779
Predicted FI scores saved to: Untitled Folder/predicted_FI_scores.csv
