In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer
import statsmodels.api as sm 
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

In [None]:
# Unpickle the saved modelling variables (a mapping that holds the predictor name lists).
# NOTE(review): pickle.load executes arbitrary code from the file -- only open trusted files.
with open(r'...', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# Unpickle the preprocessed dataset (the train split is read from it further down).
with open(r"...", 'rb') as pickle_file:
    processed_data = pickle.load(pickle_file)

# pred_features: every predictor name; pred_num: the subset handed to the power transformer.
pred_features, pred_num = data["pred_features"], data["pred_num"]

class CustomPowerTransformer(BaseEstimator, TransformerMixin):
    """Yeo-Johnson power-transform a named subset of columns; pass the rest through.

    The selected columns are transformed with scikit-learn's ``PowerTransformer``
    (``method="yeo-johnson"``, ``standardize=True``); every other column is
    returned unchanged. The output is always a NumPy array, even when a
    DataFrame is passed in.
    """

    def __init__(self, columns, all_features):
        """
        Initialize the CustomPowerTransformer.

        Parameters:
        - columns: list of feature names to power-transform
        - all_features: list of all feature names
        """
        self.columns = columns
        self.all_features = all_features

    def fit(self, X, y=None):
        """
        Fit the PowerTransformer on the columns selected for transformation.

        Column positions are resolved by name: the first ``X.shape[1]`` entries
        of ``all_features`` are taken as the column names of ``X``, so ``X`` is
        assumed to have its columns in the same order as ``all_features``.

        Parameters:
        - X: 2-D array or DataFrame of training features
        - y: ignored; accepted for scikit-learn pipeline compatibility

        Returns:
        - self
        """
        # Map the requested column names to positional indices in X.
        self.column_names = self.all_features[: X.shape[1]]
        self.column_indices = [
            i
            for i, col_name in enumerate(self.column_names)
            if col_name in self.columns
        ]

        # Work on the underlying array so positional indexing is uniform.
        if isinstance(X, pd.DataFrame):
            X = X.values

        # standardize=True makes the transformer rescale its own output,
        # so no separate scaler is fitted here.
        self.power_transformer = PowerTransformer(
            method="yeo-johnson", standardize=True
        )
        self.power_transformer.fit(X[:, self.column_indices])

        return self

    def transform(self, X, y=None):
        """
        Apply the fitted power transformation to the selected columns.

        Parameters:
        - X: 2-D array or DataFrame with the same column layout used in ``fit``
        - y: ignored

        Returns:
        - NumPy array of the same shape as ``X``; selected columns hold the
          transformed values, the remaining columns are copied unchanged.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        # The transformed values are floats. Writing them into a boolean or
        # integer array would silently truncate them, so promote those dtypes
        # to float first (astype returns a new array, so X is never modified).
        if X.dtype.kind in "biu":
            X_transformed = X.astype(float)
        else:
            X_transformed = X.copy()

        X_transformed[:, self.column_indices] = self.power_transformer.transform(
            X[:, self.column_indices]
        )

        return X_transformed

# Power-transform the columns named in pred_num; the other predictors pass through.
power_and_scale_transformer = CustomPowerTransformer(
    columns=pred_num, all_features=pred_features
)

# Keep the untransformed predictors under their own name so this cell can be
# re-run without feeding already-transformed data back into the transformer.
X_train_raw = processed_data["X_train"][pred_features]

# Build a re-indexed copy instead of resetting in place: an in-place reset
# would also modify the Series stored inside processed_data.
y_train = processed_data["y_train"].reset_index(drop=True)

# The transformer returns a bare array, so re-attach the column names. The new
# frame gets a fresh 0..n-1 index, which lines up with the re-indexed y_train.
X_train = pd.DataFrame(
    power_and_scale_transformer.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
)

In [None]:
binary_columns = [
    "Gender", "CCI_YN", "binary_ASIA", "Tumor C-level", "Tumor T-level",
    "Tumor L-level", "Tumor S-level", "Functional_Stat_1", "Visceral", "Brain",
    "Path_Fract", "Prev_Syst", "Pre_Chem", "Opioid",
]
integer_columns = [
    "Age", "BMI", "Katagiri_Group", "grouped_KPS", "B_Mob", "B_Sel", "B_Usu",
    "B_Dis", "B_Anx",
]
continuous_columns = ["KPS", "B_Index"]

# Columns listed in pred_num were power-transformed and standardized earlier,
# so they now hold fractional values. Casting those to int would truncate them
# to a handful of integers, so they are kept as float regardless of the group
# they are listed in above.
power_transformed = set(pred_num)
dtype_by_column = {
    column: (float if column in power_transformed else int)
    for column in binary_columns + integer_columns
}
dtype_by_column.update({column: float for column in continuous_columns})

X_train = X_train.astype(dtype_by_column)

In [None]:
# Design matrix for the logistic regression: the training predictors plus a
# constant column so the model can estimate an intercept.
X = sm.add_constant(X_train)

# Fit the logit model of y_train on the design matrix.
model = sm.Logit(endog=y_train, exog=X).fit()

# Print the fit summary (coefficients and their significance statistics).
print(model.summary())

In [None]:
# Base estimator for the recursive elimination; seeded for reproducibility.
model = RandomForestClassifier(random_state=42)

# Recursive feature elimination, dropping one feature per step and scoring each
# candidate subset by ROC AUC over 5 stratified folds.
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
rfecv.fit(X_train, y_train)

# Number of optimal features
print("Optimal number of features: %d" % rfecv.n_features_)

# Which features are selected (support_ is a boolean mask over the columns)
print("Selected features: %s" % list(X_train.columns[rfecv.support_]))

# Plot the mean cross-validation score against the number of features kept.
# With step=1 and the default minimum of one feature, entry i of the score
# array corresponds to i + 1 features.
mean_scores = rfecv.cv_results_['mean_test_score']
fig, ax = plt.subplots()
ax.set_xlabel("Number of features selected")
# The label names the metric actually used for scoring (roc_auc, not accuracy).
ax.set_ylabel("Cross-validation score (ROC AUC)")
ax.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()