<font size="4" color="red">NOTE: This is a work in progress. The code is being refactored to make it easier to run multiple models against the same dataset once it has gone through data analysis and preparation for modeling.</font>
This specific file will run the classic models RandomForest and XGBoost on the datasets and predict the outcome.

## Modeling
### Using classic ML models 


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import joblib



In [None]:
# Function to plot Confusion Matrix
def plot_confusion_matrix(conf_matrix:ndarray, labels:ndarray, title="Confusion Matrix"):
    '''
    Show a confusion matrix as an annotated Plotly heatmap.
    @param conf_matrix : Confusion matrix, one row per actual class and one column per predicted class
    @param labels : Class labels, in the same order as the rows / columns of conf_matrix
    @param title : Title displayed above the figure
    '''
    # Rows and y labels are flipped together, so every row is still paired with its own label.
    flipped_matrix = conf_matrix[::-1]
    flipped_labels = labels[::-1]

    heatmap = go.Heatmap(
        z=flipped_matrix,
        x=labels,
        y=flipped_labels,
        colorscale='Rainbow',
        texttemplate="%{z}",   # print the cell count inside each cell
        textfont={"size": 10}
    )

    layout_options = dict(
        title_text=title,
        xaxis_title="Predicted Class",
        yaxis_title="Actual Class",
        width=500,
        height=500,
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(**layout_options)
    fig.show()



In [None]:

# Using plotly's graph_objects
def plot_feature_importance_comparison_plotly(pca_model: PCA, classifier_model, feature_importances: Series, X_pre_pca_df: DataFrame):
    '''
    Draw a bar chart of every feature's contribution to the model.
    @param pca_model : Not used by this function; kept so that existing callers keep working
    @param classifier_model : Model whose text representation is shown on the second line of the title
    @param feature_importances : Series whose index becomes the x axis and whose values become the bar heights
    @param X_pre_pca_df : Not used by this function; kept so that existing callers keep working
    '''
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=feature_importances.index,
        y=feature_importances.values,
        marker_color='indianred'   # colour was previously given twice (marker dict and marker_color)
    ))
    # Plotly renders '<br>' as a line break in figure text; a plain '\n' does not start a new line.
    fig.update_layout(title=f'Gene expression contribution to the model<br>{classifier_model}', template='plotly_white')
    fig.show()



In [None]:
def get_pca_features_weights(pca_model: PCA, classifier_model, feature_names: Index) :
    '''
    Translate the classifier's per-component importances back to the original features.
    The PCA components matrix is transposed into a (feature x component) table and multiplied with the
    classifier's `feature_importances_`; the absolute results are scaled so that they sum to 1.
    @param pca_model : PCA object, after it has been fit / trained on the data
    @param classifier_model : Model used for classification
    @param feature_names : List of all features used, before PCA was run.
    @return Series with the share of each feature, largest first, or None when the classifier
            has no `feature_importances_` attribute
    '''
    component_labels = [f'PC{number}' for number in range(1, len(pca_model.components_) + 1)]

    # One row per original feature, one column per principal component
    loadings = pd.DataFrame(pca_model.components_.T, columns=component_labels, index=feature_names)

    # Nothing to combine when the model does not report importances
    if not hasattr(classifier_model, 'feature_importances_'):
        return None

    component_importances = pd.Series(classifier_model.feature_importances_, index=component_labels)

    # Weighted combination per feature; only the magnitude is kept
    projected = loadings.dot(component_importances).abs()

    # Express every value as a fraction of the total
    shares = projected / projected.sum()
    return shares.sort_values(ascending=False)



In [None]:
def analyze_tree_path(pca_model: PCA, classifier_model, X_pca_df: DataFrame, y_series:Series, feature_names: Index, class_names=class_unique_vals):
    '''
    For each class / target, accumulate the per-feature weights calculated by #get_pca_features_weights:
    the weights are added once per tree and per sample whose label in y_series is that class, and every
    class column is then normalized to sum to 1.
    NOTE : the leaf ids returned by tree.apply are only used to walk over the samples; the leaf reached by a
    sample does not influence the weights yet.
    @param pca_model : PCA model, after the data has been fit aka trained on the data.
    @param classifier_model : Model used for classification, after it has been trained on the data. Must expose `estimators_`.
    @param X_pca_df : The DataFrame that is passed to every tree's apply().
    @param y_series : The Y series corresponding to above X_pca_df dataset i.e if above is `test` dataset, this should also be test dataset
    @param feature_names : All the features used prior to running PCA
    @param class_names : List of unique Classes / Target Variables the data represents.
    @return DataFrame indexed by feature_names, with one column per entry of class_names.
    @raises ValueError when classifier_model has no `feature_importances_`.
    '''
    # The weights depend only on the two models, so compute them once instead of once per tree and sample.
    feature_weights = get_pca_features_weights(pca_model, classifier_model, feature_names)
    if feature_weights is None:
        raise ValueError(f'{classifier_model} does not expose feature_importances_; cannot compute per-class weights')

    # get_pca_features_weights returns the Series sorted by weight. Put it back into feature_names order
    # so that position i of every array below always refers to feature_names[i].
    weight_values = feature_weights.reindex(feature_names).to_numpy()

    # init dictionary that will hold each class details (uses the class_names argument, not the global)
    class_importances = {class_name: np.zeros(len(feature_names)) for class_name in class_names}

    # loop through each DecisionTree used by the model
    for tree in classifier_model.estimators_:
        # index of the leaf node where each sample ends up
        leaf_nodes = tree.apply(X_pca_df)

        # leaf_nodes is positional, so pair it with the DataFrame index to find the matching label in y_series
        for record_index, _leaf_id in zip(X_pca_df.index, leaf_nodes):
            actual_class = y_series[record_index]
            class_importances[actual_class] += weight_values

    # Normalize
    for class_name in class_names:
        total = np.sum(class_importances[class_name])
        if total > 0:
            class_importances[class_name] /= total

    return pd.DataFrame(class_importances, index=feature_names)



In [None]:
# Using SHAP to explain - #TODO : plotting is not implemented yet
import shap

def plot_SHAP(pca_model:PCA, classifier_model, X_df: DataFrame):
    '''
    Compute SHAP values for the classifier and project them from PCA components back onto the original features.
    Nothing is plotted yet; the projected values are returned so that the computation is not thrown away.
    @param pca_model : PCA model, after it has been fit on the data
    @param classifier_model : Trained model that is handed to shap.Explainer
    @param X_df : Data to explain, in the form the classifier was trained on (the PCA-transformed data)
    @return Result of `shap_values.values @ pca_model.components_`
    '''
    explainer = shap.Explainer(classifier_model)
    shap_values = explainer(X_df) #X_pca_dataframe
    # NOTE(review): assumes shap_values.values is 2-D (samples x components). For multi-class models SHAP
    # may return an extra class axis, in which case this matrix product will not line up - TODO confirm.
    original_shap = shap_values.values @ pca_model.components_
    return original_shap


In [None]:
def convert_to_binary(y_series: Series, classes=None):
    '''
    Convert class labels into their binarized (indicator) representation using LabelBinarizer.
    @param y_series : Labels to convert
    @param classes : Optional full list of class labels to fit the binarizer on. Pass the same list for the
                     train and the test split so that both get identical columns. When omitted, the binarizer
                     is fit on y_series itself (previous behaviour).
    @return ndarray with the binarized labels
    '''
    label_binzer = LabelBinarizer()
    label_binzer.fit(y_series if classes is None else classes)
    return np.array(label_binzer.transform(y_series))


In [None]:
def create_pipeline(model):
    '''
    Assemble the three-step pipeline used for every model: 'scaler' -> 'pca' -> 'model'.
    @param model : Estimator placed as the last step, under the name 'model'
    @return Pipeline (not fitted by this function)
    '''
    # `pca` is the object defined at notebook level, outside this function
    steps = [('scaler', StandardScaler())]
    steps.append(('pca', pca))
    steps.append(('model', model))
    return Pipeline(steps)


def create_grid_model(model, model_params: dict, n_jobs=1, cv=None, scoring='accuracy'):
    '''
    Wrap the scaler -> PCA -> model pipeline of #create_pipeline in a GridSearchCV.
    @param model : Estimator to tune; becomes the 'model' step of the pipeline
    @param model_params : Parameter grid for GridSearchCV; keys address pipeline steps (e.g. 'model__max_depth')
    @param n_jobs : Number of parallel jobs handed to GridSearchCV. Defaults to 1, as before.
    @param cv : Cross validation strategy handed to GridSearchCV. None keeps its default (5 folds).
    @param scoring : Metric used to rank the candidates. Defaults to 'accuracy', as before.
    @return GridSearchCV that has not been fitted yet; refit=True so the best pipeline is retrained on all data after the search
    '''
    pipeline = create_pipeline(model)
    grid_search = GridSearchCV(estimator=pipeline, param_grid=model_params, scoring=scoring, cv=cv,
                               n_jobs=n_jobs, refit=True, verbose=1)

    return grid_search


In [None]:
# Registry of the classic models. Every entry holds the estimator under 'model' and its GridSearchCV grid
# under 'params'. Grid keys start with 'model__' because the estimator is the pipeline step named 'model'.
models_config = {}

models_config['Random Forest'] = dict(
    model=RandomForestClassifier(random_state=42),
    params=dict(
        model__n_estimators=[50, 100, 200],
        model__max_depth=[None, 10, 20],
        model__min_samples_split=[2, 5],
    ),
)

# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases - confirm against the installed version.
models_config['XGBoost'] = dict(
    model=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    params=dict(
        model__n_estimators=[50, 100, 200],
        model__max_depth=[None, 10, 20],
        model__eta=[0.2, None, 0.4],
        # model__n_jobs=[1]
    ),
)

In [None]:
# best_results = {}
def fit_model(model, model_params:dict, X_df: DataFrame, y_series:Series):
    '''
    Run the hyperparameter grid search for the given model on the given data.
    @param model : Classifier model
    @param model_params : Params used for tuning the hyperparameters of the given model
    @param X_df : DataFrame of the independent vars with data
    @param y_series : A Series object with target class data.
    @return GridSearchCV : The fitted grid search. Metrics are calculated separately by #predict_model.
    '''
    grid_search = create_grid_model(model=model, model_params=model_params)
    grid_search.fit(X_df, y_series)

    return grid_search



def predict_model(grid_search: GridSearchCV, X_df: DataFrame, y_series: Series):
    '''
    Predict with a fitted grid search and score the predictions against the true labels.
    @param grid_search : GridSearchCV that has already been fitted
    @param X_df : DataFrame of the independent vars to predict on
    @param y_series : True target classes for X_df
    @return dict : Consisting of keys : Execution Time, Accuracy, Precision, Recall, F1-Score, Confusion matrix, Best Estimate, Best Params, Best Accuracy
    '''
    # Only the predict call is timed
    started_at = time()
    predictions = grid_search.predict(X_df)
    elapsed = time() - started_at

    # Precision / Recall / F1 are all averaged the same way
    weighted = {'average': 'weighted'}

    return {
        'Execution Time': elapsed,
        'Accuracy': accuracy_score(y_series, predictions),
        'Precision': precision_score(y_series, predictions, **weighted),
        'Recall': recall_score(y_series, predictions, **weighted),
        'F1-Score': f1_score(y_series, predictions, **weighted),
        'Confusion matrix': confusion_matrix(y_series, predictions),
        # The three entries below describe the grid search itself, not this prediction run
        'Best Estimate': grid_search.best_estimator_,
        'Best Params': grid_search.best_params_,
        'Best Accuracy': grid_search.best_score_
    }



In [None]:

# best_results = fit_predict_model(X_train_pca, y_train)

# Random Forest : take the estimator and its hyperparameter grid from models_config and
# run the grid search (scaler -> PCA -> model pipeline) on the training data.
# NOTE(review): X_train_pca is fed to a pipeline that has its own PCA step - confirm it holds pre-PCA features.
rf_model = models_config['Random Forest']['model']
rf_model_config = models_config['Random Forest']['params']
rf_grid_search = fit_model(model=rf_model, model_params=rf_model_config, X_df=X_train_pca, y_series=y_train)



Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
# Prepare the test features with clean_input_data (defined outside this section).
# NOTE(review): despite the `_pca` suffix, PCA is applied inside the fitted pipeline - confirm that
# clean_input_data does not project the data as well.
rf_X_test_pca = clean_input_data(X_test, df_feature_outliers)

# Predict on the test set with the fitted grid search and collect the metrics
rf_best_results = predict_model(grid_search=rf_grid_search, X_df=rf_X_test_pca, y_series=y_test)
print(rf_best_results)


In [None]:
# Plot the test-set confusion matrix that predict_model stored in the results dict
rf_conf_matrix = rf_best_results['Confusion matrix']
plot_confusion_matrix(rf_conf_matrix, labels=class_unique_vals, title='Random Forest with PCA : Confusion Matrix')


In [None]:
# Estimator : pull the fitted 'pca' and 'model' steps out of the best pipeline found by the grid search
rf_pca_model = rf_grid_search.best_estimator_['pca']
rf_trained_model = rf_grid_search.best_estimator_['model']


In [None]:
# Find contribution of each feature to the model
rf_pca_weights_df = get_pca_features_weights(pca_model=rf_pca_model,
                                            classifier_model=rf_trained_model,
                                            feature_names=rf_X_test_pca.columns)
if rf_pca_weights_df is not None:
    # Pass the fitted estimator (not the untrained template from models_config) so that the chart title
    # describes the model that produced these weights - same as the XGBoost cell does.
    plot_feature_importance_comparison_plotly(rf_pca_model, rf_trained_model, rf_pca_weights_df, rf_X_test_pca)
else :
    print(f'PCA Weights for {rf_trained_model} could not be calculated')


In [None]:

# Show ten of the positive feature weights; the Series is sorted with the largest weight first.
# NOTE(review): the slice [1:11] starts at the second entry, so the single highest-weighted feature is
# left out - confirm this is intended rather than [:10].
rf_pca_weights_df[rf_pca_weights_df > 0][1:11]

In [None]:

# Persist only the fitted RandomForest step; the scaler and PCA steps of the pipeline are not part of this file.
joblib.dump(rf_trained_model, f"{MODELS_DIR}RForest_model.pkl")

['models/RForest_model.pkl']

In [None]:
# Test to make sure the persisted model can be reloaded correctly:
# load it back and print its feature importances (one value per input of the RandomForest step).
rf_model_joblib: RandomForestClassifier = joblib.load(f"{MODELS_DIR}RForest_model.pkl")
print(rf_model_joblib.feature_importances_)

In [None]:
#SHAP :
# plot_SHAP(pca_model=rf_pca_model, classifier_model=rf_trained_model, X_df=rf_X_test_pca)


In [None]:
# XGB is fitted on the binarized targets (an ndarray) instead of the label Series used for RandomForest,
# hence the separate function.
def fit_xgb_classifier(model: XGBClassifier, model_params:dict, X_df: DataFrame, y_series:ndarray):
    '''
    Run the hyperparameter grid search for the XGBoost model on the given data.
    @param model : Classifier model
    @param model_params : Params used for tuning the hyperparameters of the given model
    @param X_df : DataFrame of the independent vars with data
    @param y_series : An ndarray object with binary representation of target classes
    @return GridSearchCV : The fitted grid search. Metrics are calculated separately by #predict_xgb_classifier.
    '''
    grid_search = create_grid_model(model=model, model_params=model_params)
    grid_search.fit(X_df, y_series)

    return grid_search


def predict_xgb_classifier(grid_search: GridSearchCV, X_df: DataFrame, y_series_bin: ndarray):
    '''
    Predict with a fitted XGBoost grid search and score the predictions against binarized labels.
    @param grid_search : GridSearchCV that has already been fitted
    @param X_df : DataFrame of the independent vars to predict on
    @param y_series_bin : Binary representation of the true target classes for X_df
    @return dict : Consisting of keys : Execution Time, Accuracy, Precision, Recall, F1-Score, Confusion matrix, Best Estimate, Best Params, Best Accuracy
    '''
    # Only the predict call is timed
    started_at = time()
    predicted_bin = grid_search.predict(X_df)
    elapsed = time() - started_at

    # Collapse every indicator row to the position of its largest entry, so the confusion matrix is per class.
    # NOTE(review): argmax yields 0 for a row without any 1 - confirm every predicted row contains exactly one 1.
    actual_positions = np.argmax(np.array(y_series_bin), axis=1)
    predicted_positions = np.argmax(predicted_bin, axis=1)
    cm = confusion_matrix(y_true=actual_positions, y_pred=predicted_positions)

    # Precision / Recall / F1 are all averaged the same way
    weighted = {'average': 'weighted'}

    return {
        'Execution Time': elapsed,
        'Accuracy': accuracy_score(y_series_bin, predicted_bin),
        'Precision': precision_score(y_series_bin, predicted_bin, **weighted),
        'Recall': recall_score(y_series_bin, predicted_bin, **weighted),
        'F1-Score': f1_score(y_series_bin, predicted_bin, **weighted),
        'Confusion matrix': cm,
        # The three entries below describe the grid search itself, not this prediction run
        'Best Estimate': grid_search.best_estimator_,
        'Best Params': grid_search.best_params_,
        'Best Accuracy': grid_search.best_score_
    }



In [None]:
# XGBoost : take the estimator and its hyperparameter grid from models_config
xgb_model = models_config['XGBoost']['model']
xgb_model_config = models_config['XGBoost']['params']

# Binarize the training labels before fitting
y_train_bin = convert_to_binary(y_train)
# print(y_train_bin)
# Mapping between column position and label, as noted by the author:
# 0=BRCA 1=COAD, 2=KIRC, 3=LUAD, 4=PRAD
# print("Binary data: ", y_train_bin)
# NOTE(review): the LabelBinarizer lives inside convert_to_binary and is not returned, so inverse_transform cannot be called from this cell.

xgb_grid_search = fit_xgb_classifier(model=xgb_model, model_params=xgb_model_config, X_df=X_train_pca, y_series=y_train_bin)

# xgb_train_best_results = predict_xgb_classifier(grid_search=xgb_grid_search, X_df=X_train_pca, y_series_bin=y_train_bin)
# print(xgb_train_best_results)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Binarize the test labels and prepare the test features with clean_input_data (defined outside this section).
# NOTE(review): convert_to_binary fits a new LabelBinarizer on y_test alone - its columns only line up with
# y_train_bin when both splits contain the same set of classes.
xgb_y_test_bin = convert_to_binary(y_test)
xgb_X_test_pca = clean_input_data(X_test, df_feature_outliers)

# Predict on the test set with the fitted grid search and collect the metrics
xgb_best_results = predict_xgb_classifier(grid_search=xgb_grid_search, X_df=xgb_X_test_pca, y_series_bin=xgb_y_test_bin)
print(xgb_best_results)


In [None]:
# Plot the test-set confusion matrix that predict_xgb_classifier stored in the results dict
xgb_conf_matrix = xgb_best_results['Confusion matrix']
plot_confusion_matrix(xgb_conf_matrix, labels=class_unique_vals, title='XGBoost with PCA : Confusion Matrix')


In [None]:
# Pull the fitted 'pca' and 'model' steps out of the best pipeline found by the grid search
xgb_pca_model = xgb_grid_search.best_estimator_['pca']
xgb_trained_model = xgb_grid_search.best_estimator_['model']


In [None]:
# Find contribution of each feature to the model
xgb_pca_weights_df = get_pca_features_weights(pca_model=xgb_pca_model,
                                            classifier_model=xgb_trained_model,
                                            feature_names=xgb_X_test_pca.columns)
if xgb_pca_weights_df is not None:
    plot_feature_importance_comparison_plotly(xgb_pca_model, xgb_trained_model, xgb_pca_weights_df, xgb_X_test_pca)
else :
    # Report the XGBoost model here (the message previously named the Random Forest model)
    print(f'PCA Weights for {xgb_trained_model} could not be calculated')


In [None]:
# Persisting data for tests : write the prepared test features to CSV
# NOTE(review): the file name says "after_pca", but the frame is the output of clean_input_data - confirm which stage it represents.
# print(xgb_X_test_pca.head())
xgb_X_test_pca.to_csv(DATA_DIR + 'xgb_X_after_pca_dataset.csv')
# Print the true label of a few hand-picked records, looked up by their index label in y_test
print('516: ', y_test[516])
print('329: ', y_test[329])
print('52: ', y_test[52])
print('141: ', y_test[141])



# Generate statistical description of these columns to help generate random values on the App side - deprecated
# The bare len(...) below is not the last expression of the cell and is not assigned, so its value is not shown.
len(xgb_X_test_pca.columns)
# One row per column of the test features, with the describe() statistics as columns
xgb_test_descr_df = xgb_X_test_pca.describe().T
# xgb_test_descr_df.to_csv(DATA_DIR + 'xgb_test_X_describe.csv')


In [None]:
# Display the output feature names of the fitted PCA step
xgb_pca_model.get_feature_names_out()

In [None]:
# Persist just the fitted XGBoost step. If used on its own, the inputs would be the 640 PCA components, which can be identified using xgb_pca_model.get_feature_names_out()
joblib.dump(xgb_trained_model, f"{MODELS_DIR}XGBoost_model.pkl")

# Persist the whole trained GridSearchCV, i.e. including the scaler and PCA steps of the pipeline
joblib.dump(xgb_grid_search, f"{MODELS_DIR}xgb_GridSearch_Pipeline.pkl")

['models/xgb_GridSearch_Pipeline.pkl']

In [None]:
# Test to make sure the persisted model can be reloaded correctly.
# xgb_model_joblib: XGBClassifier = joblib.load(f"{MODELS_DIR}XGBoost_model.pkl")
# print(xgb_model_joblib.feature_importances_)

# Reload the persisted GridSearchCV and display the best pipeline it holds
xgb_gridcv_pipeline_joblib: GridSearchCV = joblib.load(f"{MODELS_DIR}xgb_GridSearch_Pipeline.pkl")
xgb_gridcv_pipeline_joblib.best_estimator_

In [None]:
# Unit testing : to make sure model prediction is same as when input from Web App
unittest_data_X = pd.read_csv(DATA_ANALYSIS_DIR + 'request.csv')
# print("Printing req data: \n", type(unittest_data_X.iloc[0:1, 1:]))
# Predict for the first row only, skipping the first column of the CSV
# NOTE(review): presumably the first column is the index written by to_csv - confirm against request.csv
unittest_data_pred = xgb_grid_search.predict(unittest_data_X.iloc[0:1, 1:])

# unittest_data_pred = xgb_grid_search.predict(xgb_X_test_pca.iloc[0:1, 0:])
unittest_data_pred

In [None]:
# Side-by-side comparison : one row per metric / result key, one column per model
results_df = pd.DataFrame([rf_best_results, xgb_best_results]).T #.sort_values(by='Accuracy', ascending=False)
results_df.columns = ['Random Forest', 'XGBoost']
results_df

In [None]:
# Area under Curve - REF : https://www.geeksforgeeks.org/interpreting-random-forest-classification-results/
def plot_aoc_randomforest(rd_test_pred_proba: ndarray, y_test: Series) :
    '''
    Calculate the one-vs-rest ROC curve and the area under it for every class.
    @param rd_test_pred_proba : Predicted class probabilities, one row per sample and one column per class.
                                Columns are expected in the same order as LabelBinarizer's sorted `classes_`
                                - TODO confirm against the classifier's `classes_`.
    @param y_test : True labels of the same samples
    @return tuple (fpr, tpr, roc_auc) : three dicts keyed by the column position of the class
    '''
    label_binzer = LabelBinarizer()
    y_test_bin = np.array(label_binzer.fit_transform(y_test))
    # Labels in the column order of y_test_bin (y_test.unique() would give order of appearance instead)
    target_vals = label_binzer.classes_

    if len(target_vals) == 2:
        # With two classes LabelBinarizer returns a single column; expand it to one column per class
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for index in range(len(target_vals)):
        # Compare the indicator COLUMN of this class (one entry per sample) with its probability column.
        # Indexing y_test_bin[index] would pick the one-hot row of a single sample instead.
        fpr[index], tpr[index], _ = roc_curve(y_test_bin[:, index], rd_test_pred_proba[:, index])
        roc_auc[index] = auc(fpr[index], tpr[index])

    # TODO : plotting is still disabled; the matplotlib version is kept below for reference.
    # plt.figure()
    # for index in range(len(target_vals)) :
    #     plt.plot(fpr[index], tpr[index], lw=2, label=f"ROC curve of class {target_vals[index]} (area = {roc_auc[index]:.2f})")

    # # plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
    # plt.xlim([0.0, 1.0])
    # plt.ylim([0.0, 1.05])
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('Receiver Operating Characterstic for Tumor classes')
    # plt.legend(loc="lower right")
    # plt.show()

    return fpr, tpr, roc_auc

