# Ensembling XGBoost models with Logistic Regression as Meta-Learner

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.image import load_img, img_to_array
import cv2
from keras.applications.vgg19 import VGG19
from keras.layers import GlobalAveragePooling2D
from keras.models import Model
from imblearn.over_sampling import RandomOverSampler

In [None]:
def load_data(csv_path, image_folder, image_id_column, class_column, target_classes, image_size=(224, 224)):
    """
    Load data from CSV file and images from the specified folder.

    Args:
    - csv_path (str): Path to the CSV file containing data.
    - image_folder (str): Path to the folder containing images.
    - image_id_column (str): Name of the column containing image IDs.
    - class_column (str): Name of the column containing class labels.
    - target_classes (list): List of target class labels. Currently unused;
      kept so existing callers keep working.
    - image_size (tuple): (height, width) every image is resized to while
      loading. Passed explicitly so the function no longer depends on a
      notebook-level global that may not be defined yet.

    Returns:
    - data (DataFrame): Loaded data from CSV, with class_column replaced by
      its integer encoding.
    - images (ndarray): One flattened pixel vector per CSV row.
    - labels (Series): Encoded class labels (the class_column of data).
    """
    data = pd.read_csv(csv_path)

    # Each image is looked up as "<image_id>.jpg" inside image_folder and
    # flattened so that every row is a single feature vector.
    images = np.array([
        img_to_array(
            load_img(os.path.join(image_folder, f"{image_id}.jpg"), target_size=image_size)
        ).flatten()
        for image_id in data[image_id_column]
    ])

    # LabelEncoder assigns integer codes following the sorted order of the
    # distinct class names found in the CSV.
    label_encoder = LabelEncoder()
    data[class_column] = label_encoder.fit_transform(data[class_column])
    labels = data[class_column]
    return data, images, labels

def train_xgboost_model(X_train, y_train, X_val, y_val, **kwargs):
    """
    Train an XGBoost classifier on a class-balanced copy of the training set.

    The training data is randomly oversampled so all classes are equally
    represented; the validation data is left untouched and drives early
    stopping.

    Args:
    - X_train (ndarray): Training features.
    - y_train (ndarray): Training labels.
    - X_val (ndarray): Validation features.
    - y_val (ndarray): Validation labels.
    - **kwargs: Additional keyword arguments for the XGBClassifier
      constructor. They override the defaults set here (eval_metric,
      early_stopping_rounds).

    Returns:
    - model: Trained XGBoost model.
    """
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # eval_metric / early_stopping_rounds belong on the estimator: passing
    # them to fit() is deprecated since XGBoost 1.6 and rejected by newer
    # releases. NOTE: this requires xgboost >= 1.6.
    params = {
        "eval_metric": ["merror", "mlogloss"],
        "early_stopping_rounds": 10,
    }
    params.update(kwargs)

    model = xgb.XGBClassifier(**params)
    # The validation set is listed last on purpose: early stopping monitors
    # the last metric on the last evaluation set.
    model.fit(
        X_train_resampled,
        y_train_resampled,
        eval_set=[(X_train_resampled, y_train_resampled), (X_val, y_val)],
        verbose=True,
    )
    return model

def extract_image_features(data, image_folder, image_id_column, cnn_model, batch_size=32):
    """
    Extract image features using a pre-trained CNN model.

    Args:
    - data (DataFrame): Data containing image IDs.
    - image_folder (str): Path to the folder containing images.
    - image_id_column (str): Name of the column containing image IDs.
    - cnn_model: Pre-trained CNN model.
    - batch_size (int): Number of images sent to the CNN per predict call.

    Returns:
    - extracted_features (ndarray): Extracted image features, one flattened
      row per image, in the same order as the rows of data.
    - image_ids (ndarray): Image IDs.

    Raises:
    - ValueError: If batch_size is not a positive integer.
    - FileNotFoundError: If an image cannot be read from disk.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Read the IDs straight from the column: DataFrame.iterrows() upcasts
    # every row to a common dtype, which can turn an integer ID such as 123
    # into 123.0 and therefore into a wrong file name.
    image_ids = data[image_id_column].to_numpy()

    extracted_features = []
    for start in range(0, len(image_ids), batch_size):
        batch = []
        for image_id in image_ids[start:start + batch_size]:
            image_path = os.path.join(image_folder, f"{image_id}.jpg")
            image = cv2.imread(image_path)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise FileNotFoundError(f"Could not read image: {image_path}")
            image = cv2.resize(image, (224, 224))
            batch.append(img_to_array(image) / 255.0)

        # One predict call per batch instead of one per image.
        features = cnn_model.predict(np.stack(batch))
        extracted_features.extend(features.reshape(len(batch), -1))

    extracted_features = np.array(extracted_features)
    return extracted_features, image_ids

def preprocess_data(csv_path, image_folder, image_id_column, class_column, target_classes,
                    depth_column='DEPTH_WMSF (m)'):
    """
    Preprocess data by loading it, encoding labels, and building the tabular feature table.

    Args:
    - csv_path (str): Path to the CSV file containing data.
    - image_folder (str): Path to the folder containing images.
    - image_id_column (str): Name of the column containing image IDs.
    - class_column (str): Name of the column containing class labels.
    - target_classes (list): List of target class labels (forwarded to load_data).
    - depth_column (str): Name of the depth column excluded from the tabular features.

    Returns:
    - data (DataFrame): Preprocessed data from CSV.
    - images (ndarray): Loaded and preprocessed images.
    - labels: Encoded class labels, as returned by load_data.
    - well_log_data (DataFrame): Tabular features, i.e. data without the
      image ID, depth and class columns.

    Raises:
    - KeyError: If one of the columns to drop is missing from the CSV.
    """
    data, images, labels = load_data(csv_path, image_folder, image_id_column, class_column, target_classes)

    # The class column must be removed as well: it holds the encoded target,
    # so keeping it would leak the labels into the tabular features.
    # The image ID column is taken from the argument rather than hard-coded.
    well_log_data = data.drop(columns=[image_id_column, depth_column, class_column])
    return data, images, labels, well_log_data

def load_cnn_model(input_shape=(224, 224, 3)):
    """
    Build a VGG19-based feature extractor.

    The network is VGG19 with ImageNet weights and without its
    classification head; a global average pooling layer is applied to the
    convolutional output.

    Args:
    - input_shape (tuple): Input shape of the model.

    Returns:
    - cnn_model: Model mapping the VGG19 input to the pooled features.
    """
    backbone = VGG19(include_top=False, weights='imagenet', input_shape=input_shape)
    pooled_output = GlobalAveragePooling2D()(backbone.output)
    return Model(inputs=backbone.input, outputs=pooled_output)

def train_and_evaluate_image_model(data, images, labels, cnn_model, image_dir=None, id_column=None):
    """
    Train and evaluate the image model on CNN-extracted features.

    Args:
    - data (DataFrame): Data containing image IDs.
    - images (ndarray): Loaded and preprocessed images. Not used for
      training (the CNN features are used instead); kept so existing
      callers keep working.
    - labels (ndarray): Encoded class labels, aligned with the rows of data.
    - cnn_model: Pre-trained CNN model.
    - image_dir (str): Folder containing the images. Defaults to the
      notebook-level `image_folder`.
    - id_column (str): Name of the image ID column. Defaults to the
      notebook-level `image_id_column`.

    Note: plot_training_curves and evaluate_model are not defined in the
    visible part of this file; they are assumed to be provided elsewhere
    in the notebook.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if image_dir is None:
        image_dir = image_folder
    if id_column is None:
        id_column = image_id_column

    # Train on the CNN features. Previously they were computed and then
    # discarded, and the model was fitted on raw flattened pixels.
    extracted_features, _ = extract_image_features(data, image_dir, id_column, cnn_model)

    # 80/20 train/test split, then 75/25 train/validation -> 60/20/20 overall.
    X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
        extracted_features, labels, test_size=0.2, random_state=42)
    X_train_img, X_val_img, y_train_img, y_val_img = train_test_split(
        X_train_img, y_train_img, test_size=0.25, random_state=42)

    image_model = train_xgboost_model(X_train_img, y_train_img, X_val_img, y_val_img)
    plot_training_curves(image_model.evals_result())
    evaluate_model(image_model, X_test_img, y_test_img)

def train_and_evaluate_tabular_model(well_log_data, labels):
    """
    Fit an XGBoost model on the tabular features and evaluate it on a held-out split.

    Args:
    - well_log_data (DataFrame): Tabular feature table.
    - labels (ndarray): Encoded class labels.
    """
    # First hold out 20% for testing, then 25% of the remainder for
    # validation, which gives a 60/20/20 train/validation/test partition.
    features_rest, features_test, target_rest, target_test = train_test_split(
        well_log_data, labels, test_size=0.2, random_state=42
    )
    features_fit, features_val, target_fit, target_val = train_test_split(
        features_rest, target_rest, test_size=0.25, random_state=42
    )

    booster = train_xgboost_model(features_fit, target_fit, features_val, target_val)

    history = booster.evals_result()
    plot_training_curves(history)
    evaluate_model(booster, features_test, target_test)

In [None]:
def train_and_evaluate_ensemble_model(data, images, well_log_data, labels, cnn_model, **kwargs):
    """
    Train and evaluate the ensemble model.

    Two XGBoost base models (one on CNN image features, one on the tabular
    features) are stacked with a logistic regression meta-learner. The
    stacking classifier is fitted on 80% of the rows and evaluated on the
    remaining 20%.

    Args:
    - data (DataFrame): Data containing image IDs.
    - images (ndarray): Loaded and preprocessed images. Not used (the CNN
      features are used instead); kept so existing callers keep working.
    - well_log_data (DataFrame): Numeric tabular features, row-aligned with
      data. Assumed not to contain the label column.
    - labels (ndarray): Encoded class labels, row-aligned with data.
    - cnn_model: Pre-trained CNN model.
    - **kwargs: `cv`, `stack_method` and `passthrough` are forwarded to
      StackingClassifier; `image_dir` and `id_column` override the
      notebook-level `image_folder` / `image_id_column`; everything else is
      forwarded to the XGBClassifier constructor of both base models.

    Returns:
    - stacking_model: Trained stacking classifier. It expects inputs laid
      out as np.hstack([image_features, tabular_features]).
    """
    # Function-scope imports keep this block self-contained; both packages
    # are already dependencies of this file.
    from imblearn.pipeline import Pipeline as ImbPipeline
    from sklearn.compose import ColumnTransformer

    # Route the keyword arguments to their consumers. Previously all of
    # them (e.g. cv=5) went to XGBClassifier as well.
    image_dir = kwargs.pop("image_dir", None)
    if image_dir is None:
        image_dir = image_folder
    id_column = kwargs.pop("id_column", None)
    if id_column is None:
        id_column = image_id_column
    stacking_params = {}
    for key in ("cv", "stack_method", "passthrough"):
        if key in kwargs:
            stacking_params[key] = kwargs.pop(key)
    xgb_params = kwargs

    # Put both modalities side by side in one matrix so a single
    # train/test split keeps image and tabular rows aligned.
    image_features, _ = extract_image_features(data, image_dir, id_column, cnn_model)
    tabular_features = np.asarray(well_log_data, dtype=float)
    n_image = image_features.shape[1]
    n_total = n_image + tabular_features.shape[1]
    X = np.hstack([image_features, tabular_features])
    y = np.asarray(labels)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    def make_base_model(columns):
        # Select one modality, oversample, then fit XGBoost. The sampler sits
        # inside the pipeline so it only runs on each training fold.
        return ImbPipeline([
            ("select", ColumnTransformer([("keep", "passthrough", columns)])),
            ("oversample", RandomOverSampler(random_state=42)),
            ("xgb", xgb.XGBClassifier(**xgb_params)),
        ])

    estimators = [
        ("image", make_base_model(slice(0, n_image))),
        ("tabular", make_base_model(slice(n_image, n_total))),
    ]
    # StackingClassifier fits the meta-learner on out-of-fold predictions of
    # the base models, so the meta-learner never sees leaked predictions.
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        **stacking_params,
    )
    stacking_model.fit(X_train, y_train)

    # Evaluate on the rows held out from fitting.
    y_pred = stacking_model.predict(X_test)
    print(f"Ensemble test accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    return stacking_model

In [None]:
# Input locations and the CSV columns the pipeline reads
csv_path = "/path/to/csv/file.csv"
image_folder = "/path/to/image/folder"
image_id_column = "IMAGE_ID"
class_column = "ROCK_CLASS"
target_classes = ['Wackestone', 'Packstone', 'Grainstone', 'Floatstone', 'Rudstone']

# Image size used by the pipeline; must be set before the data is loaded
image_size = (224, 224)

# Load the CSV and images, encode the labels and build the tabular features
data, images, labels, well_log_data = preprocess_data(
    csv_path=csv_path,
    image_folder=image_folder,
    image_id_column=image_id_column,
    class_column=class_column,
    target_classes=target_classes,
)

# Build the CNN with its default input shape
cnn_model = load_cnn_model()

# Fit the stacked ensemble with cv=5
stacking_model = train_and_evaluate_ensemble_model(
    data=data,
    images=images,
    well_log_data=well_log_data,
    labels=labels,
    cnn_model=cnn_model,
    cv=5,
)