# Rock Class Prediction using XGBoost
## Image Model: Feature Extraction from FMS Image (VGG19) and Final Rock Class Prediction using XGBoost
## Tabular Model: Rock Classification using Wireline Log Data 

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
from tensorflow.keras.applications import VGG19
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler

In [None]:
def load_data(csv_path, image_folder, image_id_column, class_column, target_classes):
    """Read the sample table, load every referenced image, and encode the labels.

    Args:
        csv_path: Path of the CSV file, read with ``pandas.read_csv``.
        image_folder: Directory containing one ``<image_id>.jpg`` per row.
        image_id_column: Name of the column whose values are the image ids.
        class_column: Name of the column holding the class labels.
        target_classes: Accepted for interface compatibility; not used here.

    Returns:
        A ``(data, images, labels)`` tuple: the DataFrame with ``class_column``
        replaced by integer codes, an array holding one flattened image per
        row, and the encoded label column.

    Note:
        The resize target is read from the module-level ``image_size``, which
        must be defined before this function is called.
    """
    frame = pd.read_csv(csv_path)

    def _load_flat(image_id):
        # Resolve "<folder>/<id>.jpg", resize on load, flatten to 1-D pixels.
        path = os.path.join(image_folder, f"{image_id}.jpg")
        return img_to_array(load_img(path, target_size=image_size)).flatten()

    pixel_rows = np.array([_load_flat(image_id) for image_id in frame[image_id_column]])

    # Replace the textual class names with integer codes in place.
    frame[class_column] = LabelEncoder().fit_transform(frame[class_column])

    return frame, pixel_rows, frame[class_column]

def extract_image_features(data, image_folder, image_id_column, cnn_model):
    """Run every image referenced in ``data`` through ``cnn_model``.

    Args:
        data: DataFrame with one row per image.
        image_folder: Directory containing one ``<image_id>.jpg`` per row.
        image_id_column: Name of the column whose values are the image ids.
        cnn_model: Object exposing ``predict(batch)``; called once per image
            with a batch of size one.

    Returns:
        A ``(extracted_features, image_ids)`` tuple of NumPy arrays, in the
        row order of ``data``. Each feature entry is the flattened model
        output for one image.

    Raises:
        FileNotFoundError: If an image file is missing or cannot be decoded.
    """
    extracted_features = []
    image_ids = []

    # Iterate the id column directly. DataFrame.iterrows() upcasts each row to
    # a single dtype, so integer ids in a frame that also has float columns
    # would come back as floats and yield paths such as "12.0.jpg".
    for image_id in data[image_id_column]:
        image_path = os.path.join(image_folder, f"{image_id}.jpg")

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than inside cv2.resize.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # NOTE(review): cv2 loads channels in BGR order and the input is only
        # scaled to [0, 1]; confirm this matches what cnn_model expects.
        image = cv2.resize(image, (224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add batch axis
        image_array = image_array / 255.0

        features = cnn_model.predict(image_array)

        extracted_features.append(features.flatten())
        image_ids.append(image_id)

    extracted_features = np.array(extracted_features)
    image_ids = np.array(image_ids)

    return extracted_features, image_ids

def train_xgboost_model(X_train, y_train, X_val, y_val):
    """Fit an XGBoost classifier on a randomly oversampled training set.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features, used for metric tracking and early stopping.
        y_val: Validation labels.

    Returns:
        The fitted ``xgb.XGBClassifier``. ``evals_result()`` on it contains
        ``merror`` and ``mlogloss`` under ``validation_0`` (resampled training
        set) and ``validation_1`` (validation set).
    """
    # Oversample only the training split; the validation split is left as-is.
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # eval_metric and early_stopping_rounds are estimator parameters. Passing
    # them to fit() was deprecated in XGBoost 1.6 and is rejected by 2.x.
    model = xgb.XGBClassifier(
        eval_metric=["merror", "mlogloss"],
        early_stopping_rounds=10,
    )
    model.fit(
        X_train_resampled,
        y_train_resampled,
        eval_set=[(X_train_resampled, y_train_resampled), (X_val, y_val)],
        verbose=True,
    )

    return model

def plot_training_curves(history):
    """Plot train/validation log loss and classification error side by side.

    Args:
        history: Mapping with ``'validation_0'`` (train) and ``'validation_1'``
            (validation) entries, each holding ``'mlogloss'`` and ``'merror'``
            sequences with one value per boosting round.
    """
    # (metric key, y-axis label, panel title) for the left and right panels.
    panels = (
        ('mlogloss', 'Log Loss', 'XGBoost Log Loss'),
        ('merror', 'Classification Error', 'XGBoost Classification Error'),
    )

    # Fetch every series up front so a missing key fails before any drawing.
    curves = {
        metric: (history['validation_0'][metric], history['validation_1'][metric])
        for metric, _, _ in panels
    }
    epochs = range(len(curves['mlogloss'][0]))

    plt.figure(figsize=(12, 6))
    for position, (metric, y_label, title) in enumerate(panels, start=1):
        train_curve, val_curve = curves[metric]
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, label='Train')
        plt.plot(epochs, val_curve, label='Validation')
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)
    plt.show()

def evaluate_model(model, X_test, y_test):
    """Print a classification report and show a confusion-matrix heatmap.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)

    # Text summary first.
    print("Classification Report:")
    report = classification_report(y_test, predictions)
    print(report)

    # Then the confusion matrix as an annotated heatmap.
    matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.show()

In [None]:
# Load and preprocess the CSV file
def preprocess_data(csv_path, image_folder, image_id_column, class_column, target_classes):
    """Load the dataset and derive the tabular (well-log) feature table.

    Args:
        csv_path: Path of the CSV file.
        image_folder: Directory containing the images referenced by the CSV.
        image_id_column: Name of the image-id column.
        class_column: Name of the class-label column.
        target_classes: Forwarded unchanged to ``load_data``.

    Returns:
        A ``(data, images, labels, well_log_data)`` tuple. ``well_log_data``
        is ``data`` without the image-id, depth and class columns.

    Raises:
        KeyError: If one of the columns to drop is absent from the CSV.
    """
    data, images, labels = load_data(csv_path, image_folder, image_id_column, class_column, target_classes)

    # The class column must not stay in the feature table: it is the encoded
    # target, and leaving it in lets the tabular model read the answer.
    non_feature_columns = [image_id_column, 'DEPTH_WMSF (m)', class_column]
    well_log_data = data.drop(columns=non_feature_columns)

    return data, images, labels, well_log_data

# Load the pre-trained VGG19 model
def load_cnn_model(input_shape=(224, 224, 3)):
    """Build a VGG19 feature extractor ending in global average pooling.

    Args:
        input_shape: Input shape passed to ``VGG19``.

    Returns:
        A Keras ``Model`` mapping the VGG19 input to the globally
        average-pooled output of the network built with ``include_top=False``
        and ``weights='imagenet'``.
    """
    backbone = VGG19(weights='imagenet', include_top=False, input_shape=input_shape)
    pooled = GlobalAveragePooling2D()(backbone.output)
    return Model(inputs=backbone.input, outputs=pooled)

# Train and evaluate image model
def train_and_evaluate_image_model(data, images, labels, cnn_model):
    """Train and evaluate XGBoost on CNN features extracted from the images.

    Args:
        data: DataFrame with one row per image, in the same order as ``labels``.
        images: Raw flattened pixel array. Kept for interface compatibility;
            the model is trained on the CNN features, not on raw pixels.
        labels: Encoded class labels aligned with the rows of ``data``.
        cnn_model: Feature extractor passed to ``extract_image_features``.

    Note:
        ``image_folder`` and ``image_id_column`` are read from module scope
        and must be defined before this function is called.
    """
    # Extract features first so the splits below are taken over the CNN
    # features. Previously they were computed and then discarded.
    extracted_features, _ = extract_image_features(data, image_folder, image_id_column, cnn_model)

    # 20% test, then 25% of the remainder for validation (60/20/20 overall).
    X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(extracted_features, labels, test_size=0.2, random_state=42)
    X_train_img, X_val_img, y_train_img, y_val_img = train_test_split(X_train_img, y_train_img, test_size=0.25, random_state=42)

    image_model = train_xgboost_model(X_train_img, y_train_img, X_val_img, y_val_img)
    plot_training_curves(image_model.evals_result())
    evaluate_model(image_model, X_test_img, y_test_img)

# Train and evaluate tabular model
def train_and_evaluate_tabular_model(well_log_data, labels):
    """Train and evaluate XGBoost on the tabular well-log features.

    Args:
        well_log_data: Feature table, one row per sample.
        labels: Class labels aligned with ``well_log_data``.
    """
    # Hold out 20% for testing.
    features_rest, features_test, labels_rest, labels_test = train_test_split(
        well_log_data, labels, test_size=0.2, random_state=42
    )
    # Split the remainder 75/25 into training and validation.
    features_train, features_val, labels_train, labels_val = train_test_split(
        features_rest, labels_rest, test_size=0.25, random_state=42
    )

    tabular_model = train_xgboost_model(features_train, labels_train, features_val, labels_val)
    training_history = tabular_model.evals_result()
    plot_training_curves(training_history)
    evaluate_model(tabular_model, features_test, labels_test)

In [None]:
# Define paths and columns.
# NOTE: image_folder and image_id_column are also read as module-level names
# inside train_and_evaluate_image_model, so they must keep these names.
csv_path = "/path/to/csv/file.csv"
image_folder = "/path/to/image/folder"
image_id_column = "IMAGE_ID" 
class_column = "ROCK_CLASS"
target_classes = ['Wackestone', 'Packstone', 'Grainstone', 'Floatstone', 'Rudstone']

# Define image size.
# load_data reads this module-level name, so it must be set before
# preprocess_data is called below.
image_size = (224, 224)

# Preprocess data: read the CSV, load the images, encode the labels and build
# the tabular feature table.
data, images, labels, well_log_data = preprocess_data(csv_path, image_folder, image_id_column, class_column, target_classes)

# Load CNN model (VGG19 feature extractor with default input shape)
cnn_model = load_cnn_model()

# Train and evaluate image model
train_and_evaluate_image_model(data, images, labels, cnn_model)

# Train and evaluate tabular model
train_and_evaluate_tabular_model(well_log_data, labels)