# Feature Extraction from FMS Images (VGG19), Combined with Wireline Logs in XGBoost, to Predict Rock Class (using the Standard Dataset)

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler

def load_data(image_folder, csv_path, image_id_column, rock_class_column):
    """Load the log table and extract one VGG19 feature vector per listed image.

    Each value in ``image_id_column`` is mapped to ``<image_folder>/<id>.jpg``.
    The image is resized to 224x224, scaled to [0, 1], and passed through an
    ImageNet-pretrained VGG19 (without its classifier head) followed by global
    average pooling.

    Args:
        image_folder: Directory containing the ``<image_id>.jpg`` files.
        csv_path: Path of the CSV file with one row per image.
        image_id_column: Name of the CSV column holding the image ids.
        rock_class_column: Not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        A tuple ``(data, extracted_features, image_ids)``: the DataFrame read
        from ``csv_path``, an array with one flattened feature vector per row,
        and an array of the image ids in the same order.

    Raises:
        FileNotFoundError: If the image file for a row does not exist.
        ValueError: If an image file exists but cannot be decoded.
    """
    # Read the table first so a bad CSV path or column name fails before the
    # (comparatively expensive) network is built.
    data = pd.read_csv(csv_path)
    id_values = data[image_id_column]

    # Pre-trained VGG19 backbone with a pooling layer in place of the top.
    base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    pooled = GlobalAveragePooling2D()(base_model.output)
    cnn_model = Model(inputs=base_model.input, outputs=pooled)

    extracted_features = []
    image_ids = []

    # Iterate the id column directly: DataFrame.iterrows() upcasts every row
    # to a common dtype, which can turn an integer id into a float and yield
    # a wrong file name such as "5.0.jpg".
    for image_id in id_values:
        image_path = os.path.join(image_folder, f"{image_id}.jpg")

        if not os.path.isfile(image_path):
            raise FileNotFoundError(
                f"Image for {image_id_column}={image_id!r} not found: {image_path}"
            )

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image could not be decoded: {image_path}")

        # NOTE(review): cv2.imread yields BGR channel order and the plain
        # /255 scaling differs from keras' vgg19.preprocess_input -- confirm
        # this preprocessing is intended before changing it.
        image = cv2.resize(image, (224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)
        image_array = image_array / 255.0

        features = cnn_model.predict(image_array)

        extracted_features.append(features.flatten())
        image_ids.append(image_id)

    extracted_features = np.array(extracted_features)
    image_ids = np.array(image_ids)

    return data, extracted_features, image_ids

def preprocess_data(data, columns_to_standardize):
    """Standardize the selected columns of ``data``.

    Args:
        data: DataFrame holding the columns to scale.
        columns_to_standardize: List of column names to select from ``data``.

    Returns:
        The array produced by fitting a fresh ``StandardScaler`` on the
        selected columns and transforming those same columns.
    """
    selected = data[columns_to_standardize]
    return StandardScaler().fit_transform(selected)

def train_xgboost_model(X_train, y_train, X_val, y_val):
    """Fit an XGBoost classifier on a randomly oversampled training set.

    The training split is rebalanced with ``RandomOverSampler`` before
    fitting; the validation split is left untouched. Both splits are passed
    as evaluation sets so that ``model.evals_result()`` exposes
    ``'validation_0'`` (resampled training data) and ``'validation_1'``
    (validation data) for the "mlogloss" and "merror" metrics.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # early_stopping_rounds belongs on the estimator, next to eval_metric:
    # passing it to fit() is deprecated since XGBoost 1.6 and rejected by 2.x.
    # Per the XGBoost docs, early stopping watches the last metric on the
    # last eval_set entry, i.e. "merror" on the validation split.
    model = xgb.XGBClassifier(
        random_state=42,
        eval_metric=["mlogloss", "merror"],
        early_stopping_rounds=10,
    )
    eval_results = [(X_train_resampled, y_train_resampled), (X_val, y_val)]
    model.fit(
        X_train_resampled,
        y_train_resampled,
        eval_set=eval_results,
        verbose=True,
    )

    return model

def plot_metrics(history, metric):
    """Plot the training and validation curves of one evaluation metric.

    Args:
        history: Nested mapping indexed as ``history[split][metric]`` with
            the splits ``'validation_0'`` (drawn as training) and
            ``'validation_1'`` (drawn as validation).
        metric: Name of the metric to plot; also used, capitalized, for the
            title, the y-axis label and the legend entries.
    """
    # Resolve both series up front so a missing key fails before any figure
    # is created.
    curves = [
        ('Training ', history['validation_0'][metric]),
        ('Validation ', history['validation_1'][metric]),
    ]
    pretty_name = metric.capitalize()

    plt.figure(figsize=(10, 6))
    for prefix, values in curves:
        plt.plot(values, label=prefix + pretty_name)
    plt.title('XGBoost ' + pretty_name)
    plt.xlabel('Epochs')
    plt.ylabel(pretty_name)
    plt.legend()
    plt.show()

def save_results_to_csv(train_loss, train_accuracy, file_path):
    """Write per-epoch training results to a CSV file.

    Args:
        train_loss: Sequence of loss values, one per epoch.
        train_accuracy: Sequence of values stored in the
            ``'Training Accuracy'`` column, one per epoch.
        file_path: Destination path of the CSV file.

    The file has the columns ``Epoch`` (numbered from 1),
    ``Training Loss`` and ``Training Accuracy`` and no index column.
    """
    n_epochs = len(train_loss)
    columns = {
        'Epoch': range(1, n_epochs + 1),
        'Training Loss': train_loss,
        'Training Accuracy': train_accuracy,
    }
    pd.DataFrame(columns).to_csv(file_path, index=False)

def evaluate_model(model, X_test, y_test):
    """Print a classification report and plot confusion matrices.

    Two heatmaps are shown: the raw confusion-matrix counts, and the same
    matrix with every row divided by its row total (the fraction of each
    true class assigned to each predicted class).

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    y_pred = model.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    cm = confusion_matrix(y_test, y_pred)

    # A class that is predicted but never occurs in y_test has an all-zero
    # row; leave such rows at 0 instead of dividing by zero (NaN cells plus a
    # RuntimeWarning).
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    heatmaps = (
        (cm, 'd', "Confusion Matrix"),
        (cm_percent, '.2f', "Confusion Matrix (Percentages)"),
    )
    for matrix, number_format, title in heatmaps:
        plt.figure(figsize=(8, 8))
        sns.heatmap(matrix, annot=True, fmt=number_format, cmap='Blues', cbar=False)
        plt.title(title)
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.show()

# Define paths and columns
image_folder = '/path/to/image/folder'
csv_path = '/path/to/csv/file.csv'
image_id_column = 'IMAGE_ID'
rock_class_column = 'ROCK_CLASS'

# Load data: the log table plus one image feature vector per row
data, extracted_features, image_ids = load_data(image_folder, csv_path, image_id_column, rock_class_column)

# Define columns to standardize
columns_to_standardize = ['HSGR (gAPI)', 'HCGR (gAPI)', 'HFK (%)', 'HTHO (ppm)', 'HURA (ppm)', 'HBHK (%)', 'IDPH (ohmm)',
                          'IMPH (ohmm)', 'SFLU (ohmm)', 'RHOM (g/cm3)', 'DRH (g/cm3)', 'PEFL (barns/e-)', 'NRHB (g/cm3)',
                          'APLC (%)', 'STOF (in)', 'SIGF (cu)', 'AFEC (cps)', 'ANEC (cps)']

# Preprocess data
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split below, so test-set statistics influence the scaling -- confirm this
# is acceptable.
processed_logs = preprocess_data(data, columns_to_standardize)

# Combine features: standardized log columns first, image features after
combined_features = np.hstack((processed_logs, extracted_features))

# Split data: 20% test, then 25% of the remainder as validation
X = combined_features
# NOTE(review): the label column is hard-coded as 'rock_class_encoded' and
# must already exist in the CSV; rock_class_column above is not used here.
y = data['rock_class_encoded'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Train model
model = train_xgboost_model(X_train, y_train, X_val, y_val)

# Plot metrics
history = model.evals_result()
plot_metrics(history, 'mlogloss')
plot_metrics(history, 'merror')

# Save results to CSV
train_loss = history['validation_0']['mlogloss']
# 'merror' is the multiclass error rate (wrong / all), so the accuracy that
# the CSV column advertises is its complement.
train_accuracy = [1.0 - error_rate for error_rate in history['validation_0']['merror']]
save_results_to_csv(train_loss, train_accuracy, 'training_results.csv')

# Evaluate model
evaluate_model(model, X_test, y_test)
