In [None]:
# Oris data scientist test

# Requirement for the code to run
# Using python 3.12.3, run the following commands
pip install numpy
pip install pandas
pip install -U scikit-learn
pip install tensorflow
pip install keras

In [1]:
# Tensorflow for image feature extraction and scikit-learn for classification.
import os
import time
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.applications.vgg16 import preprocess_input as preprocess_input_vgg
from tensorflow.keras.applications.resnet50 import preprocess_input as preprocess_input_resnet
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

In [None]:
# I chose both VGG16 and ResNet50 since (from my understanding) they showed recently good results for image classification
# as a feature extractions algorithms, while VGG16 is a bit old, it could be used as a benchmark for comparing results later on
# with new algorithms such as ResNet50.

# The rest of the parameters such as image size (224, 224) and batch_size, I used what the general consensus agrees on 
# when dealing with image classification problems.

In [71]:
# The purpopse of this class to have the ability to choose from different keras models
# or different classification models, here I have added VGG16 and ResNet50, but we can imagine
# adding later VGG19 or some other models, same goes for classification methods.

class QuarryClassifier:
    def __init__(self, base_model='vgg', clf='rf', batch_size=32, fine_tune_flag=False):

        if base_model is None or base_model == 'vgg':
            self.base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
            self.preprocessor = preprocess_input_vgg
        else:
            self.base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
            self.preprocessor = preprocess_input_resnet
        self.model = self.base_model

        if clf is None or clf == 'rf':
            self.clf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=2)
        else:
            self.clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=42)

        self.batch_size = batch_size
        self.fine_tune_flag = fine_tune_flag

    def load_and_preprocess_image(self, image_paths, preprocess_input):
        # Load and transform an image to a numpy array of size (224, 224)
        batch_images = []
        for path in image_paths:
            img = load_img(path, target_size=(224, 224))
            img_array = img_to_array(img)
            img_array = preprocess_input(img_array)
            batch_images.append(img_array)
        return np.array(batch_images)

    def extract_features(self, image_paths):
        # Batch image loading and feature extraction.
        n_samples_train = len(image_paths)

        image_data = []

        for i in range(0, n_samples_train, self.batch_size):
            train_image_batch = self.load_and_preprocess_image(image_paths.iloc[i:i+self.batch_size], self.preprocessor)
            if not self.fine_tune_flag:
                train_image_batch = self.base_model.predict(train_image_batch)
            image_data.append(train_image_batch)

        image_data = np.vstack(image_data)
        if not self.fine_tune_flag:
            image_data = image_data.reshape(image_data.shape[0], -1)
        return image_data

    def save_features(self, features_data, file_name='train.npy'):
        with open(file_name, 'wb') as file:
            np.save(file, features_data)

    def fit_classifier(self, features_data, y_train):
        # Fit a classifier using the extracted features from images.
        self.clf.fit(features_data, y_train)

    def predict_classifier(self, features_test_data):
        # Predict new image data using a classification method like RF or GBT.
        y_pred = self.clf.predict(features_test_data)
        return y_pred

    def build_fine_tune_model(self, base_model, num_classes=2):
        # For tuning purposed, build the different layers of the chosen model.
        for layer in base_model.layers:
            layer.trainable = False  # Freeze the convolutional base

        x = base_model.output
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)

        # For prediction purposes.
        predictions = Dense(num_classes, activation='softmax')(x)

        fine_tune_model = Model(inputs=base_model.input, outputs=predictions)
        return fine_tune_model

    def compile_fine_tune_models(self):
        self.model = self.build_fine_tune_model(self.base_model)
        self.model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    def fine_tune(self, image_path, labels, val_split_rate=0.2, epochs=20, model_name='fine_tuned.keras'):
        # Fine tune a keras model using different available options like validation rate and number of epochs.
        train_images = self.extract_features(image_path)
        datagen = ImageDataGenerator(validation_split=val_split_rate)

        train_generator = datagen.flow(train_images, to_categorical(labels), batch_size=self.batch_size, subset='training')
        validation_generator = datagen.flow(train_images, to_categorical(labels), batch_size=self.batch_size, subset='validation')

        callbacks = [EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
                     ModelCheckpoint(model_name, save_best_only=True)]

        self.model.fit(train_generator, validation_data=validation_generator, epochs=epochs, callbacks=callbacks)

    def save_model(self, model_name):
        self.model.save(model_name)

    def load_model(self, model_name):
        self.model = load_model(model_name)
    
    def load_fine_tune_model_weights(self, model_name):
        self.model.load_weights(model_name)

    def predict(self, test_images):
        return self.model.predict(test_images)


In [2]:
# Load the dataset.

train_data = pd.read_csv('data/X_train.csv')
test_data = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/Y_train.csv')['output'].values
y_test = pd.read_csv('data/Y_test.csv')['output'].values

# Display the first few rows of the dataset
print(train_data.head())

# Check for missing values
print(train_data.isnull().sum())

##############################################################################################
# In this test, I only used the images to predict whether an image is a quarry or not,
# we can imagine using different availabe features but stacking the image features with latitude, longitude and site_type,
# then fit and train a random forest or any other binary classification model.
##############################################################################################

# train_structured_data = self.train_data[['latitude', 'longitude', 'site_type']].values
# test_structured_data = self.test_data[['latitude', 'longitude', 'site_type']].values

               site_name  company_name  \
0       Carrières Daniel           NaN   
1         Malet Horgues.           NaN   
2          Carrieres Plo           NaN   
3  Dastugue Jean et Fils           NaN   
4  Pema Carrières Du Roc           NaN   

                                                link  latitude  longitude  \
0  https://www.google.com/maps/place/Carri%C3%A8r...   43.0477    -0.0473   
1  https://www.google.com/maps/place/Malet+Horgue...   43.1903     0.0916   
2  https://www.google.com/maps/place/Carrieres+Pl...   42.9605     0.3937   
3  https://www.google.com/maps/place/Dastugue+Jea...   43.0689     0.3865   
4  https://www.google.com/maps/place/Pema+Carri%C...   43.6493     0.3439   

                                             address homepage    phone  \
0                      Le Village, 65100 Ger, France      NaN  #ERROR!   
1             Chem. de Mansas, 65310 Horgues, France      NaN      NaN   
2                       Village, 65410 Ilhet, France      NaN 

In [None]:
# Also we can infer from the calculating the null data, that 'company_name', 'homepage', 'phone', 
# 'sales_phone', 'email', 'sales_email', 'opening_hours' are not important due to a high number of null values.
# 'site_type' is almost identical, so it can also be dropped.
# But for the sake of simplicity, I didn't use any feature other than the image itself for training and prediction.

In [None]:
# OPTION 1: By using the base keras model to extact image features with no FINE TUNING 
# and a then predict using standard classifier

fine_tune_flag = False 
classifier = QuarryClassifier(base_model='resnet50', clf='rf', fine_tune_flag=fine_tune_flag)

train_features = classifier.extract_features(train_data['image_path'])
test_features = classifier.extract_features(test_data['image_path'])

# Saving the features for futur use and better inference time.
classifier.save_features(train_features, file_name='base_resnet_train.npy')
classifier.save_features(test_features, file_name='base_resnet_test.npy')

start_time = time.time()
# With a random forest or a gbt classifier
classifier.fit_classifier(train_features, y_train)
y_pred = classifier.predict_classifier(test_features)
end_time = time.time()
print('Ensemble classifier model report:')
print(classification_report(y_test, y_pred))
print('Inference time with an ensemble method:', str(round(end_time - start_time, 2)))

In [None]:
# OPTION 2: By FINE TUNING the base model and use it for prediction
fine_tune_flag = True 
classifier = QuarryClassifier(base_model='resnet50', fine_tune_flag=fine_tune_flag)
classifier.compile_fine_tune_models()
classifier.fine_tune(train_data['image_path'], y_train, val_split_rate=0.2, epochs=10, model_name='resnet_tuned_weights_ep10.keras')

start_time = time.time()
test_features = classifier.extract_features(test_data['image_path'])
classifier.save_features(test_features, file_name='tuned_resnet_test.npy')
y_pred = classifier.predict(test_features)
end_time = time.time()
print(classification_report(y_test, np.argmax(y_pred, axis=1)))
print('Inference Time:', str(round(end_time - start_time, 2)))

classifier.save_model('resnet_tuned_10e.keras')

In [55]:
# OPTION 3: Read previously generated image features for training and prediction and better inference time
classifier = QuarryClassifier(base_model='resnet50', clf='rf', fine_tune_flag=False)

with open('base_resnet_train.npy', 'rb') as file:
   train_features = np.load(file)
with open('base_resnet_test.npy', 'rb') as file:
   test_features = np.load(file)

start_time = time.time()
# With a random forest or a gbt classifier
classifier.fit_classifier(train_features, y_train)
y_pred = classifier.predict_classifier(test_features)
end_time = time.time()
print('Ensemble classifier model report:')
print(classification_report(y_test, y_pred))
print('Inference time with an ensemble method:', str(round(end_time - start_time, 2)))

Ensemble classifier model report:
              precision    recall  f1-score   support

           0       0.48      0.49      0.49       256
           1       0.51      0.50      0.50       269

    accuracy                           0.50       525
   macro avg       0.50      0.50      0.50       525
weighted avg       0.50      0.50      0.50       525

Inference time with an ensemble method: 4.29


In [75]:
# OPTION 4: Load previously tuned model for prediction purposes.
classifier = QuarryClassifier(base_model='restnet50')
classifier.load_model('resnet_tuned_e10.keras')
with open('tuned_resnet_test.npy', 'rb') as file:
   test_features = np.load(file)
start_time = time.time()
y_pred = classifier.predict(test_features)
end_time = time.time()
print(classification_report(y_test, np.argmax(y_pred, axis=1)))
print('Inference Time:', str(round(end_time - start_time, 2)))

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2s/step
              precision    recall  f1-score   support

           0       0.49      0.58      0.53       256
           1       0.52      0.43      0.47       269

    accuracy                           0.50       525
   macro avg       0.50      0.50      0.50       525
weighted avg       0.50      0.50      0.50       525

Inference Time: 32.14


In [None]:
# Enhancement
# 1) Results could be improved by using the rest of the features like 'latitude', 'longitude' and 'site_type'
# 2) No paramter tuning was done which gived a room for improvement by trying different parameters values,
# parameters such as number of epochs and validation rate, or by using cross validation for better insight of the results.

# 3) Same goes for the classifier, we can try with different number of trees or max depth depending on the algorithm.

# 4) I don't have much experience with image processing algorithms, more in-depth knowledge about the algorithms
# could give place for better understanding and better results, e.g. using segmentation algorithms instead of feature extraction.

# 5) Inference time could still be improved by saving the classifier model.