# Imports

In [41]:
import numpy as np
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## set data directories

In [22]:
train_dir = './data/train'
test_dir = './data/test'

## Images's processing params

In [23]:
img_height, img_width = 150, 150
batch_size = 32

## Using ImageDataGenerator to load and preprocess images

In [24]:
datagen = ImageDataGenerator(rescale=1./255)

# Fonction to load and preprocess images

In [25]:
def load_and_preprocess_images(directory):
    generator = datagen.flow_from_directory(
        directory,
        target_size=(img_height, img_width),
        batch_size=batch_size,
        class_mode='binary',
        shuffle=True
    )
    images, labels = [], []
    
    # Obtenir un itérateur sur le générateur
    generator_iter = iter(generator)
    
    # Utiliser next() sur l'itérateur
    for _ in range(len(generator)):
        batch = next(generator_iter)
        images.append(batch[0])
        labels.append(batch[1])
    
    images = np.concatenate(images)
    labels = np.concatenate(labels)
    return images, labels

## load train and test images

In [26]:
train_images, train_labels = load_and_preprocess_images(train_dir)
test_images, test_labels = load_and_preprocess_images(test_dir)

Found 5216 images belonging to 2 classes.
Found 624 images belonging to 2 classes.


## Resizing images to transform tehem to feature vectors

In [28]:
train_features = train_images.reshape(train_images.shape[0], -1)
test_features = test_images.reshape(test_images.shape[0], -1)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# Use a subset of data for testing

In [30]:
subset_size = 1000
indices = np.random.choice(train_features.shape[0], subset_size, replace=False)

train_features_subset = train_features[indices]
train_labels_subset = train_labels[indices]

## Initialize StandardScaler

In [31]:
scaler = StandardScaler()

## Adjust scaler on training data and transform training data

In [32]:
train_features_scaled = scaler.fit_transform(train_features_subset)


## Transform test data with scaler adjusted to training data

In [33]:
test_features_scaled = scaler.transform(test_features)

## Set the hyperparameters to test

In [34]:
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs']
}


# Initialize the logistic regression model

In [35]:
log_reg = LogisticRegression(max_iter=1000)

## Configure GridSearchCV with cross validation

In [36]:
grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

## Run GridSearchCV to find the best hyperparameters

In [37]:
grid_search.fit(train_features_scaled, train_labels_subset)

print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.2f}')

Best hyperparameters: {'C': 0.1, 'solver': 'newton-cg'}
Best cross-validation accuracy: 0.95


# Train the model with the best hyperparameters on the full set of scaled train data

In [39]:
best_log_reg = grid_search.best_estimator_
best_log_reg.fit(train_features_scaled, train_labels_subset)

# Evaluate the model on the scaled test set

In [42]:
test_predictions = best_log_reg.predict(test_features_scaled)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.74
