# Machine Learning Project - MRI Brain Tumor Dataset
Mouna Ahamdy et Roc de Larouzière

In [None]:
#All necessary imports :

#Import des bibliothèques de base
import os
import matplotlib.pyplot as plt
import numpy as np

#Traitement et analyse d'images
from PIL import Image
import cv2

#Import de Tensorflow et Keras
import tensorflow
from keras import datasets, layers, models
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, GlobalAveragePooling2D
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16, ResNet50
from keras.applications.vgg16 import preprocess_input as preprocess_inputvgg16
from keras.applications.resnet50 import preprocess_input as preprocess_inputresnet50



#Import de Scikit Learn
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, SVC
from sklearn.decomposition import PCA



# Image size constant ("taille image" = image size).
# NOTE(review): not referenced anywhere in the visible code, which hard-codes 224
# for the model input shape and the generator target size — presumably this is
# meant to be that side length in pixels; confirm before relying on it.
TAILLE_IMAGE = 224




## Step 1 : Loading the data


In [None]:
# Download the dataset through the Kaggle API (Mouna's loading path).
# NOTE(review): kaggle.json is copied from /content/drive/MyDrive, so the Google
# Drive mount cell presumably has to be run before this one — confirm.
# Install the Kaggle command-line client
! pip install -q kaggle
# Create the folder where the Kaggle client looks for its credentials
! mkdir ~/.kaggle/
# Copy the API token from Google Drive and restrict it to owner read/write
! cp "/content/drive/MyDrive/kaggle.json" ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
# Download the brain tumor MRI dataset archive and unpack it into ./archive
! kaggle datasets download -d masoudnickparvar/brain-tumor-mri-dataset
! unzip /content/brain-tumor-mri-dataset.zip -d archive

In [None]:
# Load directly from Google Drive once the dataset has been uploaded there (Roc's loading path)
from google.colab import drive
# Mount the Drive under /content/drive so its files are reachable as local paths
drive.mount('/content/drive')

## Step 2 : Visualisation of the dataset and getting the labels

In [None]:
# Locations of the training and testing image folders.

# Mouna: dataset unzipped into /content/archive by the Kaggle download cell
train_dir, test_dir = (
    "/content/archive/Training/",
    "/content/archive/Testing/",
)

# Roc: dataset stored on the mounted Google Drive (uncomment to use instead)
#train_dir = "/content/drive/My Drive/archive/Training/"
#test_dir = "/content/drive/My Drive/archive/Testing/"

In [None]:
# Full paths of every entry found directly under the training / testing folders
train_subdirs = [os.path.join(train_dir, entry) for entry in os.listdir(train_dir)]
test_subdirs = [os.path.join(test_dir, entry) for entry in os.listdir(test_dir)]

# Total number of files found (recursively, via os.walk) under those entries
num_train_samples = sum(
    len(filenames)
    for folder in train_subdirs
    for _root, _dirs, filenames in os.walk(folder)
)
num_test_samples = sum(
    len(filenames)
    for folder in test_subdirs
    for _root, _dirs, filenames in os.walk(folder)
)


#Label extraction
def SetLabels(dir) :
    """Build integer class labels from the sub-folders of a dataset directory.

    Each sub-folder of ``dir`` is one class. Sub-folders are visited in sorted
    order and entries that are not directories are skipped, which mirrors how
    Keras ``flow_from_directory`` assigns class indices (alphanumeric order of
    the sub-directories). This keeps the labels aligned with features extracted
    through an unshuffled directory iterator, and makes the index of a given
    class identical for the training and testing folders. Plain ``os.listdir``
    order is arbitrary and guarantees neither.

    Args:
        dir: path of the dataset split (contains one sub-folder per class).

    Returns:
        A tuple ``(labels, labelsCorrespondance)`` where ``labels`` is a list
        holding the class index once per entry of the class sub-folder, grouped
        by class in index order, and ``labelsCorrespondance`` is a set of
        ``(class_index, class_name, entry_count)`` tuples.

    Note:
        ``entry_count`` counts every entry of the sub-folder, whatever its
        extension. NOTE(review): an image iterator may ignore non-image files,
        so the counts only match if the folders contain images exclusively.
    """
    labelsCorrespondance = set()
    labels = []
    # Sorted + directories only: deterministic indices, no crash on stray files
    class_names = sorted(
        name for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    for class_index, class_name in enumerate(class_names):
        entry_count = len(os.listdir(os.path.join(dir, class_name)))
        labelsCorrespondance.add((class_index, class_name, entry_count))
        labels.extend([class_index] * entry_count)
    return labels, labelsCorrespondance


# Build, for each split, the label list and the set of
# (class index, class name, number of entries) tuples returned by SetLabels
y_train, y_trainCorrespondance = SetLabels(train_dir)
y_test, y_testCorrespondance = SetLabels(test_dir)


#Pie chart used to visualise how the samples are spread across classes
def PieChart(labelsCorrespondance, num_samples, title) :
    """Show a pie chart of the share of each class in a dataset split.

    Args:
        labelsCorrespondance: iterable of tuples whose item 1 is the class name
            and item 2 the number of samples of that class.
        num_samples: total used as the denominator of each share.
        title: title displayed on the figure.
    """
    names = []
    shares = []
    for entry in labelsCorrespondance:
        names.append(entry[1])
        shares.append(entry[2] / num_samples)

    figure, axes = plt.subplots()
    axes.pie(shares, labels=names, autopct='%1.1f%%', startangle=90)
    # Equal aspect ratio so the pie is drawn as a circle
    axes.axis('equal')
    plt.title(title)
    plt.show()



#Print the per-class counts of each split, then draw the two pie charts
def _print_split_counts(correspondance, total, suffix):
    """Print one line per class: its count, its name and the split total."""
    for entry in correspondance:
        print("Il y a ", entry[2], " exemples de tumeur de type ", entry[1], " pour un ensemble de ", total, suffix)


print('DATASET TRAIN PART : ')
_print_split_counts(y_trainCorrespondance, num_train_samples, " données train")

print("\n\n")
print('DATASET TEST PART : ')
_print_split_counts(y_testCorrespondance, num_test_samples, " données test")

print("\n\n\n\n\n\n")
PieChart(y_trainCorrespondance, num_train_samples, "Repartition des données dans la partie Train du Dataset")
PieChart(y_testCorrespondance, num_test_samples, "Repartition des données dans la partie Test du Dataset")



## Step 3 : Embedding the dataset

In [None]:
#VGG16 model used to embed the X_train and X_test images of the dataset
def vgg16model() :
    """Build a frozen VGG16 feature extractor producing one 512-wide vector per image.

    The ImageNet-pretrained convolutional base (without its classification top)
    is followed by a global average pooling layer, which has no weights.

    The previous head stacked three Dense layers on top of the frozen base.
    Those layers were randomly initialised and never trained, so every call to
    this function produced a different random projection; embedding the train
    and test sets with two separate calls gave incompatible feature spaces.
    Pooling keeps the output deterministic and the output width at 512 (the
    number of channels of the last VGG16 convolutional block).

    Returns:
        A Keras ``Model`` mapping (224, 224, 3) images to 512-wide embeddings.
    """
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # The pretrained weights are only used for inference: freeze every layer
    for layer in base_model.layers:
        layer.trainable = False

    # Weight-free reduction of the final feature maps to one value per channel
    x = GlobalAveragePooling2D()(base_model.output)

    model = Model(inputs=base_model.input, outputs=x)

    return model


#ResNet50 model used to embed the X_train and X_test images of the dataset
def resnetmodel() :
    """Build a frozen ResNet50 feature extractor ending in global average pooling.

    Returns:
        A Keras ``Model`` taking (224, 224, 3) images and returning the pooled
        output of the ImageNet-pretrained ResNet50 base (no classification top).
    """
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze every pretrained layer
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False

    # Global average pooling to reduce the dimensionality of the output
    pooled = GlobalAveragePooling2D()(backbone.output)

    return Model(inputs=backbone.input, outputs=pooled)


In [None]:
#extract_features preprocesses the images of a dataset folder with Keras' ImageDataGenerator
#and runs them through an embedding model.
#modelsizeoutput must equal the width of the model output: 512 for vgg16model(), 2048 for resnetmodel()

def extract_features(directory, sample_count, modelsizeoutput, model, preprocess_input):
    """Embed the first ``sample_count`` images of ``directory`` with ``model``.

    Images are read in a fixed (unshuffled) order through
    ``ImageDataGenerator.flow_from_directory``, resized to 224x224 and passed
    through ``preprocess_input`` before prediction.

    Args:
        directory: dataset split folder containing one sub-folder per class.
        sample_count: number of images to embed (rows of the result).
        modelsizeoutput: width of the vectors returned by ``model``.
        model: object exposing ``predict(batch, verbose=...)``.
        preprocess_input: preprocessing function applied to each image.

    Returns:
        A ``(sample_count, modelsizeoutput)`` NumPy array of embeddings.

    Raises:
        ValueError: if the directory holds fewer images than ``sample_count``
            (the iterator would otherwise loop around and repeat images), or
            if the model output is not ``modelsizeoutput`` wide.
    """
    batch_size = 32
    generator = ImageDataGenerator(preprocessing_function=preprocess_input)
    image_data = generator.flow_from_directory(
        directory,
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False
    )
    if image_data.samples < sample_count:
        raise ValueError(
            f"{directory!r} contains {image_data.samples} usable images, "
            f"but {sample_count} were requested."
        )

    features = np.zeros((sample_count, modelsizeoutput))
    filled = 0       # number of rows of `features` already written
    batch_number = 0
    # The directory iterator never stops by itself: pull batches explicitly
    # until exactly sample_count rows have been written.
    while filled < sample_count:
        inputs_batch = next(image_data)
        features_batch = model.predict(inputs_batch, verbose=1)
        if features_batch.shape[1:] != (modelsizeoutput,):
            raise ValueError(
                f"Model output has shape {features_batch.shape[1:]} per image, "
                f"expected ({modelsizeoutput},)."
            )
        # The last batch may be shorter than batch_size or overshoot sample_count
        take = min(len(features_batch), sample_count - filled)
        features[filled:filled + take] = features_batch[:take]
        filled += take
        batch_number += 1
        print(f"Traitement du lot {batch_number}, {len(inputs_batch)} images traitées.")
    return features




In [None]:
# Extract features
# The embedding model is built once and reused for both splits: this avoids
# loading the pretrained weights twice and guarantees that train and test
# images go through exactly the same network.

#If the VGG16 model is used (its output is 512 wide):
#embedding_model = vgg16model()
#train_features = extract_features(train_dir, num_train_samples, 512, embedding_model, preprocess_inputvgg16)
#test_features = extract_features(test_dir, num_test_samples, 512, embedding_model, preprocess_inputvgg16)

#If the ResNet50 model is used:
embedding_model = resnetmodel()
train_features = extract_features(train_dir, num_train_samples, 2048, embedding_model, preprocess_inputresnet50)
test_features = extract_features(test_dir, num_test_samples, 2048, embedding_model, preprocess_inputresnet50)

In [None]:
#PCA

# Project the embeddings onto 256 principal components (seeded for reproducibility).
# The projection is fitted on the training features only, then applied unchanged
# to the test features.
pca = PCA(n_components=256, random_state=42)
train_features_pca = pca.fit_transform(train_features)
test_features_pca = pca.transform(test_features)


## Step 4 : Classification

In [None]:
#Logistic Regression
skf = StratifiedKFold(n_splits=5)
model = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=10000)  # multi_class='ovr' for one-vs-all

# Cross-validate on the PCA-reduced features: this is the representation the
# final model is fitted and tested on below, so the cross-validation accuracy
# now estimates that same model (it was previously computed on the raw features).
scores = cross_val_score(model, train_features_pca, y_train, cv=skf, scoring='accuracy', verbose=10)
print("Accuracy de chaque fold:", scores)
print("Accuracy moyenne:", np.mean(scores))

# Train the model on the whole training set
model.fit(train_features_pca, y_train)

# Predict the labels of the test set
y_pred = model.predict(test_features_pca)

# Compute the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy sur l'ensemble de test:", test_accuracy)
lg_accuracy = test_accuracy

In [None]:
#Linear SVM

# Candidate values for the regularisation parameter C
param_grid = {'C': [0.01, 1, 5, 10, 25, 50, 80, 100]}
skf = StratifiedKFold(n_splits=5)
model = LinearSVC(dual=False, multi_class='ovr', max_iter=5000)

# Grid search over C, scored by accuracy with 5-fold stratified cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
    verbose=10,
)
grid_search.fit(train_features_pca, y_train)

print("Meilleur paramètre (C):", grid_search.best_params_)
print("Meilleure accuracy de cross-validation:", grid_search.best_score_)

# Evaluate the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_features_pca)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy sur l'ensemble de test:", test_accuracy)

lsvm_accuracy = test_accuracy



In [None]:
#Non-linear SVM (RBF kernel)

# Shuffled, seeded 5-fold stratified split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Values explored for ResNet50 features.
# NOTE(review): the original note cited C = 1.19 as giving the best accuracy,
# but that value is not part of this grid — confirm which grid is intended.
param_grid = {
    'C': [6.8, 7, 8, 9],
    'gamma': ['scale', 'auto'],
}

model = SVC(kernel='rbf', max_iter=10000)

# Grid search over (C, gamma), scored by cross-validated accuracy
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
    verbose=10,
)
grid_search.fit(train_features_pca, y_train)

print("Meilleur paramètre (C, gamma):", grid_search.best_params_)
print("Meilleure accuracy de cross-validation:", grid_search.best_score_)

# Evaluate the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_features_pca)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy sur l'ensemble de test:", test_accuracy)
nlsvm_accuracy = test_accuracy