# Installation librairies

In [1]:
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Fonctions

## Création du premier dataset

In [3]:
import re

def creer_dataset_initial_depuis_s3(bucket):
    colonnes = ["chemin_image", "label"]
    valeurs = recuperer_valeurs_dataset_depuis_s3(bucket, False)
    return spark.createDataFrame(valeurs, colonnes)

def recuperer_valeurs_dataset_depuis_s3(bucket, utiliser_echantillon):
    valeurs = []
    
    if utiliser_echantillon:
        expression_image = "echantillon.*\.jpg"
    else:
        expression_image = "dataset.*\.jpg"
    
    for objet_bucket in bucket.objects.all():
        if re.search(expression_image, objet_bucket.key):
            chemin_image = objet_bucket.key
            categorie_image = recuperer_categorie_image_depuis_s3(objet_bucket.key)
            valeurs.append([chemin_image, categorie_image])
            
    return valeurs

def recuperer_categorie_image_depuis_s3(cle_objet_bucket):
    liste_dossiers_fichiers = cle_objet_bucket.split('/')
    categorie_fruit = liste_dossiers_fichiers[1]
    categorie_fruit = categorie_fruit.lower()
    categorie_fruit = categorie_fruit.replace(" ", "_")
    return categorie_fruit

## Extraction des features

In [4]:
import io
import boto3
import pandas as pd
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input
from PIL import Image as pil_image

def extraire_features_depuis_s3(chemin_image):
    
    # Récupération du bucket
    bucket_p8 = recuperer_bucket_s3()
    objet = bucket_p8.Object(chemin_image)
    file_binary_stream = io.BytesIO()
    objet.download_fileobj(file_binary_stream)
    
    # Chargement et prétraitement de l'image
    #image = load_img(file_binary_stream, target_size=(224, 224))
    image = pil_image.open(file_binary_stream)
    image = image.resize((224, 224))
    
    # Convertir en tableau numpy
    image = img_to_array(image) 
    
    # Créer la collection d'images (un seul échantillon)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    
    # Prétraiter l'image comme le veut VGG-16
    image = preprocess_input(image) 
    
    # Extraire les features de l'image
    cnn_extraction_features = creer_cnn_extraction_features()
    features = cnn_extraction_features.predict(image)[0]
    features = features.tolist()
    
    file_binary_stream.close()
    
    return features
    
def recuperer_bucket_s3():
    
    # Récupération des clés d'accès AWS
    access_key_id, secret_access_key = recuperer_cles_acces_aws()
    
    # Connexion à S3
    s3 = boto3.resource(
        service_name='s3',
        region_name='eu-west-3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key
    )
    
    return s3.Bucket("oc-p8-sb-data")

def recuperer_cles_acces_aws():
    fichier_access_keys_AWS = "/content/drive/MyDrive/Data/rootkey.csv"
    df_access_keys_AWS = pd.read_csv(fichier_access_keys_AWS)
    access_key_id = df_access_keys_AWS["AWSAccessKeyId"][0]
    secret_access_key = df_access_keys_AWS["AWSSecretKey"][0]
    return access_key_id, secret_access_key

def creer_cnn_extraction_features():
    # Initialisation du modèle VGG-16 et suppression de ses 2 dernières couches (prédiction et fully-connected)
    cnn = VGG16()
    cnn = Model(inputs=cnn.inputs, outputs=cnn.layers[-2].output)
    return cnn



## Choix du nombre optimum de composantes principales pour l'ACP

In [15]:
from pyspark.ml.feature import PCA

def choisir_nombre_optimum_composantes_principales(df_images_fruits):
    nombres_composantes_principales = [100, 150, 200, 225]
    variance_expliquee = []
    
    # Récupération des variances expliquées pour chaque nombre de composantes principales
    for nombre_composantes_principales in nombres_composantes_principales:
        print("Nombre de CP étudié : {}".format(nombre_composantes_principales))
        acp = PCA(k=nombre_composantes_principales, inputCol="features_vector", outputCol="resultats_acp_vector")
        acp = acp.fit(df_images_fruits)
        variance_expliquee.append(acp.explainedVariance.sum())
    
    # Recherche du premier index atteignant 95% de variance expliquée
    index = 0
    seuil_atteint = False
    taille_listes = len(variance_expliquee)
    while index < taille_listes and not seuil_atteint:
        if variance_expliquee[index] >= 0.95:
            seuil_atteint = True
        else:
            index += 1
    
    # Renvoi du nombre optimum de composantes principales selon si le seuil a été atteint ou non
    if seuil_atteint:
        return nombres_composantes_principales[index]
    else:
        return nombres_composantes_principales[taille_listes-1]        

## Export des résultats vers S3

In [23]:
def exporter_resultats_acp_vers_s3(df_pandas_resultat_acp):
    file_binary_stream = io.BytesIO()
    df_pandas_resultat_acp.to_csv(file_binary_stream, mode="wb", header=True, sep=";", index=False)
    file_binary_stream.seek(0)
    bucket.upload_fileobj(file_binary_stream, "sortie_acp_avec_200_images.csv")
    file_binary_stream.close()
    return None

# Récupération des clés d'accès AWS

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd

fichier_access_keys_AWS = "/content/drive/MyDrive/Data/rootkey.csv"
df_access_keys_AWS = pd.read_csv(fichier_access_keys_AWS)
access_key_id = df_access_keys_AWS["AWSAccessKeyId"][0]
secret_access_key = df_access_keys_AWS["AWSSecretKey"][0]

# Connexion avec S3

In [9]:
import boto3

s3 = boto3.resource(
    service_name='s3',
    region_name='eu-west-3',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key
)

bucket = s3.Bucket("oc-p8-sb-data")

# Chargement des images depuis S3

In [10]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row

# Initialisation des configurations
appName = "oc-p8-deployer-modele-cloud"
master = "local[*]"
conf = SparkConf().setAppName(appName).setMaster(master)
conf.set('spark.driver.memory', '4g')
conf.set('spark.executor.memory', '2g')
conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# Instanciation de la SparkSession avec la configuration définie ci-avant et du SparkContext
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

df_images_fruits = creer_dataset_initial_depuis_s3(bucket)
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+
|        chemin_image|         label|
+--------------------+--------------+
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
|dataset/Apple Bra...|apple_braeburn|
+--------------------+--------------+
only showing top 20 rows



In [11]:
sc.getConf().getAll()

[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.executor.memory', '2g'),
 ('spark.app.submitTime', '1659534449625'),
 ('spark.driver.port', '37563'),
 ('spark.driver.memory', '4g'),
 ('spark.app.startTime', '1659534449928'),
 ('spark.sql.execution.arrow.maxRecordsPerBatch

# Extraction des features des images

In [12]:
from pyspark.sql.types import StructType, ArrayType, FloatType
from pyspark.sql.functions import udf

udf_extraire_features = udf(extraire_features_depuis_s3, ArrayType(FloatType()))

df_images_fruits = df_images_fruits.withColumn("features", udf_extraire_features("chemin_image"))
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+
|        chemin_image|         label|            features|
+--------------------+--------------+--------------------+
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 0...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 1...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2..

# Réduction de dimension avec l'ACP

## Conversion des features en Vector

In [13]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

udf_convertir_en_vecteur = udf(lambda x: Vectors.dense(x), VectorUDT())

In [14]:
df_images_fruits = df_images_fruits.withColumn('features_vector', udf_convertir_en_vecteur("features"))
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+--------------------+
|        chemin_image|         label|            features|     features_vector|
+--------------------+--------------+--------------------+--------------------+
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.76...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.33...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.04...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.24...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.80...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.12...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.00...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.57...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.92...|
|dataset/Apple Bra...|apple_braeburn|[0.

## ACP

In [16]:
nombre_optimum_composantes_principales = choisir_nombre_optimum_composantes_principales(df_images_fruits)

Nombre de CP étudié : 100
Nombre de CP étudié : 150
Nombre de CP étudié : 200
Nombre de CP étudié : 225


In [17]:
print("Nombre optimum de CP : {}".format(nombre_optimum_composantes_principales))

Nombre optimum de CP : 100


In [18]:
acp = PCA(k=nombre_optimum_composantes_principales, inputCol="features_vector", outputCol="resultats_acp_vector")
acp = acp.fit(df_images_fruits)

In [19]:
df_images_fruits = acp.transform(df_images_fruits)
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+--------------------+--------------------+
|        chemin_image|         label|            features|     features_vector|resultats_acp_vector|
+--------------------+--------------+--------------------+--------------------+--------------------+
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.76...|[27.2808485199140...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.33...|[29.1818517257581...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.04...|[29.6118068740900...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.24...|[29.9169282213898...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 2...|[0.0,0.0,0.0,2.80...|[29.6598651931137...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.12...|[29.7878226825621...|
|dataset/Apple Bra...|apple_braeburn|[0.0, 0.0, 0.0, 3...|[0.0,0.0,0.0,3.00...|[29.54252458

In [20]:
print('Taux de variance expliquée :', acp.explainedVariance.sum())

Taux de variance expliquée : 0.9970876176718314


# Export du résultat de la réduction de dimension vers S3

In [21]:
df_resultat_acp = df_images_fruits.select("chemin_image", "label", "resultats_acp_vector")
df_resultat_acp.show(truncate=True)

+--------------------+--------------+--------------------+
|        chemin_image|         label|resultats_acp_vector|
+--------------------+--------------+--------------------+
|dataset/Apple Bra...|apple_braeburn|[27.2808485199140...|
|dataset/Apple Bra...|apple_braeburn|[29.1818517257581...|
|dataset/Apple Bra...|apple_braeburn|[29.6118068740900...|
|dataset/Apple Bra...|apple_braeburn|[29.9169282213898...|
|dataset/Apple Bra...|apple_braeburn|[29.6598651931137...|
|dataset/Apple Bra...|apple_braeburn|[29.7878226825621...|
|dataset/Apple Bra...|apple_braeburn|[29.5425245850562...|
|dataset/Apple Bra...|apple_braeburn|[29.7766632456471...|
|dataset/Apple Bra...|apple_braeburn|[30.1339124965427...|
|dataset/Apple Bra...|apple_braeburn|[29.7994438012642...|
|dataset/Apple Bra...|apple_braeburn|[29.1209345006304...|
|dataset/Apple Bra...|apple_braeburn|[25.5335912172705...|
|dataset/Apple Bra...|apple_braeburn|[30.1633188728771...|
|dataset/Apple Bra...|apple_braeburn|[28.6079272148960..

In [22]:
df_pandas_resultat_acp = df_resultat_acp.toPandas()

In [24]:
exporter_resultats_acp_vers_s3(df_pandas_resultat_acp)