# Fonctions

## Création du premier dataset

In [1]:
import re

def creer_dataset_initial_depuis_s3(bucket):
    colonnes = ["chemin_image", "label"]
    valeurs = recuperer_valeurs_dataset_depuis_s3(bucket, True)
    return spark.createDataFrame(valeurs, colonnes)

def recuperer_valeurs_dataset_depuis_s3(bucket, utiliser_echantillon):
    valeurs = []
    
    if utiliser_echantillon:
        expression_image = "echantillon.*\.jpg"
    else:
        expression_image = "dataset.*\.jpg"
    
    for objet_bucket in bucket.objects.all():
        if re.search(expression_image, objet_bucket.key):
            chemin_image = objet_bucket.key
            categorie_image = recuperer_categorie_image_depuis_s3(objet_bucket.key)
            valeurs.append([chemin_image, categorie_image])
            
    return valeurs

def recuperer_categorie_image_depuis_s3(cle_objet_bucket):
    liste_dossiers_fichiers = cle_objet_bucket.split('/')
    categorie_fruit = liste_dossiers_fichiers[1]
    categorie_fruit = categorie_fruit.lower()
    categorie_fruit = categorie_fruit.replace(" ", "_")
    return categorie_fruit

## Extraction des features automatiquement depuis S3

In [None]:
import io
import boto3
import pandas as pd
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.utils import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input

def extraire_features_depuis_s3(chemin_image):
    
    # Récupération du bucket
    bucket_p8 = recuperer_bucket_s3()
    objet = bucket_p8.Object(chemin_image)
    file_binary_stream = io.BytesIO()
    objet.download_fileobj(file_binary_stream)
    
    # Chargement et prétraitement de l'image
    image = load_img(file_binary_stream, target_size=(224, 224))
    
    # Convertir en tableau numpy
    image = img_to_array(image) 
    
    # Créer la collection d'images (un seul échantillon)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    
    # Prétraiter l'image comme le veut VGG-16
    image = preprocess_input(image) 
    
    # Extraire les features de l'image
    cnn_extraction_features = creer_cnn_extraction_features()
    features = cnn_extraction_features.predict(image)[0]
    features = features.tolist()
    
    file_binary_stream.close()
    
    return features
    
def recuperer_bucket_s3():
    
    # Récupération des clés d'accès AWS
    access_key_id, secret_access_key = recuperer_cles_acces_aws()
    
    # Connexion à S3
    s3 = boto3.resource(
        service_name='s3',
        region_name='eu-west-3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key
    )
    
    return s3.Bucket("oc-p8-sb-data")

def recuperer_cles_acces_aws():
    fichier_access_keys_AWS = "rootkey.csv"
    df_access_keys_AWS = pd.read_csv(fichier_access_keys_AWS)
    access_key_id = df_access_keys_AWS["AWSAccessKeyId"][0]
    secret_access_key = df_access_keys_AWS["AWSSecretKey"][0]
    return access_key_id, secret_access_key

def creer_cnn_extraction_features():
    # Initialisation du modèle VGG-16 et suppression de ses 2 dernières couches (prédiction et fully-connected)
    cnn = VGG16()
    cnn = Model(inputs=cnn.inputs, outputs=cnn.layers[-2].output)
    return cnn

## Extraction des features "à la main" depuis S3

In [2]:
import io
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.utils import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input

def creer_cnn_extraction_features():
    # Initialisation du modèle VGG-16 et suppression de ses 2 dernières couches (prédiction et fully-connected)
    cnn = VGG16()
    cnn = Model(inputs=cnn.inputs, outputs=cnn.layers[-2].output)
    return cnn

def extraire_features_depuis_s3(chemin_image):
    
    # Récupération du bucket
    objet = bucket.Object(chemin_image)
    file_binary_stream = io.BytesIO()
    objet.download_fileobj(file_binary_stream)
    
    # Chargement et prétraitement de l'image
    image = load_img(file_binary_stream, target_size=(224, 224))
    
    # Convertir en tableau numpy
    image = img_to_array(image) 
    
    # Créer la collection d'images (un seul échantillon)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    
    # Prétraiter l'image comme le veut VGG-16
    image = preprocess_input(image) 
    
    # Extraire les features de l'image
    features = cnn_extraction_features.predict(image)[0]
    features = features.tolist()
    
    file_binary_stream.close()
    
    return features

## Choix du nombre optimum de composantes principales pour l'ACP

In [15]:
from pyspark.ml.feature import PCA

def choisir_nombre_optimum_composantes_principales(df_images_fruits):
    nombres_composantes_principales = [100, 150, 200, 225]
    variance_expliquee = []
    
    # Récupération des variances expliquées pour chaque nombre de composantes principales
    for nombre_composantes_principales in nombres_composantes_principales:
        print("Nombre de CP étudié : {}".format(nombre_composantes_principales))
        acp = PCA(k=nombre_composantes_principales, inputCol="features_vector", outputCol="resultats_acp_vector")
        acp = acp.fit(df_images_fruits)
        variance_expliquee.append(acp.explainedVariance.sum())
    
    # Recherche du premier index atteignant 95% de variance expliquée
    index = 0
    seuil_atteint = False
    taille_listes = len(variance_expliquee)
    while index < taille_listes and not seuil_atteint:
        if variance_expliquee[index] >= 0.95:
            seuil_atteint = True
        else:
            index += 1
    
    # Renvoi du nombre optimum de composantes principales selon si le seuil a été atteint ou non
    if seuil_atteint:
        return nombres_composantes_principales[index]
    else:
        return nombres_composantes_principales[taille_listes-1]        

## Export des résultats vers S3

In [4]:
def exporter_resultats_acp_vers_s3(df_pandas_resultat_acp):
    file_binary_stream = io.BytesIO()
    df_pandas_resultat_acp.to_csv(file_binary_stream, mode="wb", header=True, sep=";", index=False)
    file_binary_stream.seek(0)
    bucket.upload_fileobj(file_binary_stream, "sortie_acp.csv")
    file_binary_stream.close()
    return None

# Récupération des clés d'accès AWS

In [5]:
import pandas as pd

fichier_access_keys_AWS = "rootkey.csv"
df_access_keys_AWS = pd.read_csv(fichier_access_keys_AWS)
access_key_id = df_access_keys_AWS["AWSAccessKeyId"][0]
secret_access_key = df_access_keys_AWS["AWSSecretKey"][0]

# Connexion avec S3

In [6]:
import boto3

s3 = boto3.resource(
    service_name='s3',
    region_name='eu-west-3',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key
)

bucket = s3.Bucket("oc-p8-sb-data")

# Chargement des images depuis S3

In [7]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row

# Initialisation du programme
#appName = "[OC/P8] - Déployer un modèle sur le cloud"
#master = "local"
#conf = SparkConf().setAppName(appName).setMaster(master)
#sc = SparkContext(conf=conf).getOrCreate()
sc = SparkContext.getOrCreate()
#spark = SparkSession.builder.config('spark.driver.memory', '4g').getOrCreate()
spark = SparkSession.builder.getOrCreate()

df_images_fruits = creer_dataset_initial_depuis_s3(bucket)
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+
|        chemin_image|         label|
+--------------------+--------------+
|echantillon/Apple...|apple_braeburn|
|echantillon/Apple...|apple_braeburn|
|echantillon/Apric...|       apricot|
|echantillon/Apric...|       apricot|
|echantillon/Banan...|        banana|
|echantillon/Banan...|        banana|
+--------------------+--------------+



## Extraction des features des images

## En automatique

In [45]:
#########################################################
#    ATTENTION : à ne faire tourner que pour tester     #
#########################################################
from pyspark.sql.types import StructType, ArrayType, FloatType
from pyspark.sql.functions import udf

udf_extraire_features = udf(extraire_features_depuis_s3, ArrayType(FloatType()))

df_images_fruits = df_images_fruits.withColumn("features", udf_extraire_features("chemin_image"))
df_images_fruits.show(truncate=True)

Traceback (most recent call last):
  File "C:\Users\Administrateur\.virtualenvs\P8_Deployer-modele-cloud-18B3aG_S\lib\site-packages\pyspark\serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "C:\Users\Administrateur\.virtualenvs\P8_Deployer-modele-cloud-18B3aG_S\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
  File "C:\Users\Administrateur\.virtualenvs\P8_Deployer-modele-cloud-18B3aG_S\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.lock' object


PicklingError: Could not serialize object: TypeError: cannot pickle '_thread.lock' object

## "À la main" pour éviter de faire planter le PC

In [8]:
cnn_extraction_features = creer_cnn_extraction_features()

In [9]:
features_1 = extraire_features_depuis_s3('echantillon/Apple Braeburn/0_100.jpg')
features_2 = extraire_features_depuis_s3('echantillon/Apple Braeburn/1_100.jpg')
features_3 = extraire_features_depuis_s3('echantillon/Apricot/0_100.jpg')
features_4 = extraire_features_depuis_s3('echantillon/Apricot/1_100.jpg')
features_5 = extraire_features_depuis_s3('echantillon/Banana/0_100.jpg')
features_6 = extraire_features_depuis_s3('echantillon/Banana/1_100.jpg')



In [10]:
from pyspark.sql import Row
rdd_features = sc.parallelize([Row(chemin_image="echantillon/Apple Braeburn/0_100.jpg", features=features_1), 
                               Row(chemin_image="echantillon/Apple Braeburn/1_100.jpg", features=features_2), 
                               Row(chemin_image="echantillon/Apricot/0_100.jpg", features=features_3), 
                               Row(chemin_image="echantillon/Apricot/1_100.jpg", features=features_4), 
                               Row(chemin_image="echantillon/Banana/0_100.jpg", features=features_5), 
                               Row(chemin_image="echantillon/Banana/1_100.jpg", features=features_6)])
df_features = spark.createDataFrame(rdd_features)
df_features.show(truncate=True)

+--------------------+--------------------+
|        chemin_image|            features|
+--------------------+--------------------+
|echantillon/Apple...|[0.0, 0.0, 0.0, 0...|
|echantillon/Apple...|[0.0, 0.0, 0.0, 0...|
|echantillon/Apric...|[0.0, 0.0, 0.0, 0...|
|echantillon/Apric...|[0.0, 0.0, 0.0, 0...|
|echantillon/Banan...|[0.0, 0.0, 0.0, 1...|
|echantillon/Banan...|[0.0, 0.0, 0.3409...|
+--------------------+--------------------+



In [11]:
df_images_fruits = df_images_fruits.join(df_features, "chemin_image").sort("chemin_image")
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+
|        chemin_image|         label|            features|
+--------------------+--------------+--------------------+
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.0, 1...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.3409...|
+--------------------+--------------+--------------------+



# Réduction de dimension avec l'ACP

## Conversion des features en Vector

In [12]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

udf_convertir_en_vecteur = udf(lambda x: Vectors.dense(x), VectorUDT())

In [13]:
df_images_fruits = df_images_fruits.withColumn('features_vector', udf_convertir_en_vecteur("features"))
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+--------------------+
|        chemin_image|         label|            features|     features_vector|
+--------------------+--------------+--------------------+--------------------+
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.0, 1...|[0.0,0.0,0.0,1.30...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.3409...|[0.0,0.0,0.340979...|
+--------------------+--------------+--------------------+--------------------+



## ACP

In [16]:
nombre_optimum_composantes_principales = choisir_nombre_optimum_composantes_principales(df_images_fruits)

Nombre de CP étudié : 100
Nombre de CP étudié : 150
Nombre de CP étudié : 200
Nombre de CP étudié : 225


In [17]:
print("Nombre optimum de CP : {}".format(nombre_optimum_composantes_principales))

Nombre optimum de CP : 100


In [18]:
acp = PCA(k=nombre_optimum_composantes_principales, inputCol="features_vector", outputCol="resultats_acp_vector")
acp = acp.fit(df_images_fruits)

In [19]:
df_images_fruits = acp.transform(df_images_fruits)
df_images_fruits.persist()
df_images_fruits.show(truncate=True)

+--------------------+--------------+--------------------+--------------------+--------------------+
|        chemin_image|         label|            features|     features_vector|resultats_acp_vector|
+--------------------+--------------+--------------------+--------------------+--------------------+
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|[47.8794401749720...|
|echantillon/Apple...|apple_braeburn|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|[49.1322660292027...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|[29.8963130558993...|
|echantillon/Apric...|       apricot|[0.0, 0.0, 0.0, 0...|[0.0,0.0,0.0,0.0,...|[30.9112918822341...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.0, 1...|[0.0,0.0,0.0,1.30...|[-44.434613084136...|
|echantillon/Banan...|        banana|[0.0, 0.0, 0.3409...|[0.0,0.0,0.340979...|[-42.526938587708...|
+--------------------+--------------+--------------------+--------------------+------------

In [20]:
print('Taux de variance expliquée :', acp.explainedVariance.sum())

Taux de variance expliquée : 1.0000000000000027


# Export du résultat de la réduction de dimension vers S3

In [21]:
df_resultat_acp = df_images_fruits.select("chemin_image", "label", "resultats_acp_vector")
df_resultat_acp.persist()
df_resultat_acp.show(truncate=True)

+--------------------+--------------+--------------------+
|        chemin_image|         label|resultats_acp_vector|
+--------------------+--------------+--------------------+
|echantillon/Apple...|apple_braeburn|[47.8794401749720...|
|echantillon/Apple...|apple_braeburn|[49.1322660292027...|
|echantillon/Apric...|       apricot|[29.8963130558993...|
|echantillon/Apric...|       apricot|[30.9112918822341...|
|echantillon/Banan...|        banana|[-44.434613084136...|
|echantillon/Banan...|        banana|[-42.526938587708...|
+--------------------+--------------+--------------------+



In [22]:
df_pandas_resultat_acp = df_resultat_acp.toPandas()

In [23]:
exporter_resultats_acp_vers_s3(df_pandas_resultat_acp)