# Flow ML Course

# TP 6 : Projet Complet

L'objectif de ce TP est d'utiliser tous les outils vus dans les TP précédents pour entraîner et déposer un modèle sur la plateforme Flow ML de GCP.

## Initialisation

In [1]:
# Install/upgrade the google-cloud-aiplatform SDK with its `tensorboard` extra and pin TensorFlow to 2.15.1.
! pip3 install --upgrade --quiet google-cloud-aiplatform[tensorboard] \
                                 tensorflow==2.15.1

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs (the tuning cells below write their event files under ./logs/)
!rm -rf ./logs/


In [28]:
# Short student identifier; reused below to build run, TensorBoard, model and endpoint names.
your_name = "fvi"

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib
import os
import gcsfs

from google.cloud import aiplatform
from google.cloud import storage

# Import TensorFlow and the TensorBoard HParams plugin
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp


# Target GCP project and region used by every Vertex AI call in this notebook.
PROJECT_ID = "projet-ia-448520"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Course bucket that holds the datasets and the exported models.
BUCKET_URI = "gs://cours1bucket"

# Register the project and region as defaults for the aiplatform SDK.
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Storage handle on the course bucket; the client takes the bare bucket
# name, so the "gs://" scheme is stripped from BUCKET_URI.
client = storage.Client()
bucket = client.bucket(BUCKET_URI.split("//")[1])


## Création Données [PROF]

In [12]:



# Original Data file : https://www.kaggle.com/datasets/whenamancodes/students-performance-in-exams?resource=download

# Download it manually from Kaggle first if exams.csv is not present locally.
data = pd.read_csv("exams.csv", sep=",")

# Destination objects in the course bucket.
# NOTE(review): the extraction cells further down read data/exams_train.csv
# and data/exams_test.csv, not these object names — confirm which naming is
# intended.
train_blob = bucket.blob('data/dataset_train_course_1_6.csv')
test_blob = bucket.blob('data/dataset_test_course_1_6.csv')

# Reproducible 80/20 train/test split.
train, test = train_test_split(data, test_size=.2, random_state=47)

train.to_csv("exams_train.csv", index=False)
test.to_csv("exams_test.csv", index=False)

# Upload straight from disk. upload_from_file() expects a handle opened in
# binary mode, whereas the files were previously opened in text mode ('r').
train_blob.upload_from_filename("exams_train.csv")
test_blob.upload_from_filename("exams_test.csv")


## Extraction des données

Depuis un bucket GCP, on utilisera gcsfs.

Utilisez l'exemple du fichier exams_train.csv pour extraire exams_test.csv


Le fichier de base est disponible ici : https://www.kaggle.com/datasets/whenamancodes/students-performance-in-exams?resource=download

In [None]:


# Load the training CSV from the course bucket through gcsfs.
fs = gcsfs.GCSFileSystem(project=PROJECT_ID)
# The path handed to fs.open is "<bucket name>/data/exams_train.csv": the "gs://" scheme is stripped from BUCKET_URI.
with fs.open(f"{BUCKET_URI.split('//')[1]}/data/exams_train.csv") as f:
    df_train = pd.read_csv(f)
# Last expression of the cell, so the notebook displays the DataFrame.
df_train

In [None]:

## Your Turn
# Same pattern as the training set above, applied to data/exams_test.csv.

fs = gcsfs.GCSFileSystem(project=PROJECT_ID)
with fs.open(f"{BUCKET_URI.split('//')[1]}/data/exams_test.csv") as f:
    df_test = pd.read_csv(f)
# Last expression of the cell, so the notebook displays the DataFrame.
df_test

## Explore & Transform your data

L'objectif est de prédire le "math score".

Analyser, explorer et transformer les données pour pouvoir prédire le score.

In [54]:
# Features are the reading and writing scores; the target is the math score.
X_train = df_train[["reading score", "writing score"]].values
y_train = df_train["math score"].values
# Evaluate on the held-out test split, not on the rows used for fitting.
X_test = df_test[["reading score", "writing score"]].values
y_test = df_test["math score"].values

# A single mean/std pair computed over both training feature columns; the
# same pair is applied to features and to the target.
score_mean, score_std = X_train.mean(), X_train.std()


def normalize_score(x):
    """Standardise scores with the training mean and standard deviation."""
    return (x - score_mean) / score_std


def unormalize_score(x):
    """Invert ``normalize_score``, mapping values back to the score scale."""
    return (x * score_std) + score_mean

## Training

### Model Selection

Choisissez 3 modèles à entraîner pour répondre à la problématique du dataset.

In [35]:
# Names of the models to train; train_test_model() below only handles "LinearRegression".
model_names = ["LinearRegression"]

### Training Method


Utilisez cet espace pour créer les méthodes qui vous aideront à entraîner les modèles.


Conseil :

 Enregistrez vos modèles à chacun de vos entraînements, en les déposant dans l'espace de stockage Google (Cloud Storage) via ce snippet :


'''

    blob = bucket.blob(MODEL_DIR)

    blob.upload_from_filename(model_file)
'''


Modifiez MODEL_DIR pour créer un sous-répertoire à votre nom où vous pourrez stocker vos informations.

In [189]:


def save_model(model, model_name, hparams):
    """Serialise a fitted model with joblib and upload it to the course bucket.

    The model is dumped locally to ``<model_name>_<config>/model.joblib`` and
    uploaded as
    ``sup_aero_vertex_course_1_6/<model_name>/<config>/model.joblib``, where
    ``<config>`` is the hyperparameters encoded as ``name_value`` pairs joined
    by ``__``.

    Args:
        model: Fitted estimator to persist.
        model_name: Model family name, used as a path component.
        hparams: Mapping of hyperparameter name to value.
    """
    config = "__".join(f"{key}_{value}" for key, value in hparams.items())

    MODEL_DIR = f"sup_aero_vertex_course_1_6/{model_name}/{config}/"

    local_dir = f"{model_name}_{config}"
    model_file = f"{local_dir}/model.joblib"
    # Idempotent across re-runs of the same configuration.
    os.makedirs(local_dir, exist_ok=True)
    joblib.dump(model, model_file)

    # The object name must include the file name. Uploading to the bare
    # "…/" prefix creates an object named like the directory, and no
    # model.joblib exists under the artifact directory for the registry.
    blob = bucket.blob(f"{MODEL_DIR}model.joblib")
    blob.upload_from_filename(model_file)
    print(MODEL_DIR, model_file)
    

def run(run_dir, model_name, hparams):
    """Execute one tuning trial and log it under ``run_dir``.

    Records the hyperparameters, trains and evaluates the model, writes the
    resulting metric as a scalar summary and finally saves the model.
    """
    writer = tf.summary.create_file_writer(run_dir)
    with writer.as_default():
        # Record the values used in this trial.
        hp.hparams(hparams)
        fitted_model, test_mse = train_test_model(model_name, hparams)
        tf.summary.scalar(METRIC_NAME, test_mse, step=1)

        save_model(fitted_model, model_name, hparams)

        
        
def train_test_model(model_name, hparams):
    """Fit the requested model on the training split and score it on the test split.

    Args:
        model_name: Model family to instantiate; only ``"LinearRegression"``
            is supported.
        hparams: Keyword arguments forwarded to the model constructor.

    Returns:
        Tuple ``(model, mse)``: the fitted estimator and its mean squared
        error on the test set, expressed on the original score scale.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
    """
    if model_name == "LinearRegression":
        model = LinearRegression(**hparams)
    else:
        # Fail explicitly instead of hitting an unbound ``model`` below.
        raise ValueError(f"Unsupported model name: {model_name!r}")

    # Features and target are both standardised before fitting.
    model.fit(normalize_score(X_train), normalize_score(y_train))

    y_pred = model.predict(normalize_score(X_test))

    # Predictions live in normalised space and must be mapped back; y_test
    # is already on the raw score scale and must NOT be un-normalised.
    mse = mean_squared_error(y_test, unormalize_score(y_pred))

    return model, mse
    

### Hyperparamétrage

Définissez ici les plages d'hyperparamètres ainsi que la métrique d'évaluation.

In [190]:
# Search space for model 1: the "fit_intercept" hyperparameter takes the discrete values False and True.
fit_intercept_model_1 = hp.HParam("fit_intercept", hp.Discrete([False, True]))

In [191]:

# Name under which run() logs the evaluation metric as a scalar summary.
METRIC_NAME = "MSE"

# Declare the tuned hyperparameters and the metric for the HParams plugin,
# written to the directory under which the individual runs are logged.
with tf.summary.create_file_writer("logs/hparam_tuning_model_1").as_default():
    hp.hparams_config(
        hparams=[fit_intercept_model_1],
        metrics=[hp.Metric(METRIC_NAME, display_name=METRIC_NAME)],
    )
    


### Train

Entraînez vos différents modèles avec les différentes combinaisons de paramètres.

In [None]:
    
# Counter used to give each trial a distinct run name.
session_num = 0

for fit_intercept in fit_intercept_model_1.domain.values:
    # Trials are keyed by HParam object; flatten once to plain
    # {name: value} pairs, which is what gets printed and passed to run().
    hparams = {fit_intercept_model_1: fit_intercept}
    named_hparams = {h.name: value for h, value in hparams.items()}

    run_name = f"run-{your_name}-%d" % session_num
    print("--- Starting trial: %s" % run_name)
    print(named_hparams)
    run(
        "logs/hparam_tuning_model_1/" + run_name,
        "LinearRegression",
        named_hparams,
    )
    session_num += 1

### Tensorboard

Poussez chaque modèle dans des expériences TensorBoard.

In [216]:
# Model whose runs are pushed and registered below; its lower-cased form is embedded in experiment and model display names.
model_name = "LinearRegression"

In [None]:
# Set the display name for your tensorboard instance

TENSORBOARD_NAME = f"supaero-vertex-course-1-6-tb-{PROJECT_ID}-{your_name}"  # @param {type:"string"}

# Create a TensorBoard instance in the configured project and region.
tensorboard = aiplatform.Tensorboard.create(
    display_name=TENSORBOARD_NAME, project=PROJECT_ID, location=LOCATION
)
# Resource name of the created instance; the uploader command below takes it as a shell variable.
TENSORBOARD_RESOURCE_NAME = tensorboard.gca_resource.name
print("TensorBoard resource name:", TENSORBOARD_RESOURCE_NAME)

# Experiment name, also passed to the uploader; built from project, student and lower-cased model name.
EXPERIMENT_NAME = f"supaero-vertex-course-1-6-{PROJECT_ID}-{your_name}-{model_name.lower()}"  # @param {type:"string"}

In [None]:
# Upload the event files under logs/hparam_tuning_model_1/ (one-shot mode) to the TensorBoard resource and experiment named by the variables defined above.
!tb-gcp-uploader --one_shot=True --tensorboard_resource_name=$TENSORBOARD_RESOURCE_NAME --logdir="logs/hparam_tuning_model_1/" --experiment_name=$EXPERIMENT_NAME

### Meilleurs modèles

Utilisez TensorBoard pour déterminer les meilleurs paramètres pour chaque modèle.

## Register your models



### Model Registry

Sauvegardez vos modèles dans le Model Registry de Flow ML.

Utiliser ce lien pour définir les DEPLOY_IMAGE en fonction de votre type de modèle (CPU only) : https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#scikit-learn

In [81]:
# TensorFlow version tag, kept so existing references to TF still resolve.
TF = "2.13".replace(".", "-")

# scikit-learn version tag of the pre-built container ("1.0" -> "1-0").
SKLEARN = "1.0".replace(".", "-")

# The models registered in this TP are scikit-learn estimators saved with
# joblib, so the serving image must be a scikit-learn one (CPU only), not a
# TensorFlow one.
DEPLOY_VERSION = "sklearn-cpu.{}".format(SKLEARN)


# Resolves to "<multi-region>-docker.pkg.dev/vertex-ai/prediction/<version>:latest",
# the multi-region being the first component of LOCATION (e.g. "us").
DEPLOY_IMAGE = "{}-docker.pkg.dev/vertex-ai/prediction/{}:latest".format(
    LOCATION.split("-")[0], DEPLOY_VERSION
)

In [None]:
# Serving requires a *prediction* container; the "training/" image family
# used previously is meant for training jobs and cannot serve predictions.
SERVING_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"

# Root folder written by save_model(); derived from BUCKET_URI rather than
# repeating the bucket name.
ARTIFACT_ROOT = f"{BUCKET_URI}/sup_aero_vertex_course_1_6/LinearRegression"

# First upload creates the registry entry; it is version 1 and the default.
model_1 = aiplatform.Model.upload(
    display_name=f"model_course_1_6_{your_name}_{model_name.lower()}",
    artifact_uri=f"{ARTIFACT_ROOT}/fit_intercept_False/",
    serving_container_image_uri=SERVING_IMAGE,
    is_default_version=True,
    version_aliases=["v1"],
    # version_description=..., # Add a description
)


# Second upload is attached to the first via parent_model, so it becomes a
# new version of the same registry entry instead of a separate model.
model_2 = aiplatform.Model.upload(
    display_name=f"model_course_1_6_{your_name}_{model_name.lower()}",
    artifact_uri=f"{ARTIFACT_ROOT}/fit_intercept_True/",
    serving_container_image_uri=SERVING_IMAGE,
    is_default_version=False,
    parent_model=model_1.resource_name,
    version_aliases=["v2"],
    # version_description=..., # Add a description
)
print(model_1)


## Versioning

Update the Model Registry so that version v1 becomes the default version.

In [233]:
# List the registry entries whose display name matches the one used at upload time.
models = aiplatform.Model.list(filter=f'display_name="model_course_1_6_{your_name}_{model_name.lower()}"')

In [None]:

# Version id of the first entry returned by the listing.
print("Version ID:", models[0].version_id)

# Attach the "default" alias to version "1" through the model's versioning registry.
# NOTE(review): models[1] assumes the listing returned at least two entries — confirm,
# since both uploads share one display name and the second was uploaded with parent_model.
models[1].versioning_registry.add_version_aliases(new_aliases=["default"], version="1")
print("Version ID:", models[1].version_id)

# List again to inspect the registry after the alias change.
models = aiplatform.Model.list(filter=f'display_name="model_course_1_6_{your_name}_{model_name.lower()}"')
print("Number of models:", len(models))
print("Version ID:", models[0].version_id)


## Model Lineage



## Endpoint

Créer un endpoint

In [None]:

# Index, in the listing below, of the model to deploy. The previous `...`
# placeholder cannot index a list; 0 selects the first entry — change it to
# pick another one.
version_selected = 0


# Create the endpoint that will serve the model.
endpoint = aiplatform.Endpoint.create(
    display_name=f'model_course_1_6_{your_name}',
    project=PROJECT_ID,
    location=LOCATION,
)


models = aiplatform.Model.list(filter=f'display_name="model_course_1_6_{your_name}_{model_name.lower()}"')
print(models)


# Deploy onto the endpoint created above. Without endpoint=, deploy()
# provisions a separate endpoint and `endpoint` would have no deployed model
# when it is queried later.
models[version_selected].deploy(
    endpoint=endpoint,
    deployed_model_display_name=f'model_course_1_6_{your_name}',
    machine_type='n1-standard-4'
)

## Tester votre endpoint

In [None]:

# Make a prediction
# The model was fitted on normalised [reading score, writing score] pairs, so
# each instance is a list of two normalised floats; the first two test rows
# are used as a sample payload.
instances = normalize_score(X_test[:2]).tolist()
response = endpoint.predict(instances=instances)
print(response)
# The target was normalised for training as well: map the predictions back
# to the math-score scale.
print([unormalize_score(prediction) for prediction in response.predictions])