# -1) imports

In [1]:
print("start")

start


In [2]:
from   io                       import BytesIO
import pandas                   as     pd
from   google.cloud             import storage
from   sklearn.model_selection  import train_test_split
from   sklearn.preprocessing    import StandardScaler, OneHotEncoder
from   sklearn.impute           import SimpleImputer
from   sklearn.compose          import ColumnTransformer
from   sklearn.datasets         import fetch_openml
from   sklearn.pipeline         import Pipeline

In [3]:
import warnings
import sklearn
import pandas as pd


# 0) personnal function

In [4]:
# from : https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html 
def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

# 1) Fetch Data

In [5]:
USER_FLAG = "--user"

## 1.1) bucket configuration

In [6]:
bucket_name  = "cloud-orbit-bank-data"
file_name    = "bank-data.csv"

## 1.2) retrieving file content

In [7]:
client       = storage.Client()
bucket       = client.get_bucket(bucket_name)
blob         = bucket.get_blob(file_name)
blob_string  = blob.download_as_string()
content      = BytesIO(blob_string)

## 1.3) creating a DataFrame

In [8]:
df           = pd.read_csv(content, sep=";")
df.head(5).T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


# 2) Data preparation

## 2.1) Pre-processing steps

In [9]:
target = "y"

In [10]:
column_y = df[target]

In [11]:
numeric_features        = df.drop(target, axis=1).select_dtypes("int64").columns
categorical_features    = df.drop(target, axis=1).select_dtypes("object").columns

In [12]:
# Colonnes catégorielles
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [13]:
# Colonnes numériques
missing_values          = SimpleImputer(strategy="median")
centrer_reduire         = StandardScaler()
steps                   = [("missing_values",missing_values ), ("scaler",centrer_reduire )]
numeric_transformer     = Pipeline(steps=steps )


### 2.1.1) get dummies

In [14]:
transformers = [ ("numerical"   , numeric_transformer    , numeric_features     ) ,
                ("categorical" , categorical_transformer, categorical_features ) ]
preprocessor = ColumnTransformer( transformers=transformers )

In [15]:
preprocessor = preprocessor.fit(df)

In [16]:
dummies = preprocessor.transform(df)

In [17]:
dummies = pd.DataFrame(dummies, columns=get_feature_names(preprocessor))



In [18]:
dummies[target] = column_y

dummies[target] = dummies[target].map({"no":0, "yes":1})

## 2.2) creating a validation dataset

In [19]:
pct_validation  = 0.05
nb_ligne        = dummies.shape[0]
validation_size = int (nb_ligne * pct_validation)
validation_set  = dummies.sample(validation_size)

In [20]:
dummies         = dummies.drop(validation_set.index)

In [21]:
validation_set.shape

(2260, 52)

In [22]:
dummies.shape

(42951, 52)

In [23]:
dummies.columns

Index(['numerical__age', 'numerical__balance', 'numerical__day',
       'numerical__duration', 'numerical__campaign', 'numerical__pdays',
       'numerical__previous', 'categorical__x0_admin.',
       'categorical__x0_blue-collar', 'categorical__x0_entrepreneur',
       'categorical__x0_housemaid', 'categorical__x0_management',
       'categorical__x0_retired', 'categorical__x0_self-employed',
       'categorical__x0_services', 'categorical__x0_student',
       'categorical__x0_technician', 'categorical__x0_unemployed',
       'categorical__x0_unknown', 'categorical__x1_divorced',
       'categorical__x1_married', 'categorical__x1_single',
       'categorical__x2_primary', 'categorical__x2_secondary',
       'categorical__x2_tertiary', 'categorical__x2_unknown',
       'categorical__x3_no', 'categorical__x3_yes', 'categorical__x4_no',
       'categorical__x4_yes', 'categorical__x5_no', 'categorical__x5_yes',
       'categorical__x6_cellular', 'categorical__x6_telephone',
       'catego

# 3) Modelisation

## 3.1) create X et y

In [24]:
y      = dummies[target]
X      = dummies.drop(target, axis=1)

In [25]:
X.shape

(42951, 51)

## 3.2) test / train / validation 

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [27]:
X_validation    = validation_set.drop(target, axis=1)
y_validation    = validation_set[target]

## 3.3) Modélisation naïve

### 3.3.1) Randomforest

In [28]:
from sklearn.ensemble import RandomForestClassifier
algo    = RandomForestClassifier()
modele  = algo.fit(X_train, y_train)

### 3.3.2) scoring

In [29]:
from sklearn.metrics import roc_auc_score

In [30]:
roc_auc_score(y_test, modele.predict(X_test))

0.6791824117533298

## 3.4) Modélisation avec un grid_search

### 3.4.1) grid search 

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
hyper_parameters = {
        'bootstrap'                : [ ],
        'ccp_alpha'                : [ ],
        'class_weight'             : [ ],
        'criterion'                : [ "gini", "entropy" ],
        'max_depth'                : [ 10, 20 ],
        'max_features'             : [ ],
        'max_leaf_nodes'           : [ ],
        'max_samples'              : [ ],
        'min_impurity_decrease'    : [ ],
        'min_samples_leaf'         : [ ],
        'min_samples_split'        : [ ],
        'min_weight_fraction_leaf' : [ ],
        'n_estimators'             : [ 20, 30 ],
        'n_jobs'                   : [ ],
        'oob_score'                : [ ],
        'random_state'             : [ ],
        'verbose'                  : [ ],
        'warm_start'               : [ ],
}


In [33]:
grid_search_parameters = { k:v  for k,v in hyper_parameters.items() if v  }

In [34]:
grid = GridSearchCV(RandomForestClassifier(), grid_search_parameters, n_jobs=-1, scoring='roc_auc')
grid.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20], 'n_estimators': [20, 30]},
             scoring='roc_auc')

In [35]:
print (f"kpi : {grid.scoring} = {grid.best_score_:.2f} \n Best estimator : {grid.best_estimator_}" )

kpi : roc_auc = 0.92 
 Best estimator : RandomForestClassifier(max_depth=20, n_estimators=30)


In [36]:
model_random_forest = grid.best_estimator_

# 4) installing a missing library

## 4.1) catboost is not installed

In [37]:
import numpy 
from catboost import CatBoostClassifier

## Catboost can be installed with pip install

## Catboost is installed

In [38]:
from catboost import CatBoostClassifier

In [39]:
import numpy 


In [40]:
import catboost

In [41]:
!pip install ipywidgets --user



## 4.2) Using catboost

### 4.2.1) with GPU 

In [42]:
from time import time

In [43]:
for task_type in ["GPU","CPU"]:
    print(f"{task_type}", end="") 
    start     = time()
    algo      = CatBoostClassifier(learning_rate=1, depth=6, silent=True, task_type=task_type)
    modele    = algo.fit(X_train, y_train)
    auc_score = roc_auc_score(y_test, modele.predict(X_test))
    end       = time()
    duration  = end - start 
    print(f" -> duration = {duration:.1f} sec. : auc = {auc_score :.2f} %")

GPU -> duration = 29.9 sec. : auc = 0.72 %
CPU -> duration = 6.8 sec. : auc = 0.72 %


## 4.3) Catboost GPU is slower on small dataset, but faster in big ones

### 4.3.1) on crée une grosse df

In [44]:
dummies.shape

(42951, 52)

In [45]:
big_df = dummies
for i in range(10):
    big_df = big_df.append(dummies, ignore_index=True)

### 4.3.2) on recrée les dummies et le X et y

In [46]:
big_df.shape

(472461, 52)

In [47]:
len(set(dummies.columns))

52

In [48]:
target = "y"
dummies = pd.get_dummies(big_df)
X      = dummies.drop(target, axis=1)
y      = dummies[target]

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

### 4.3.3) on modélise sur GPU et CPU

In [50]:
task_types = ["CPU","GPU"]
for task_type in task_types :
    print(f"{task_type}", end="") 
    start     = time()
    algo      = CatBoostClassifier(learning_rate=1, depth=6, silent=True, task_type=task_type)
    modele    = algo.fit(X_train, y_train)
    auc_score = roc_auc_score(y_validation, modele.predict(X_validation))
    end       = time()
    duration  = end - start 
    print(f" -> duration = {duration:.1f} sec. : auc = {auc_score :.2f} %")

CPU -> duration = 37.2 sec. : auc = 0.73 %
GPU -> duration = 7.4 sec. : auc = 0.73 %


### 4.3.4) on peut 'watch' le nvidia-smi

In [51]:
#watch  nvidia-smi --format=csv --query-gpu=power.draw,utilization.gpu,fan.speed,temperature.gpu

# 5) Création d'un pipeline de traitement de la donnée

## 5.1) Création d'un pipeline

In [52]:
from sklearn.pipeline import Pipeline

In [53]:
steps = [("preprocessor", preprocessor) , ("prediction", modele) ]

In [54]:
pipeline = Pipeline(steps)

## 5.2) test du pipeline

In [55]:
indexes           = X_validation.sample(1).index
df[target]        = df[target].map({"no":0, "yes":1})
data_sans_dummies = df.iloc[indexes].drop(target, axis=1)
y_a_tester        = df.iloc[indexes][target]
data_sans_dummies.shape

(1, 16)

In [56]:
pipeline.predict_proba(data_sans_dummies)

array([[9.99998012e-01, 1.98797207e-06]])

### 5.2.1) score de la prédiction globale

In [57]:
indexes           = X_validation.index
data_sans_dummies = df.iloc[indexes].drop(target, axis=1)
y_a_tester        = df.iloc[indexes][target]
data_sans_dummies.shape

(2260, 16)

In [58]:
predictions_ = pipeline.predict(data_sans_dummies)

In [59]:
reel_ = y_a_tester.values

In [60]:
pipeline.score(data_sans_dummies, y_a_tester.values)

0.9004424778761062

# 6) Save the pipeline model 

In [61]:
import pickle

In [92]:
model_filename = "model.pkl"
with open(model_filename, "wb") as f:
    pickle.dump(pipeline, f)

# 7) Passer en production
In order to deploy your trained model on AI Platform Prediction, you must:
- Upload your saved model to a Cloud Storage bucket.
- Create an AI Platform Prediction model resource.
- Create an AI Platform Prediction version resource, specifying the Cloud Storage path to your saved model.

In [93]:
from time import time

In [94]:
MODEL_NAME   = "from_notebook_cloud_orbit"
VERSION_NAME = f"{MODEL_NAME}_{int(time())}"

In [95]:
VERSION_NAME

'from_notebook_cloud_orbit_1655758904'

## 7.1) Upload the model to Cloud Storage => https://pantheon.corp.google.com/storage 

In [96]:
BUCKET_NAME = "bucket_cloud_orbit"
bucket      = client.bucket(BUCKET_NAME)
blob        = bucket.blob(model_filename)
blob.upload_from_filename(model_filename)

In [97]:
[x for x in bucket.list_blobs()]

[<Blob: bucket_cloud_orbit, 2020.csv, 1655678817572147>,
 <Blob: bucket_cloud_orbit, 2020_avec_virgules.csv, 1655679846234531>,
 <Blob: bucket_cloud_orbit, 2020_avec_virgules_one_line.csv, 1655680596719636>,
 <Blob: bucket_cloud_orbit, avro_schema.avsc, 1655720004709811>,
 <Blob: bucket_cloud_orbit, forecast.avro, 1655720031931821>,
 <Blob: bucket_cloud_orbit, model.pkl, 1655758905159882>,
 <Blob: bucket_cloud_orbit, test.pkl, 1655758895665852>,
 <Blob: bucket_cloud_orbit, une_col_une_ligne.csv, 1655681019154689>]

## 7.2) Créer un endpoint, dans l'AI Plateforme (en cours d'intégration à Vertex AI ?) 

### 7.2.1) activer l'API Endpoint dans le projet

In [98]:
#!gcloud services enable ml.googleapis.com

### 7.2.2) Créer le modèle dans l'endpoint, Redirige vers l'AI Platform => https://pantheon.corp.google.com/ai-platform

In [69]:
# Parfois il faut passer par la ligne de commande :-/ notamment pour répondre aux questions posées par les commandes

In [70]:
# Fait à la main : https://pantheon.corp.google.com/vertex-ai/models?project=jouin-romain-demos

In [71]:
print(f"gcloud ai-platform models create {MODEL_NAME}") # Chose region 6

gcloud ai-platform models create from_notebook_cloud_orbit


In [72]:
!gcloud ai-platform models create $MODEL_NAME --region="us-central1"

Using endpoint [https://us-central1-ml.googleapis.com/]
Created ai platform model [projects/helical-sanctum-353821/models/from_notebook_cloud_orbit].


## 7.3) Créer une ressource versionnée qui servira le modèle (prend du temps quand ça plante : y a pas de feedback au niveau du terminal de commande...)

In [79]:
# The system a "model.pkl" file into the directory "origin"

In [111]:
print(f"""gcloud ai-platform versions create {VERSION_NAME} \
  --model={MODEL_NAME} \
  --framework=scikit-learn \
  --origin=gs://{BUCKET_NAME}\
  --python-version=3.5 \
  --runtime-version=1.12 """)

gcloud ai-platform versions create from_notebook_cloud_orbit_1655758904   --model=from_notebook_cloud_orbit   --framework=scikit-learn   --origin=gs://bucket_cloud_orbit  --python-version=3.5   --runtime-version=1.12 


In [98]:
!gcloud ai-platform versions create $VERSION_NAME \
  --model=$MODEL_NAME \
  --framework=scikit-learn \
  --origin=gs://$BUCKET_NAME/ \
  --python-version=3.5 \
  --runtime-version=1.12 \
  --region=us-central1

Using endpoint [https://us-central1-ml.googleapis.com/]
Creating version (this might take a few minutes)......failed.                  
[1;31mERROR:[0m (gcloud.ai-platform.versions.create) Timeout. To view model server logs, please enable console logging when creating your model (https://cloud.google.com/ai-platform/prediction/docs/online-predict#requesting_logs_for_online_prediction_requests). If this error persists, contact cloudml-feedback@google.com


# 8) testing the API 

In [242]:
data_sans_dummies

Unnamed: 0,Age,Job,MaritalStatus,Education,Default,Balance,Housing,Loan,Contact,Day,Month,Duration,Campaign,PDays,Previous,POutcome
23801,36,technician,single,secondary,no,0,no,no,cellular,29,aug,85,6,-1,0,unknown
21209,45,technician,single,tertiary,no,0,no,no,cellular,18,aug,773,4,-1,0,unknown
31422,57,management,divorced,unknown,no,0,no,no,cellular,27,mar,102,1,-1,0,unknown
4951,34,management,single,tertiary,no,1350,yes,no,unknown,21,may,68,1,-1,0,unknown
39400,39,management,single,tertiary,no,0,yes,no,cellular,22,may,271,1,93,3,success
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4440,36,admin.,married,secondary,no,465,yes,no,unknown,20,may,160,1,-1,0,unknown
14750,53,blue-collar,married,secondary,no,6258,yes,no,telephone,15,jul,62,5,-1,0,unknown
2375,32,management,single,tertiary,yes,-3,yes,no,unknown,13,may,500,2,-1,0,unknown
7700,41,blue-collar,married,secondary,no,629,yes,yes,unknown,30,may,208,2,-1,0,unknown


In [90]:

data = data_sans_dummies[0:1].to_json()

In [91]:
dico = {}

In [101]:
dico["instances"] = [data]

In [102]:
import json


In [103]:
with open("data.json", "w") as f: 
    json.dump(dico, f)

In [95]:
curl \
-X POST \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json" \
https://europe-west1-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/europe-west1/endpoints/${ENDPOINT_ID}:predict \
-d "@${data.json}"

SyntaxError: invalid syntax (1571311334.py, line 2)

In [111]:
data_sans_dummies[0:1].to_dict()

{'Age': {12374: 57},
 'Job': {12374: 'housemaid'},
 'MaritalStatus': {12374: 'married'},
 'Education': {12374: 'primary'},
 'Default': {12374: 'no'},
 'Balance': {12374: 85},
 'Housing': {12374: 'no'},
 'Loan': {12374: 'no'},
 'Contact': {12374: 'unknown'},
 'Day': {12374: 27},
 'Month': {12374: 'jun'},
 'Duration': {12374: 194},
 'Campaign': {12374: 1},
 'PDays': {12374: -1},
 'Previous': {12374: 0},
 'POutcome': {12374: 'unknown'}}

In [None]:
{"instances" : [[57, 'housemaid', 'married', 'primary', 'no', 85, 'no', 'no',
        'unknown', 27, 'jun', 194, 1, -1, 0, 'unknown', '?']] }

In [114]:
!cat data.json

{"a" : [57],
"z" : ["housemaid"],
"e" : ["married"],
"r" : ["primary"],
"t" : ["no"],
""y : [ 85],
"u" : ["no"],
"i" : ["no"],
"o" : ["unknown"],
"p" : [ 27],
"m" : ["jun"],
"k" : [ 194],
"j : [ 1],
"f" : [-1],
"h" : [ 0],
"g" : ["unknown"]}


In [110]:

data

'{"Age":{"12374":57},"Job":{"12374":"housemaid"},"MaritalStatus":{"12374":"married"},"Education":{"12374":"primary"},"Default":{"12374":"no"},"Balance":{"12374":85},"Housing":{"12374":"no"},"Loan":{"12374":"no"},"Contact":{"12374":"unknown"},"Day":{"12374":27},"Month":{"12374":"jun"},"Duration":{"12374":194},"Campaign":{"12374":1},"PDays":{"12374":-1},"Previous":{"12374":0},"POutcome":{"12374":"unknown"}}'