In [1]:
from pathlib import Path

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
import joblib

# Summary 
Create an ML model-serving pipeline with the following components:
- Data source: Use the iris dataset as the training dataset. For serving, use Confluent Kafka to simulate data based on the training data's schema
- Model deployment: Google Vertex AI
- Prediction control: Google Cloud Run
- Storage: Google Cloud Storage
- Analytics / Monitoring: Google BigQuery, Looker Studio


The purpose is to implement the ML serving pipeline, so model development is kept simple by using a basic dataset (iris).

# Input data 

In [3]:
# Load the iris dataset through scikit-learn's loader
iris = load_iris()

# Build the feature table and attach the class label as a `target` column
# in a single chained expression
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Preview the first rows of the assembled DataFrame
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Data Exploration

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column, including the integer-encoded `target` column
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [7]:
# Persist a versioned snapshot of the training data as Parquet.
# `to_parquet` does not create missing parent directories, so make sure
# `data/` exists first; otherwise this cell fails on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)
df.to_parquet('data/df_iris_v1.parquet')

# Train model 

In [8]:

# Separate the predictors from the label column before splitting
features = df.drop(columns=['target'])
labels = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Report how many samples ended up in each partition
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 120 samples
Testing set size: 30 samples


In [11]:
# Create a seeded decision tree and train it on the training split
# (`fit` returns the fitted estimator, so construction and training can be chained)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split: compare predicted labels against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on the test set: {accuracy:.2f}")

Accuracy on the test set: 1.00


## Save model 

In [9]:
# Local directory where the serialized model artifact is written
LOCAL_MODEL_ARTIFACTS_DIR = "model"

In [13]:
# Make sure the artifact directory exists before writing; `joblib.dump` opens
# the target file directly and fails if the parent directory is missing.
Path(LOCAL_MODEL_ARTIFACTS_DIR).mkdir(parents=True, exist_ok=True)

# Serialize the trained model; kept as the last expression so the cell still
# displays the list of written file paths
joblib.dump(model, f"{LOCAL_MODEL_ARTIFACTS_DIR}/model.joblib")

['model/model.joblib']