In [1]:
import os
from google.cloud import storage
from google.oauth2 import service_account
from ast import literal_eval
import pandas as pd
from io import BytesIO, StringIO
import pickle

# Environment

In [2]:
# settings.py
import os
from dotenv import load_dotenv
load_dotenv()  # pull variables from a local .env file into the process environment

# Project / bucket identifiers used by the Cloud Storage cells below.
PROJECT_ID = os.environ.get("PROJECT_ID")
PROJECT_NAME = os.environ.get("PROJECT_NAME")
BUCKET_NAME = os.environ.get("BUCKET_NAME")

# CREDENTIAL is expected to hold the service-account info as a dict literal.
# Check it explicitly: passing None to literal_eval fails with an error that
# does not mention which setting is missing.
_credential_raw = os.environ.get("CREDENTIAL")
if not _credential_raw:
    raise ValueError("CREDENTIAL is not set; expected the service-account info as a dict literal")
credential = service_account.Credentials.from_service_account_info(literal_eval(_credential_raw))

# Read Dataset from Cloud Storage

In [3]:
# NOTE(review): `project` receives PROJECT_NAME while PROJECT_ID is never used —
# confirm which identifier the storage client should be given.
client = storage.Client(project=PROJECT_NAME, credentials=credential)
bucket = client.get_bucket(BUCKET_NAME)

dataset_blob_path = "input/diabetes.csv"  # File path @CloudStorage
blob = bucket.get_blob(dataset_blob_path)
if blob is None:
    # get_blob returns None for a missing object instead of raising, so fail
    # here with the path rather than with an AttributeError on the next line.
    raise FileNotFoundError(f"gs://{BUCKET_NAME}/{dataset_blob_path} does not exist")

# To keep a local copy instead: blob.download_to_filename("data/diabetes_gcs.csv")
df = pd.read_csv(BytesIO(blob.download_as_string()))  # parse the CSV straight from memory
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Modeling

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [5]:
# Target is the Outcome column; every remaining column is a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for testing, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp



In [7]:
# Every feature column is routed through jcopml's numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[("numeric", num_pipe(), X_train.columns)],
)

# Preprocessing followed by the classifier; the step names ("prep", "algo")
# are the prefixes of the tuned parameter keys, e.g. "algo__C".
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("algo", LogisticRegression(solver="lbfgs", n_jobs=-1, random_state=42)),
    ],
)

# Randomized search: 50 sampled candidates, each evaluated with 3-fold CV.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.logreg_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Order: train score, best cross-validation score, test score.
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s


{'algo__C': 0.014993298055091558, 'algo__fit_intercept': True}
0.7817589576547231 0.7769169456400448 0.7337662337662337


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    7.3s finished


In [8]:
# Class predictions for the held-out test split from the fitted search object.
model.predict(X_test)

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0])

# Save Model

In [22]:
# Pickle the best pipeline found by the search; per the cell output the file
# ends up at model/model.pkl on the local disk.
save_model(model.best_estimator_, "model.pkl")

def save_model_to_storage(model_path, blob_path="model/model.pkl"):
    """Upload a locally saved model file to the Cloud Storage bucket.

    Relies on the notebook-level ``client`` and ``BUCKET_NAME``.

    Args:
        model_path: Path of the model file on the local disk.
        blob_path: Destination object path inside the bucket. Defaults to
            "model/model.pkl", the location that used to be hard-coded, so
            existing single-argument calls behave as before.
    """
    bucket = client.get_bucket(BUCKET_NAME)
    blob = bucket.blob(blob_path)  # File path @CloudStorage
    blob.upload_from_filename(model_path)  # File path @Local
    
# Upload the locally pickled model file to the bucket.
save_model_to_storage("model/model.pkl")

Model is pickled as model/model.pkl


# Load Model

In [24]:
def load_model_from_storage(model_path):
    """Download a pickled model from the Cloud Storage bucket and unpickle it.

    Relies on the notebook-level ``client`` and ``BUCKET_NAME``.

    Args:
        model_path: Object path of the pickle inside the bucket,
            e.g. "model/model.pkl". Previously this argument was ignored and
            the path was hard-coded.

    Returns:
        The object stored in the pickle.

    Raises:
        FileNotFoundError: If the bucket holds no object at ``model_path``.
    """
    bucket = client.get_bucket(BUCKET_NAME)
    blob = bucket.get_blob(model_path)  # File path @CloudStorage
    if blob is None:
        # get_blob returns None for a missing object; report the path instead
        # of failing later with an AttributeError on None.
        raise FileNotFoundError(f"gs://{BUCKET_NAME}/{model_path} does not exist")
    # SECURITY: unpickling can execute arbitrary code. Only load objects from
    # a bucket whose contents you control.
    gcs_model = pickle.load(BytesIO(blob.download_as_string()))
    return gcs_model

# NOTE: this rebinds `model` — until here the fitted RandomizedSearchCV — to
# the object unpickled from Cloud Storage.
model = load_model_from_storage("model/model.pkl")

In [25]:
# Predict with the model reloaded from Cloud Storage as a round-trip check;
# the result should equal the predictions made before saving.
model.predict(X_test)

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0])