# Etapa de treinamento

Etapas:
- Seleção de modelo
- Seleção de hiperparâmetros
- Fit

In [11]:
# Imports
%pip install -r requirements.txt

import os

import joblib
from google.cloud import bigquery
from google.cloud import storage
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Load
# Read every column of the preprocessed table from BigQuery into `df`.
PROJECT_ID = "ai-platform-mockup"
INPUT_DATASET_ID = "preprocessed"
TABLE_ID = "p1_county_natality"

client = bigquery.Client(project=PROJECT_ID)

query = f"SELECT * FROM  `{PROJECT_ID}.{INPUT_DATASET_ID}.{TABLE_ID}`"
# Submit the query first, then materialize its result as a DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

In [13]:
# Split
# Partition the rows by their "split" label and drop that label column
# from each resulting subset.
split_label = df["split"]
df_train = df[split_label == "train"].drop(columns=["split"])
df_test = df[split_label == "test"].drop(columns=["split"])
df_eval = df[split_label == "eval"].drop(columns=["split"])

In [14]:
# Model Selection
# Candidate estimator: scikit-learn's LinearRegression with default settings.
# Its hyperparameters are chosen by the grid search that follows.
model = LinearRegression()

In [15]:
# Hyperparameter Selection
# Separate the predictors from the 'Births' target once, then run a 5-fold
# cross-validated grid search over whether the model fits an intercept.
X_train = df_train.drop(columns=['Births'])
y_train = df_train['Births']

param_grid = {
    'fit_intercept': [True, False]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the estimator built with the best-scoring parameter combination.
best_model = grid_search.best_estimator_

In [16]:
# Fit
# Train the selected estimator on the training split, using every column
# except 'Births' as predictors and 'Births' as the target.
# NOTE(review): GridSearchCV defaults to refit=True, so best_estimator_ was
# presumably already fitted on this same data; this call repeats that fit
# as an explicit step.
X_train = df_train.drop(columns=['Births'])
y_train = df_train['Births']
best_model.fit(X_train, y_train)

In [None]:
# Save Model
# Serialize the fitted estimator to a local file, upload that file to the
# project's Cloud Storage bucket under the same name, and remove the local
# copy afterwards.
model_path = "linear_regressor.pkl"
bucket_name = f"vertex-models-{PROJECT_ID}"

try:
    joblib.dump(best_model, model_path)

    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(model_path)
    blob.upload_from_filename(model_path)
finally:
    # Clean up the local pickle even when serialization or the upload fails;
    # the existence check covers a dump that failed before creating the file.
    if os.path.exists(model_path):
        os.remove(model_path)
