In [None]:
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this provides settings the Snowflake connection
# below relies on — confirm.
load_dotenv()

In [None]:
# Open a Snowpark session using the connection parameters that
# SnowflakeLoginOptions resolves for the "test_conn" entry.
session=Session.builder.configs(SnowflakeLoginOptions("test_conn")).create()
# Bare expression: show the session object as the cell output.
session

In [None]:
# Pull a 10% row sample of the HOUSING table into a local pandas DataFrame.
# NOTE(review): sample() is called without a seed, so the sampled rows (and
# the shape shown below) may differ from run to run.
df=session.table('HOUSING').sample(frac=0.10).to_pandas()
df.shape

In [None]:
# Split the sampled frame into features (every column except the target)
# and the target column itself.
X=df.loc[:,df.columns!='MEDIAN_HOUSE_VALUE']
y=df['MEDIAN_HOUSE_VALUE']
# Show only the dimensions: echoing `X,y` would dump both objects as one
# plain-text tuple instead of a readable summary.
X.shape,y.shape

In [None]:
def fit_pipeline(X,y,cat_attribs,num_attribs):
    """Build and fit the preprocessing + random-forest regression pipeline.

    Args:
        X: Feature table containing the columns named in ``num_attribs``
            and ``cat_attribs``.
        y: Target values to fit against.
        cat_attribs: Column names sent through the categorical branch.
        num_attribs: Column names sent through the numeric branch.

    Returns:
        The fitted ``Pipeline`` with steps ``preprocessor`` and ``model``.
    """
    # Numeric branch: fill gaps with the column median, then standardise.
    numeric_branch=Pipeline(steps=[
        ("imputer",SimpleImputer(strategy='median')),
        ('std_scaler',StandardScaler()),
    ])

    # Categorical branch: fill gaps with the most frequent value, then
    # one-hot encode; categories unseen during fit are ignored at predict time.
    categorical_branch=Pipeline(steps=[
        ("imputer",SimpleImputer(strategy='most_frequent')),
        ('one_hot',OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column list to its branch.
    column_router=ColumnTransformer([
        ("num",numeric_branch,num_attribs),
        ("cat",categorical_branch,cat_attribs),
    ])

    # Fixed random_state keeps the forest reproducible for a given input.
    regressor=RandomForestRegressor(n_estimators=100,random_state=42,n_jobs=1)

    model=Pipeline(steps=[
        ('preprocessor',column_router),
        ('model',regressor),
    ])
    model.fit(X,y)
    return model

In [None]:


# Fit the pipeline on the local sample: one categorical feature and eight
# numeric features.
pipe=fit_pipeline(
    X=X,
    y=y,
    num_attribs=[
        'LONGITUDE',
        'LATITUDE',
        'HOUSING_MEDIAN_AGE',
        'TOTAL_ROOMS',
        'TOTAL_BEDROOMS',
        'POPULATION',
        'HOUSEHOLDS',
        'MEDIAN_INCOME',
    ],
    cat_attribs=['OCEAN_PROXIMITY'],
)

In [None]:
# Bare expression: display the fitted pipeline as the cell output.
pipe

In [None]:
# Smoke test: predict on the full feature frame and show the first value.
# Note X is the same data the pipeline was fitted on, so this is not an
# out-of-sample estimate.
pipe.predict(X)[0]

In [None]:
def save_model(session:Session,model,stage_name,stage_path,model_file):
    """Serialise ``model`` with joblib and upload it to a Snowflake stage.

    Args:
        session: Snowpark session whose ``file.put_stream`` performs the upload.
        model: Object to serialise (anything ``joblib.dump`` accepts).
        stage_name: Stage prefix, e.g. ``"@int_stage/models"``.
        stage_path: Sub-folder under ``stage_name``.
        model_file: File name given to the serialised model.

    Returns:
        The stage location string ``"{stage_name}/{stage_path}/{model_file}"``
        that was passed to ``put_stream``.
    """
    import io
    import joblib

    # Serialise into memory; the stream is named after the target file.
    input_stream=io.BytesIO()
    input_stream.name=model_file
    joblib.dump(model,input_stream)
    # joblib.dump leaves the cursor at end-of-stream; rewind explicitly so the
    # upload does not depend on the consumer seeking back to the start.
    input_stream.seek(0)

    model_path=f"{stage_name}/{stage_path}/{model_file}"
    # NOTE(review): put_stream's auto_compress setting is left at its default;
    # verify whether the staged object name gains a compression suffix
    # relative to the path returned here.
    session.file.put_stream(input_stream=input_stream,
                            stage_location=model_path,
                            overwrite=True)
    return model_path

In [None]:
def train_model(
    session:Session,
    training_table:str,
    target_col:str,
    save_stage:str
) -> dict:
    """Train, persist and evaluate the housing regression pipeline.

    Steps:
      1. Split ``training_table`` 80/20 (fixed seed) into train and test sets.
      2. Persist both splits as ``<training_table>_TRAIN`` and
         ``<training_table>_TEST``, overwriting any existing tables.
      3. Fit the pipeline on the train split, using string columns as
         categorical features and numeric columns as numeric features
         (the target column is excluded from both).
      4. Upload the fitted pipeline to
         ``@<save_stage>/models/<timestamp>/housing_fores_reg.joblib``.
      5. Score the pipeline on the test split.

    Args:
        session: Snowpark session used for all table and stage access.
        training_table: Name of the table holding features and target.
        target_col: Name of the column to predict.
        save_stage: Stage name (without the leading ``@``) for the model file.

    Returns:
        dict with keys ``MSE``, ``RMSE``, ``model_path``, ``train_table`` and
        ``test_table``.
    """
    from datetime import datetime
    import numpy as np
    from snowflake.snowpark import types as T
    from sklearn.metrics import mean_squared_error

    # Captured up front so the model folder name reflects the start of the run.
    now=datetime.now()

    snowdf_train,snowdf_test=session.table(training_table).\
        random_split([0.8,0.2],seed=82)

    # Derive feature lists from the schema; the target never counts as a feature.
    numeric_types=(T.DecimalType,T.LongType,T.DoubleType,T.FloatType,T.IntegerType)
    feature_fields=[c for c in snowdf_train.schema.fields if c.name!=target_col]
    cat_attribs=[c.name for c in feature_fields if isinstance(c.datatype,T.StringType)]
    num_attribs=[c.name for c in feature_fields if isinstance(c.datatype,numeric_types)]

    # Persist both splits so the exact train/test rows can be inspected later.
    train_table_name=training_table+'_TRAIN'
    snowdf_train.write.mode(save_mode="overwrite").save_as_table(train_table_name)

    test_table_name=training_table+'_TEST'
    snowdf_test.write.mode(save_mode="overwrite").save_as_table(test_table_name)

    pd_train=snowdf_train.to_pandas()

    X_train=pd_train.loc[:,pd_train.columns!=target_col]
    y_train=pd_train[target_col]

    full_pipeline=fit_pipeline(X=X_train,y=y_train,cat_attribs=cat_attribs,num_attribs=num_attribs)

    # One timestamped folder per run keeps earlier models from being overwritten.
    save_path=now.strftime("%Y-%m-%d-%H%M%S")
    object_saved_path=save_model(session=session,model=full_pipeline,
               stage_name=f"@{save_stage}/models",stage_path=save_path,model_file='housing_fores_reg.joblib')

    pd_test=snowdf_test.to_pandas()

    # Build the mask from the test frame's own columns (the original used the
    # train frame's columns, which only worked because both share one schema).
    X_test=pd_test.loc[:,pd_test.columns!=target_col]
    y_test=pd_test[target_col]

    housing_predictions=full_pipeline.predict(X_test)

    mse=mean_squared_error(y_test,housing_predictions)
    rmse=np.sqrt(mse)

    # Cast to built-in float so the returned dict holds only plain Python values.
    ret_dict={
        "MSE":float(mse),
        "RMSE":float(rmse),
        "model_path":object_saved_path,
        "train_table":train_table_name,
        "test_table":test_table_name
    }

    return ret_dict

In [None]:
# Reset the session's package and import lists, then declare exactly the
# packages the stored procedure needs.
session.clear_packages()
session.clear_imports()

session.add_packages('snowflake-snowpark-python','scikit-learn','pandas','numpy','joblib')

# Register train_model as the permanent stored procedure "train_house_sp",
# replacing any existing definition; stage_location is where its code is
# uploaded. `F` is snowflake.snowpark.functions, imported with the other
# imports at the top of the notebook.
train_model_sp=F.sproc(
    func=train_model,
    name="train_house_sp",
    replace=True,
    is_permanent=True,
    stage_location="int_stage/sp/",
    session=session,
)

In [None]:
# Run the registered stored procedure: train on the HOUSING table with
# MEDIAN_HOUSE_VALUE as the target and store the model under int_stage.
ret=train_model_sp(session,'HOUSING',"MEDIAN_HOUSE_VALUE","int_stage")

In [None]:
# Bare expression: display what the stored procedure returned
# (train_model builds a dict of metrics, model path and table names).
ret

In [None]:
# session.sql() only builds a lazy DataFrame; collect() actually executes the
# LIST command and returns the staged files as rows for display.
session.sql("LIST @int_stage").collect()