In [None]:
# Import python packages
# NOTE(review): `pd`, `LogisticRegression`, `col` and `lit` are not referenced
# in the cells visible here — confirm before removing.
import pandas as pd 

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.registry import Registry 
# NOTE: this `sum` is the Snowpark aggregate function; importing it under this
# name shadows Python's built-in sum() for the rest of the notebook.
from snowflake.snowpark.functions import col, lit, sum 
import warnings
# Suppresses every warning category for the remainder of the kernel session.
warnings.filterwarnings("ignore")
# Session handle used by later cells to read tables, build DataFrames and
# open the model registry.
session = get_active_session() 


In [None]:
# Point a Snowpark DataFrame at the RESERVATIONS table and print a preview of it.
reservations_df = session.table("RESERVATIONS")
reservations_df.show()

In [None]:
# Total TOTAL_PRICE per booking channel.
# The aggregate is named directly with alias() so this cell does not depend on
# the auto-generated "SUM(TOTAL_PRICE)" column label for a follow-up rename.
# `sum` here is the Snowpark aggregate imported in the first cell, not the
# Python built-in.
reserve_df_booking_channel = (
    reservations_df
    .group_by("BOOKING_CHANNEL")
    .agg(sum("TOTAL_PRICE").alias("Revenue"))
)
reserve_df_booking_channel.show(10)

In [None]:
# Columns removed from the reservations data before it is used for modelling.
excluded_columns = [
    "RESERVATION_ID",
    "RESERVATION_STATUS",
    "CHECK_IN_DATE",
    "CHECK_OUT_DATE",
    "CONTACT_ID",
    "CREATION_DATE",
    "PAYMENT_STATUS",
    "ROOM_NUMBER",
    "TAXES_AND_FEES",
    "FEEDBACK_VAL",
]
train_sdf = reservations_df.drop(*excluded_columns)
train_sdf.show()


In [None]:
# Split the modelling data into train/test partitions (weights 0.8 / 0.2, seed 0).
split_weights = [0.8, 0.2]
train_df, test_df = train_sdf.random_split(weights=split_weights, seed=0)

In [None]:
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.preprocessing import PolynomialFeatures, StandardScaler

# Settings for this cell
CROSS_VALIDATION_FOLDS = 10
POLYNOMIAL_FEATURES_DEGREE = 2
numeric_features = ['NUMBER_OF_ADULTS', 'NUMBER_OF_CHILDREN']

# Numeric branch: polynomial feature expansion followed by standard scaling
poly_step = ('poly', PolynomialFeatures(degree=POLYNOMIAL_FEATURES_DEGREE))
scaler_step = ('scaler', StandardScaler())
numeric_transformer = Pipeline(steps=[poly_step, scaler_step])

# Route the numeric feature columns through the numeric branch
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
)

# Chain the preprocessing with the estimator to form the full model pipeline.
# The final step is labelled 'classifier' but holds a LinearRegression.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LinearRegression()),
    ]
)

# No hyperparameter values are listed in the grid.
parameteres = {}

# Grid search over the pipeline using CROSS_VALIDATION_FOLDS folds;
# TOTAL_PRICE is the label and predictions go to PREDICTED_REVENUE.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameteres,
    cv=CROSS_VALIDATION_FOLDS,
    label_cols=["TOTAL_PRICE"],
    output_cols=["PREDICTED_REVENUE"],
    verbose=2,
)

# Fit on the training partition, then score both partitions
model.fit(train_df)
train_r2_score = model.score(train_df)
test_r2_score = model.score(test_df)

# Report the scores (printed as R2) for the train and test partitions
print(f"R2 score on Train : {train_r2_score}")
print(f"R2 score on Test  : {test_r2_score}")

In [None]:
# Name the model is logged under, and a registry handle bound to the active session.
MODEL_NAME = "PREDICT_ROI"
registry = Registry(session)

In [None]:
# Log the fitted search object to the registry together with its train/test scores.
model_metrics = {"R2_train": train_r2_score, "R2_test": test_r2_score}
logging_options = {"embed_local_ml_library": True, "relax_version": False}
mv = registry.log_model(
    model,
    model_name=MODEL_NAME,
    metrics=model_metrics,
    comment='Model pipeline to predict revenue',
    options=logging_options,
)

In [None]:
# Show the models in the registry; as the last expression of the cell, the
# returned value is rendered by the notebook.
registry.show_models()

In [None]:
# Predicting the revenue for a new reservation
# Build a one-row DataFrame describing the reservation and run the logged
# model version's `predict` function on it.
# The frame gets its own name rather than reusing `test_df`, so the held-out
# split created earlier is not overwritten and the scoring cell can still be
# re-run after this one.
new_reservation_columns = [
    'BOOKING_CHANNEL',
    'NUMBER_OF_ADULTS',
    'NUMBER_OF_CHILDREN',
    'RATE_PLAN',
    'ROOM_TYPE',
    'SPECIAL_REQUESTS',
]
new_reservation_df = session.create_dataframe(
    [['Phone', 4, 4, 'Standard Rate', 'Deluxe', 'None']],
    schema=new_reservation_columns,
)
mv.run(new_reservation_df, function_name='predict').show()