In [None]:
# Upgrade xgboost to the newest release pip can find.
# NOTE(review): this install is unpinned; the commented-out line below shows a
# pinned alternative.
# !pip install xgboost==2.1.1
!pip install xgboost --upgrade


In [None]:
# Import python packages
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from snowflake.ml.registry import Registry
import ast

# Grab the notebook's active Snowpark session; later cells use it for the
# model registry and for reading/writing Snowflake tables.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Load the Titanic passenger data and discard the columns that are not used
# as model inputs.
titanic = pd.read_csv('titanic_snowflake.csv')
titanic = titanic.drop(
    columns=[
        "AGE",
        "DECK",
        "ALIVE",
        "ADULT_MALE",
        "EMBARKED",
        "PCLASS",
        "ALONE",
        "SEX",
    ]
)

# Preview the remaining columns (a single call; only the last expression of a
# cell is displayed).
titanic.head()

In [None]:
# Discard every row that still has a missing value.
titanic = titanic.dropna()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
titanic = pd.get_dummies(titanic, drop_first=True)

# Cast every boolean column to int; all other columns are left untouched.
bool_columns = [name for name, dtype in titanic.dtypes.items() if dtype == 'bool']
titanic = titanic.astype({name: int for name in bool_columns})

titanic.dtypes

In [None]:
# Separate the target column (SURVIVED) from the feature matrix.
y = titanic["SURVIVED"]
x = titanic.drop(columns=["SURVIVED"])

In [None]:
# Split features and labels into train and test (validation) parts: 85% of
# the rows go to training, and the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.85,
    random_state=1234,
)

In [None]:
# Candidate hyperparameter values for the grid search. Only the number of
# estimators is varied; the other candidates are left disabled.
param_grid = dict(
    n_estimators=[100, 200],
    # learning_rate=[0.1, 0.5],
    # max_depth=[1, 2, 3, 4, 5, 6],
    # min_child_weight=[1, 6],
)

In [None]:
# XGBoost classifier with a binary logistic objective and logloss as its
# evaluation metric.
model = XGBClassifier(eval_metric='logloss', objective='binary:logistic')

# Search param_grid with GridSearchCV's default cross-validation and scoring
# settings, fitting on the training split.
grid_search = GridSearchCV(model, param_grid)
grid_search.fit(xtrain, ytrain)

In [None]:
# Pull the winning configuration out of the finished grid search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Score the best estimator on the held-out test rows.
test_score = best_model.score(xtest, ytest)
print("Test Score:", test_score)

In [None]:
# Summary of the grid search result: its best score (stored under
# "Accuracy") and the parameters that produced it.
metrics = dict(Accuracy=best_score, Params=best_params)

metrics

In [None]:
# Define model name (use uppercase for name)
model_name = "TITANIC"

# A single training row serves as sample input for the registry logging call.
X = xtrain.sample(n=1)

# Create the registry from the active session. A different DB and schema can
# be specified; otherwise the session context is used.
reg = Registry(session=session)
reg_df = reg.show_models()

# Log the tuned model together with its metrics. No version name is passed
# (the argument is left commented out).
titanic_model = reg.log_model(
    model=best_model,
    model_name=model_name,
    # version_name="V_2",
    sample_input_data=X,
    metrics=metrics,
    options={"relax_version": True},
)

In [None]:
# List the registry's models and keep only the row(s) for this model name.
models_df = reg.show_models()
models_df.loc[lambda frame: frame['name'] == model_name]

In [None]:
# List the versions registered under this model name.
reg.get_model(model_name).show_versions()

In [None]:
# Make the version logged earlier in this notebook the model's default.
# log_model was called without a version name, so the generated name differs
# from run to run; a hard-coded name such as "FRESH_LION_2" only exists where
# it was originally produced and breaks a fresh Restart & Run All.
# To pin a different version, assign its name (as listed by show_versions()).
m = reg.get_model(model_name)
m.default = titanic_model.version_name

# Read the default back as a model-version object and show which one it is.
mv = m.default
mv.version_name

In [None]:
# Run the default model version's predict_proba on the held-out features
# and preview the first rows of the result.
remote_prediction = mv.run(xtest, function_name="predict_proba")
remote_prediction.head()

In [None]:
# Copy the held-out features into a Snowflake table named test_pd,
# replacing any existing table of that name, then display its contents.
test_sf = session.create_dataframe(xtest)
(
    test_sf.write
    .mode("overwrite")
    .save_as_table("test_pd")
)
session.table("test_pd").show()

In [None]:
# Build a Snowpark DataFrame over the staged CSV file, asking the reader to
# infer the schema and to parse the header row.
titanic_df = (
    session.read
    .options({"infer_schema": True, "PARSE_HEADER": True})
    .csv("@ml_data/titanic.csv")
)
titanic_df.show()

In [None]:
-- Call the TITANIC model's predict_proba on every row of test_pd and expose
-- the output_feature_0 value, rounded to two decimals, as surv_pred.
SELECT
    *,
    ROUND(TITANIC!predict_proba(*):output_feature_0, 2) AS surv_pred
FROM test_pd