In [None]:
# Import python packages
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from snowflake.ml.registry import Registry


# Grab the Snowpark session the notebook is already running under.
# `session` is reused further down to open the model registry
# (`Registry(session=session)`).
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
-- Egress rule listing the hosts this notebook reaches out to:
-- the PyPI hosts (for pip) and raw.githubusercontent.com (for the Titanic CSV).
CREATE OR REPLACE NETWORK RULE pypi_network_rule
    MODE = EGRESS
    TYPE = HOST_PORT
    VALUE_LIST = (
        'pypi.org',
        'raw.githubusercontent.com',
        'pypi.python.org',
        'pythonhosted.org',
        'files.pythonhosted.org'
    );

-- External access integration that exposes the rule above.
CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION pypi_access_integration
    ALLOWED_NETWORK_RULES = (pypi_network_rule)
    ENABLED = TRUE;

-- Optional: uncomment and substitute the role that runs the notebook.
--GRANT USAGE ON INTEGRATION pypi_access_integration TO ROLE my_notebook_role;

In [None]:
# Upgrade xgboost to the newest release available from the package index (no version pin).
# NOTE(review): the first cell already imports xgboost before this upgrade runs, so on a
# fresh Run All the previously installed version presumably stays loaded until the kernel
# is restarted -- confirm, and consider pinning the version for reproducibility.
!pip install xgboost --upgrade

In [None]:
# List the contents of the CONTAINER_NOTEBOOK_DEMO directory.
# NOTE(review): the next cell refers to `container_notebook_demo` in lower case; on a
# case-sensitive filesystem only one of the two spellings can resolve -- confirm which.
!ls CONTAINER_NOTEBOOK_DEMO

In [None]:
# Run the test.py script from the demo directory in a separate Python process.
# NOTE(review): the previous cell lists `CONTAINER_NOTEBOOK_DEMO` in upper case; on a
# case-sensitive filesystem only one of the two spellings can resolve -- confirm which.
!python container_notebook_demo/test.py

In [None]:
# Print every installed package with its exact version, as a record of the environment.
!pip freeze

In [None]:
# Download the seaborn Titanic sample, upper-case every column name, and keep a
# copy of the result as titanic.csv in the working directory.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"

titanic = pd.read_csv(TITANIC_CSV_URL).rename(columns=str.upper)
titanic.to_csv("titanic.csv", index=False)

In [None]:
# Preview the first rows of the freshly loaded frame (all original columns, upper-cased).
titanic.head()

In [None]:
# Display the whole frame through Streamlit.
# `st` comes from the import cell at the top of the notebook, so it is not
# imported a second time here (all imports live in that one cell).
st.dataframe(titanic)

In [None]:
# Columns left out of the model inputs.
# NOTE(review): presumably ALIVE restates the SURVIVED target and the others are
# sparse or redundant -- confirm against the dataset description.
dropped_columns = ["AGE", "DECK", "ALIVE", "ADULT_MALE", "EMBARKED"]

# This rebinds `titanic`, so running the cell a second time raises KeyError
# (the columns are already gone); re-run the load cell first if needed.
titanic = titanic.drop(dropped_columns, axis=1)
titanic.head()

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows.
# The fixed random_state keeps the partition the same on every run.
y = titanic["SURVIVED"]                  # Target
X = titanic.drop(columns=["SURVIVED"])   # Features

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# (rows, columns) of the training and test feature frames
(X_train.shape, X_test.shape)


In [None]:
# Feature columns stored with object dtype; only these are imputed below.
categorical_cols = X_train.select_dtypes(include="object").columns

# Missing entries are replaced by the most frequent value of the column.
imputer = SimpleImputer(strategy="most_frequent")

# Learn the fill values from the training rows only, then reuse the fitted
# imputer on the test rows.
train_filled = imputer.fit_transform(X_train[categorical_cols])
test_filled = imputer.transform(X_test[categorical_cols])

# Write the filled values into copies so X_train / X_test stay untouched.
X_train_imputed = X_train.copy()
X_train_imputed[categorical_cols] = train_filled

X_test_imputed = X_test.copy()
X_test_imputed[categorical_cols] = test_filled

# Preview the imputed training frame
X_train_imputed.head()

In [None]:
# One-hot encode the imputed training data. drop_first=True removes the first
# level of each categorical column, which becomes the all-zeros baseline.
X_train_encoded = pd.get_dummies(X_train_imputed, columns=categorical_cols, drop_first=True)

# Encode the test data WITHOUT drop_first and then project it onto the training
# columns. Applying drop_first independently to the test split would drop the
# first level *present in the test rows*; if that differs from the level dropped
# for the training split, a real category would silently be encoded as all zeros.
# Reindexing to the training columns instead:
#   - discards the training baseline level and any level unseen during training
#     (both end up as the all-zeros baseline), and
#   - adds, filled with 0, any training column that has no match in the test rows.
X_test_encoded = pd.get_dummies(X_test_imputed, columns=categorical_cols).reindex(
    columns=X_train_encoded.columns, fill_value=0
)

# View the one-hot encoded training dataframe
X_train_encoded.head()


In [None]:
# Base estimator for the search. `use_label_encoder` is deliberately not passed:
# the option is deprecated in XGBoost and current releases (this notebook upgrades
# xgboost) only emit an "unused parameter" warning for it.
xgb_model = XGBClassifier(eval_metric='logloss')

# Hyperparameter grid: 2 * 3 * 3 * 2 = 36 combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# using all available cores (n_jobs=-1)
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit on the one-hot encoded training data
grid_search.fit(X_train_encoded, y_train)

# Report the winning combination and its mean cross-validated accuracy
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Accuracy on the held-out test split; `test_accuracy` is also logged as a
# metric when the model is registered further down
test_accuracy = grid_search.score(X_test_encoded, y_test)
print(f"Test set accuracy: {test_accuracy}")


In [None]:
# Keep the estimator that GridSearchCV exposes for the best-scoring parameter combination;
# this is the object registered in the model registry below.
optimal_model = grid_search.best_estimator_

In [None]:
# Open the model registry on the active Snowpark session.
reg = Registry(session=session)

# First 100 encoded training rows, passed to the registry as sample input.
# Stored under its own name so the notebook-level `X` (the full feature frame
# from the train/test split cell) is not rebound to a different object.
sample_input = X_train_encoded.head(100)

# Register the tuned estimator together with its test-set accuracy.
# NOTE(review): model_name / version_name are fixed, so re-running this cell
# presumably fails once "v1" already exists -- confirm against the Registry docs.
mv = reg.log_model(
    model=optimal_model,
    model_name="Titanic_pd",
    version_name="v1",
    conda_dependencies=["scikit-learn", "xgboost"],
    comment="Scikit Model",
    metrics={"Accuracy": test_accuracy},
    sample_input_data=sample_input,
)

In [None]:
# Display the models currently known to the registry (the cell output is the returned listing).
reg.show_models()

In [None]:
# Look the model up by name and point `mv` at the registry's default version of it
# (this replaces the `mv` returned by log_model above).
m = reg.get_model("Titanic_pd")
mv = m.default
# Show which version name the default resolves to
mv.version_name

In [None]:
# Call the registered version's "predict" function on the encoded training features
# (note: these are the training rows, not the held-out test split) and preview the result.
remote_prediction = mv.run(X_train_encoded, function_name="predict")
remote_prediction.head()