In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
# Obtain the Snowpark session handle; later cells call `session.table(...)`
# on it to read the training table from Snowflake.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [None]:
import numpy as np
import pandas as pd
import random

# Seed both random sources. All draws below come from NumPy's global
# generator, so seeding only the stdlib `random` module would leave the
# dataset different on every run.
random.seed(42)
np.random.seed(42)

# Parameters
num_rows = 500000
num_numerical_cols = 40
num_categorical_cols = 10
num_classes = 2  # for the target variable

# Generating numerical data (standard-normal floats)
numerical_data = np.random.randn(num_rows, num_numerical_cols)

# Generating categorical data (one of eleven single-letter labels per cell)
categorical_data = np.random.choice(['A', 'B', 'C', 'D','E','F','G','H','I','J','K'], size=(num_rows, num_categorical_cols))

# Column names: numerical columns first, then categorical columns
numerical_names = [f'num_col_{i+1}' for i in range(num_numerical_cols)]
categorical_names = [f'cat_col_{i+1}' for i in range(num_categorical_cols)]
column_names = numerical_names + categorical_names

# Combining into a DataFrame.
# The two arrays are wrapped separately and concatenated column-wise:
# stacking a float array with a string array via np.hstack would coerce every
# value to a string, turning the numerical columns into object dtype.
data = pd.concat(
    [
        pd.DataFrame(numerical_data, columns=numerical_names),
        pd.DataFrame(categorical_data, columns=categorical_names),
    ],
    axis=1,
)

# Generating a target variable (integer class label in [0, num_classes))
data['target'] = np.random.choice(range(num_classes), num_rows)

# Display dataset summary
print(data.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
data.info()

### Local Training

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the training table through the Snowpark session and materialise it as
# a local pandas DataFrame for the scikit-learn workflow below.
snowpark_df = session.table("TPCDS_XGBOOST.DEMO.XGBOOST500k")
data = snowpark_df.to_pandas()

# Separate the feature matrix from the label column.
X = data.drop('TARGET', axis=1)
y = data['TARGET']

# Identify numerical and categorical columns.
# 'number' matches every numeric width, not just float64/int64. Columns that
# match neither selector are dropped by the ColumnTransformer below (its
# default remainder is 'drop'), so a narrow dtype list silently loses features.
# NOTE(review): the Snowflake connector may return narrower integer dtypes
# (int8/int16/int32) for small values - confirm against the table schema.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data; categories unseen during fit are
# ignored at transform time rather than raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])



In [None]:
# Classifier settings for GPU training.
# CPU alternative: tree_method="hist", predictor="cpu_predictor".
xgb_params = {
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
}
xgb_model = XGBClassifier(**xgb_params)

# Chain the column preprocessing and the classifier so both are fitted and
# applied together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', xgb_model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Summary statistics of the training features, shown as a sanity check before fitting.
X_train.describe() 

In [None]:
# Train the whole pipeline on the training split, then label the held-out rows.
# (fit() returns the fitted pipeline, so predict() can be chained onto it.)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

### Snowpark

In [None]:
# Preview the first rows of the locally loaded pandas frame before moving on
# to the Snowpark workflow.
data.head()

In [None]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col
from snowflake.snowpark.types import StringType
import pandas as pd
import numpy as np
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.preprocessing import OneHotEncoder, MinMaxScaler
from snowflake.ml.modeling.impute import SimpleImputer

from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.model_selection import GridSearchCV
import re
import snowflake.snowpark.functions as F

#data.columns = data.columns.str.upper()
#input_df = session.create_dataframe(data)
#cols = input_df.columns
#for old_col in cols:
#    new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col)
#    new_col = new_col.upper()
#    input_df = input_df.rename(F.col(old_col), new_col)



#input_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.XGBOOST100k')

# Load the data from a Snowflake table and split it 80/20 into train and test
# sets; the fixed seed keeps the split the same between runs.
snowpark_df = session.table("TPCDS_XGBOOST.DEMO.XGBOOST500k")
snowdf_train, snowdf_test = snowpark_df.random_split([0.8, 0.2], seed=82)
target_col = ["TARGET"]

# Select numerical and categorical columns:
# NUM_COL_1 .. NUM_COL_40 and CAT_COL_1 .. CAT_COL_10.
numerical_cols = [f"NUM_COL_{i}" for i in range(1, 41)]
categorical_cols = [f"CAT_COL_{i}" for i in range(1, 11)]

# Cast every numerical column to FLOAT on BOTH splits. Casting only the
# training split would leave the test split with a different schema from the
# one the pipeline is fitted on.
for col_name in numerical_cols:
    float_col = col(col_name).cast("FLOAT")
    snowdf_train = snowdf_train.with_column(col_name, float_col)
    snowdf_test = snowdf_test.with_column(col_name, float_col)

# Preprocessing + model pipeline:
#   OHE        - one-hot encodes the categorical columns (inputs dropped,
#                unknown categories ignored)
#   MMS / SS   - min-max scaling (clipped) followed by standardisation of the
#                numerical columns, written back in place
#   classifier - XGBoost classifier trained against TARGET
# NOTE(review): standardising right after min-max scaling looks redundant,
# and tree models do not need scaled inputs - confirm whether both scalers
# are intended.
# Alternative classifier settings tried previously:
#   tree_method="hist", predictor="cpu_predictor"      (CPU)
#   tree_method="gpu_hist", predictor="gpu_predictor"  (GPU)
pipeline = Pipeline(
    steps=[
        ("OHE", OneHotEncoder(input_cols=categorical_cols, output_cols=categorical_cols, drop_input_cols=True, handle_unknown='ignore')),
        ("MMS", MinMaxScaler(clip=True, input_cols=numerical_cols, output_cols=numerical_cols)),
        ("SS", StandardScaler(input_cols=numerical_cols, output_cols=numerical_cols)),
        ("classifier", XGBClassifier(label_cols=target_col)),
    ]
)


In [None]:
# Number of rows in the training split, as a check on the 80/20 split size.
snowdf_train.count()

In [None]:
# Fit the pipeline on the training split, then predict on the same rows.
# fit() + predict() is used instead of fit_predict().
# NOTE(review): Pipeline.fit_predict appears to be available only when the
# final estimator implements fit_predict, which XGBClassifier does not -
# confirm against the installed snowflake-ml-python version.
pipeline.fit(snowdf_train)
y_pred1 = pipeline.predict(snowdf_train)

In [None]:
from snowflake.ml.modeling.metrics import accuracy_score

# Fit the model on the training split.
pipeline.fit(snowdf_train)

# Give the held-out split the same FLOAT cast that was applied to the training
# split, so the fitted pipeline sees the schema it was trained on. Re-casting
# a column that is already FLOAT is harmless.
snowdf_eval = snowdf_test
for col_name in numerical_cols:
    snowdf_eval = snowdf_eval.with_column(col_name, col(col_name).cast("FLOAT"))

# Make predictions on the held-out split. The result is a Snowpark DataFrame
# holding the input rows plus the prediction column(s).
y_pred = pipeline.predict(snowdf_eval)

# Evaluate the model.
# The Snowpark ML accuracy_score takes keyword-only arguments and compares two
# columns of ONE DataFrame; it cannot be fed the local pandas `y_test`.
# Prediction columns default to "OUTPUT_<label column>".
predicted_cols = [f"OUTPUT_{label}" for label in target_col]
accuracy = accuracy_score(
    df=y_pred,
    y_true_col_names=target_col,
    y_pred_col_names=predicted_cols,
)
print(f"Model accuracy: {accuracy:.2f}")

In [None]:
# Display the most recent prediction result produced by the cell above.
y_pred

In [None]:
# Display the classifier object created in the local-training section.
# NOTE(review): this is not the classifier inside the Snowpark pipeline - confirm which one is meant.
xgb_model

In [None]:
# Make predictions with the already-fitted pipeline. This scores the training
# split, so the number below is a training accuracy.
y_pred = pipeline.predict(snowdf_train)

# Evaluate the model.
# The Snowpark ML accuracy_score takes keyword-only arguments and compares two
# columns of ONE DataFrame; the local pandas `y_test` is unrelated to these
# predictions. Prediction columns default to "OUTPUT_<label column>".
predicted_cols = [f"OUTPUT_{label}" for label in target_col]
accuracy = accuracy_score(
    df=y_pred,
    y_true_col_names=target_col,
    y_pred_col_names=predicted_cols,
)
print(f"Model accuracy: {accuracy:.2f}")