In [None]:
# Obtain the Snowpark session that the notebook runtime has already opened.
# `session` is the handle the later cells use to run SQL and to read/write tables.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Sanity check: report which role / warehouse / database / schema the active
# session is currently bound to.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

context_query = """
SELECT 
  CURRENT_ROLE() AS role,
  CURRENT_WAREHOUSE() AS warehouse,
  CURRENT_DATABASE() AS database_name,
  CURRENT_SCHEMA() AS schema_name
"""

# Left as the cell's final expression so the collected rows are displayed.
session.sql(context_query).collect()


In [None]:
# ====== 0. Setup: session & imports ======
import pandas as pd
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col

session = get_active_session()

# Pin the warehouse / database / schema explicitly so the cells below do not
# depend on whatever context the session happened to start with.
for context_stmt in (
    "USE WAREHOUSE ML_WH",
    "USE DATABASE ML_PROJECT",
    "USE SCHEMA PUBLIC",
):
    session.sql(context_stmt).collect()

print("Context set.")

# ====== 1. Load data from Snowflake into a pandas DataFrame ======
# `df_sp` is the Snowpark DataFrame over the CUSTOMERS table; `df` is its
# pandas copy, which the modelling cells operate on.
df_sp = session.table("CUSTOMERS")
print("Row count in Snowpark DF:", df_sp.count())

df = df_sp.to_pandas()
print("Pandas shape:", df.shape)

# Preview the first rows as the cell output.
df.head()


In [None]:
# ====== 2. Basic preprocessing ======
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- Features / target -------------------------------------------------------
# CUSTOMER_ID is an identifier, not a feature: it is dropped from X and carried
# through the split separately (cid_train / cid_test).
target_col = "CHURN"
categorical_cols = ["GENDER", "MEMBERSHIP_TIER"]

y = df[target_col]
X = df.drop(columns=[target_col, "CUSTOMER_ID"])

# Every remaining column that is not listed as categorical is treated as numeric.
numeric_cols = [name for name in X.columns if name not in categorical_cols]

print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

# --- Pipeline: one-hot encode categoricals, pass numerics through, then RF ----
# handle_unknown="ignore" keeps prediction from failing on categories that were
# not seen during fitting.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

model = RandomForestClassifier(n_estimators=150, max_depth=None, random_state=42)

clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# --- Split, fit, evaluate ----------------------------------------------------
# 80/20 split, stratified on the target; fixed seed for reproducibility.
X_train, X_test, y_train, y_test, cid_train, cid_test = train_test_split(
    X,
    y,
    df["CUSTOMER_ID"],
    test_size=0.2,
    stratify=y,
    random_state=42,
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Accuracy on test set:", round(acc, 4))
print("\nClassification report:\n", classification_report(y_test, y_pred))


In [None]:
# ====== 3. Predict for all customers and save to Snowflake ======

# Score every row of X (the rows used for training as well as the held-out ones).
all_preds = clf.predict(X)
# Second column of predict_proba is taken as the churn probability
# (assumes the positive class is the second entry of clf.classes_ - confirm).
all_proba = clf.predict_proba(X)[:, 1]

# One row per customer: id, predicted label, predicted probability.
results_df = df[["CUSTOMER_ID"]].assign(
    PREDICTED_CHURN=all_preds,
    PREDICTED_CHURN_PROB=all_proba,
)

# Preview the first rows as the cell output.
results_df.head()


In [None]:
# Persist the predictions: (re)create the target table with an explicit schema,
# then bulk-load the pandas frame into it.
predictions_ddl = """
    CREATE OR REPLACE TABLE CUSTOMER_CHURN_PREDICTIONS (
        CUSTOMER_ID INT,
        PREDICTED_CHURN boolean,
        PREDICTED_CHURN_PROB FLOAT
    );
"""
session.sql(predictions_ddl).collect()

# overwrite=True so that a re-run replaces the table contents instead of
# appending a second copy of the predictions.
session.write_pandas(
    results_df,
    table_name="CUSTOMER_CHURN_PREDICTIONS",
    overwrite=True,
)

print("Predictions table created & loaded.")


In [None]:
-- Inspect the contents of the churn predictions table.
SELECT *
FROM CUSTOMER_CHURN_PREDICTIONS