# Introduction

BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine.

`bigframes.pandas provides a pandas-compatible API for analytics.`

`bigframes.ml provides a scikit-learn-like API for ML.`

BigQuery DataFrames is an open-source package. You can run pip install --upgrade bigframes to install the latest version.



#### [Limitations](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations)


# XGBoost
This demo shows how we can implement a XGBoost model in BigQuery DataFrames ML, with API that is exactly compatible with [xgboost](https://xgboost.readthedocs.io/en/stable/).

## 1. Init & load data

In [2]:
# Initialize BigQuery DataFrame
import os 
import numpy as np
import pandas as pd
import seaborn as sns
import bigframes.pandas as bpd
import matplotlib.pyplot as plt
from google.cloud import bigquery
import bigframes.ml.metrics as bmetrics
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.model_selection import train_test_split
from bigframes.ml.ensemble import XGBClassifier, XGBRegressor
from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder


#Set environment variables for your notebook
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'key.json'
project_id = "rudderstacktestbq"

# Initialize the BigQuery client with your project ID
client = bigquery.Client(project=project_id)

# read a BigQuery table to a BigQuery DataFrame
df_feature1 = bpd.read_gbq("rudderstacktestbq.PROFILES_INTEGRATION_TEST.Material_user_var_table_839ca5d3_27",use_cache=False) 
df_label1 = bpd.read_gbq("rudderstacktestbq.PROFILES_INTEGRATION_TEST.Material_user_var_table_839ca5d3_28",use_cache=False)

df_feature2 = df_feature1.copy()
df_label2 = df_label1.copy()

# take a peek at the dataframe
df_feature1.head(5)

Unnamed: 0,user_main_id,max_timestamp_bw_tracks_pages,days_since_last_seen,gross_amt_spent_in_past_90_days,n_orders_completed,total_sessions_till_date
0,rid82e1895a3dfe7016b31cd1be2df391a3,2024-02-23 07:14:24+00:00,12,85,2,2
1,riddebda526ff83fc8074a66518e798deb9,2024-09-19 05:24:13+00:00,-197,0,0,5
2,ridb3d4c2cce83e68befecebefd5985d5b8,2025-01-13 19:20:40+00:00,-313,0,0,5
3,ridaf45b47a9e3d7aefb947de86fa69324a,2024-04-06 17:01:31+00:00,-31,0,0,1
4,ridb087ba8f178c645584098db5d65e9eb5,2025-04-04 02:41:56+00:00,-394,0,0,4


## 2. Getting Final Table

In [2]:
# creating the target column
def create_label(df_label,df_feature):
    df_feature['target'] = df_feature['days_since_last_seen'].isin([1]).map({True:1,False:0})
    return df_feature

df_feature1 = create_label(df_label1,df_feature1)
df_feature2 = create_label(df_label2,df_feature2)

df = bpd.concat([df_feature1, df_feature2], ignore_index=True)

In [3]:
df.target.value_counts()

target
0    40388
1       84
Name: count, dtype: Int64

## 3. Data cleaning / prep

In [4]:
# drop columns we don't care about,rows with nulls to get our training data
training_data = df.drop(columns=["user_main_id",'max_timestamp_bw_tracks_pages','days_since_last_seen']).dropna()

## 4. Use `model_selection.train_test_split` to prepare training data

In [5]:
feature_columns = training_data.drop(columns=['target'])
label_columns = training_data[['target']] 

X_train, X_test, y_train, y_test = train_test_split(feature_columns, label_columns, test_size=0.2)

## 5. Configure a XGBoost classification pipeline with preprocessing

In [6]:
preprocessing = ColumnTransformer([
  ("scaler", StandardScaler(), training_data.drop(columns=['target']).columns.tolist()),
])
  
model = XGBClassifier(n_estimators=50, learning_rate = 0.1, enable_global_explain = True)

pipeline = Pipeline([
  ('preproc', preprocessing),
  ('xgb', model)
])

pipeline

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['gross_amt_spent_in_past_90_days',
                                                   'n_orders_completed',
                                                   'total_sessions_till_date'])])),
                ('xgb',
                 XGBClassifier(enable_global_explain=True, learning_rate=0.1,
                               n_estimators=50))])

## 6. Fit the pipeline to the training data

This will create a temporary BQML model in BigQuery

In [7]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['gross_amt_spent_in_past_90_days',
                                                   'n_orders_completed',
                                                   'total_sessions_till_date'])])),
                ('xgb',
                 XGBClassifier(enable_global_explain=True, learning_rate=0.1,
                               n_estimators=50))])

## 7. Score the pipeline on the test data

In [9]:
y_pred = pipeline.predict(X_test)["predicted_target"]

precision = bmetrics.precision_score(y_test, y_pred,average=None)
recall = bmetrics.recall_score(y_test, y_pred,average=None)
f1 = bmetrics.f1_score(y_test, y_pred,average=None)

print('precision : ',precision)
print('recall : ',recall)
print('f1 : ',f1)

  precision_score.loc[i] = precision.loc[i]


precision :  0    0.997776
1    0.000000
dtype: float64
recall :  0    1
1    0
dtype: int64
f1 :  0    0.998887
1    0.000000
dtype: float64


  precision_score.loc[i] = precision.loc[i]
  f1_score[index] = (


## 8. Plots

In [10]:
def plot_roc_auc_curve(y_pred, y_true, roc_auc_file) -> None:
    fpr, tpr, _ = bmetrics.roc_curve(y_true, y_pred,drop_intermediate=False)
    roc_auc = bmetrics.auc(fpr, tpr)
    sns.set(style="ticks", context="notebook")
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color="b", label=f"ROC AUC = {roc_auc:.2f}", linewidth=2)
    plt.plot([0, 1], [0, 1], color="gray", linestyle="--", linewidth=2)
    plt.title("ROC Curve (Test Data)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    sns.despine()
    plt.grid(True)
    plt.savefig(roc_auc_file)
    plt.clf()

plot_roc_auc_curve(y_pred,y_test,'roc_auc_plot.png')

<Figure size 800x600 with 0 Axes>

In [11]:
def plot_pr_auc_curve(y_pred, y_true, pr_auc_file) -> None:
    # precision, recall, _ = precision_recall_curve(y_true, y_pred)
    precision = bpd.Series(bmetrics.precision_score(y_true, y_pred,average=None))
    recall = bpd.Series(bmetrics.recall_score(y_true, y_pred,average=None))
    pr_auc = bmetrics.auc(recall, precision)
    sns.set(style="ticks", context="notebook")
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color="b", label=f"PR AUC = {pr_auc:.2f}", linewidth=2)
    plt.ylim([int(min(precision) * 20) / 20, 1.0])
    plt.xlim([int(min(recall) * 20) / 20, 1.0])
    plt.title("Precision-Recall Curve (Test data)")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc="lower left")
    sns.despine()
    plt.grid(True)
    plt.savefig(pr_auc_file)
    plt.clf()

plot_pr_auc_curve(y_pred,y_test,'pr_auc_plot.png')

  precision_score.loc[i] = precision.loc[i]


<Figure size 800x600 with 0 Axes>

In [12]:
def plot_lift_chart(y_pred, y_true, lift_chart_file) -> None:
    """Generates a lift chart for a binary classification model."""

    data = bpd.concat([y_true, y_pred], axis=1)
    data  = data.rename(columns={'target': 'label', 'predicted_target': 'pred'})
    data = data.sort_values(by="pred", ascending=False)
    data["cum_tp"] = data["label"].cumsum()


    # data = pd.DataFrame()
    # data["label"] = y_true
    # data["pred"] = y_pred

    # sorted_indices = np.argsort(data["pred"].values, kind="heapsort")[::-1]
    cumulative_actual = data["cum_tp"] #np.cumsum(data["label"][sorted_indices].values)
    cumulative_percentage = np.linspace(0, 1, len(cumulative_actual) + 1)

    sns.set(style="ticks", context="notebook")
    plt.figure(figsize=(8, 6))
    print(type(cumulative_actual))
    sns.lineplot(
        x=cumulative_percentage * 100,
        y=np.array([0] + list(100 * cumulative_actual / cumulative_actual.iloc[-1])),
        linewidth=2,
        color="b",
        label="Model Lift curve",
    )
    sns.despine()
    plt.plot(
        [0, 100 * data["label"].mean()],
        [0, 100],
        color="red",
        linestyle="--",
        label="Best Case",
        linewidth=1.5,
    )
    plt.plot(
        [0, 100],
        [0, 100],
        color="black",
        linestyle="--",
        label="Baseline",
        linewidth=1.5,
    )

    plt.title("Cumulative Gain Curve")
    plt.xlabel("Percentage of Predicted Target Users")
    plt.ylabel("Percent of Actual Target Users")
    plt.ylim([0, 100])
    plt.xlim([0, 100])
    plt.legend()
    plt.grid(True)
    plt.savefig(lift_chart_file)
    plt.clf()

plot_lift_chart(y_pred,y_test,'lift_chart_plot.png')

<class 'bigframes.series.Series'>


<Figure size 800x600 with 0 Axes>

## 9. Inference the model on new data

In [13]:
import pandas

new_data = bpd.read_pandas(
        pandas.DataFrame(
            {
                "gross_amt_spent_in_past_90_days": [0,5,100],
                "total_sessions_till_date": [1,2,3],
                "n_orders_completed": [0,5,2],

            }
        )
    )

In [14]:
pipeline.predict(new_data)

Unnamed: 0,predicted_target,predicted_target_probs,gross_amt_spent_in_past_90_days,total_sessions_till_date,n_orders_completed
0,0,"[{'label': 1, 'prob': 0.06780016422271729}  {'...",0,1,0
1,0,"[{'label': 1, 'prob': 0.06780016422271729}  {'...",5,2,5
2,0,"[{'label': 1, 'prob': 0.06780016422271729}  {'...",100,3,2


## 10. Save in BigQuery

In [15]:
pipeline.to_gbq("rudderstacktestbq.PROFILES_INTEGRATION_TEST.bigframes_xgb_test", replace=True)

Pipeline(steps=[('transform', StandardScaler()),
                ('estimator',
                 XGBClassifier(booster='GBTREE', learning_rate=0.1,
                               n_estimators=50, tree_method='AUTO'))])

## 11. Feature Importance using Bigquey ML

In [3]:
bpd.read_gbq_query('''SELECT
  *
FROM
  ML.FEATURE_IMPORTANCE(MODEL rudderstacktestbq.PROFILES_INTEGRATION_TEST.bigframes_xgb_test)''')

Unnamed: 0,feature,importance_weight,importance_gain,importance_cover
0,standard_scaled_n_orders_completed,0,0.0,0.0
1,standard_scaled_gross_amt_spent_in_past_90_days,0,0.0,0.0
2,standard_scaled_total_sessions_till_date,800,0.142641,6169.472687
