# Hootsuite Intelligence - ML Models

This notebook trains and registers three machine learning models for the Hootsuite Intelligence Agent:
1. **Post Engagement Predictor**: Predicts engagement rate based on post characteristics.
2. **Churn Risk Predictor**: Predicts probability of customer churn.
3. **Optimal Time Predictor**: Predicts engagement rate for different posting times.

## Prerequisites
- Database: `HOOTSUITE_INTELLIGENCE`
- Warehouse: `HOOTSUITE_WH`
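- Feature views created in the `ANALYTICS` schema: `V_POST_ENGAGEMENT_FEATURES`, `V_CHURN_RISK_FEATURES`, and `V_OPTIMAL_TIME_FEATURES`
- Schema: `ML_MODELS` (the model registry used below logs all three models there)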

In [None]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import * 
from snowflake.ml.registry import Registry
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, StandardScaler
from snowflake.ml.modeling.xgboost import XGBRegressor, XGBClassifier
import pandas as pd

# Session context: the database, schema, and warehouse this notebook runs against.
DATABASE_NAME = 'HOOTSUITE_INTELLIGENCE'
SCHEMA_NAME = 'ANALYTICS'
WAREHOUSE_NAME = 'HOOTSUITE_WH'

# Obtain a Snowpark session (assumes a notebook environment), then point it at
# the database, schema, and warehouse named above, in that order.
session = Session.builder.getOrCreate()
session.use_database(DATABASE_NAME)
session.use_schema(SCHEMA_NAME)
session.use_warehouse(WAREHOUSE_NAME)

## Model 1: Post Engagement Predictor

Predicts `ENGAGEMENT_RATE` using `V_POST_ENGAGEMENT_FEATURES`.

In [None]:
# 1. Load Data from Feature View
df_engagement = session.table('ANALYTICS.V_POST_ENGAGEMENT_FEATURES')

# 2. Define Features and Target
# cat_cols are one-hot encoded; num_cols are standardized; target_col is the label.
cat_cols = ['MEDIA_TYPE']
# NOTE(review): IS_VERIFIED is standardized together with the other numeric
# columns -- confirm the view exposes it as a numeric 0/1 rather than BOOLEAN.
num_cols = ['FOLLOWER_COUNT', 'IS_VERIFIED', 'DELAY_HOURS', 'HOUR_OF_DAY', 'DAY_OF_WEEK', 'SENTIMENT_SCORE']
target_col = 'ENGAGEMENT_RATE'

# 3. Build Pipeline
# - 'ohe' encodes MEDIA_TYPE and drops the raw input column (drop_input_cols=True).
# - 'scaler' writes the scaled values back under the same column names.
# - 'model' is given only label_cols; no input_cols are passed.
#   NOTE(review): presumably every remaining non-label column is then used as
#   a feature -- confirm the view exposes no columns beyond the ones listed above.
pipeline_engagement = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(input_cols=cat_cols, output_cols=cat_cols, drop_input_cols=True)),
        ('scaler', StandardScaler(input_cols=num_cols, output_cols=num_cols)),
        ('model', XGBRegressor(
            label_cols=[target_col]
        ))
    ]
)

# Drop ID columns not used for training
train_df = df_engagement.drop('POST_ID')

# Train (the label column stays in train_df so the estimator can read it)
model_engagement = pipeline_engagement.fit(train_df)

# 4. Register Model
reg = Registry(session=session, database_name='HOOTSUITE_INTELLIGENCE', schema_name='ML_MODELS')
mv_engagement = reg.log_model(
    model=model_engagement,
    model_name='POST_ENGAGEMENT_PREDICTOR',
    # The sample must look like inference-time input, so the label column is
    # removed; callers of the registered model do not supply ENGAGEMENT_RATE.
    sample_input_data=train_df.drop(target_col).limit(10),
    comment='Predicts post engagement rate based on metadata'
)
print('Model POST_ENGAGEMENT_PREDICTOR registered.')

## Model 2: Churn Risk Predictor

Predicts `IS_CHURNED` using `V_CHURN_RISK_FEATURES`.

In [None]:
# 1. Load Data
df_churn = session.table('ANALYTICS.V_CHURN_RISK_FEATURES')

# 2. Define Features
# cat_cols are one-hot encoded; num_cols are standardized; target_col is the label.
cat_cols = ['PLAN_TIER']
num_cols = ['EMPLOYEE_COUNT', 'SUBSCRIPTION_DAYS', 'ACTIVE_USERS_COUNT', 'SUPPORT_TICKETS_COUNT', 'AVG_SATISFACTION_SCORE']
target_col = 'IS_CHURNED'

# 3. Pipeline
# - 'ohe' encodes PLAN_TIER and drops the raw input column (drop_input_cols=True).
# - 'scaler' writes the scaled values back under the same column names.
# - 'model' is given only label_cols; no input_cols are passed.
#   NOTE(review): presumably every remaining non-label column is then used as
#   a feature -- confirm the view exposes no columns beyond the ones listed above.
pipeline_churn = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(input_cols=cat_cols, output_cols=cat_cols, drop_input_cols=True)),
        ('scaler', StandardScaler(input_cols=num_cols, output_cols=num_cols)),
        ('model', XGBClassifier(
            label_cols=[target_col]
        ))
    ]
)

# Drop the organization identifier before training; the label column stays in
# train_df_churn so the estimator can read it.
train_df_churn = df_churn.drop('ORGANIZATION_ID')
model_churn = pipeline_churn.fit(train_df_churn)

# 4. Register
# `reg` is the Registry created in the Model 1 cell; run that cell first.
mv_churn = reg.log_model(
    model=model_churn,
    model_name='CHURN_RISK_PREDICTOR',
    # The sample must look like inference-time input, so the label column is
    # removed; callers of the registered model do not supply IS_CHURNED.
    sample_input_data=train_df_churn.drop(target_col).limit(10),
    comment='Predicts organization churn probability'
)
print('Model CHURN_RISK_PREDICTOR registered.')

## Model 3: Optimal Time Predictor

Predicts `ENGAGEMENT_RATE` from posting time (hour of day, day of week), industry, and network using `V_OPTIMAL_TIME_FEATURES`.

In [None]:
# 1. Load Data
df_time = session.table('ANALYTICS.V_OPTIMAL_TIME_FEATURES')

# 2. Features
# cat_cols are one-hot encoded. num_cols is declared for reference only: this
# pipeline has no scaler step, so no step below consumes it.
cat_cols = ['INDUSTRY', 'NETWORK']
num_cols = ['HOUR_OF_DAY', 'DAY_OF_WEEK']
target_col = 'ENGAGEMENT_RATE'

# 3. Pipeline
# - 'ohe' encodes INDUSTRY and NETWORK and drops the raw input columns.
# - 'model' is given only label_cols; no input_cols are passed.
#   NOTE(review): presumably every remaining non-label column is then used as
#   a feature -- confirm the view exposes no columns beyond the ones listed above.
pipeline_time = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(input_cols=cat_cols, output_cols=cat_cols, drop_input_cols=True)),
        ('model', XGBRegressor(
            label_cols=[target_col]
        ))
    ]
)

# Drop the post identifier before training; the label column stays in
# train_df_time so the estimator can read it.
train_df_time = df_time.drop('POST_ID')
model_time = pipeline_time.fit(train_df_time)

# 4. Register
# `reg` is the Registry created in the Model 1 cell; run that cell first.
mv_time = reg.log_model(
    model=model_time,
    model_name='OPTIMAL_TIME_PREDICTOR',
    # The sample must look like inference-time input, so the label column is
    # removed; callers of the registered model do not supply ENGAGEMENT_RATE.
    sample_input_data=train_df_time.drop(target_col).limit(10),
    comment='Predicts engagement rate for time optimization'
)
print('Model OPTIMAL_TIME_PREDICTOR registered.')

In [None]:
# Verify models: list what the registry used above now contains.
# `reg` is the Registry created in the Model 1 cell; run that cell first.
# The result is left as the cell's last expression so the notebook renders it
# as a table rather than the truncated plain text that print() produces.
models = reg.show_models()
models