# Customer churn analysis


# Machine Learning Pipeline

In the following notebooks, we will go through the implementation of each one of the steps in the Machine Learning Pipeline. 

We will discuss:

1. Data Preparation and Analysis
2. Feature Engineering
3. Feature Selection
4. Model Training
5. Obtaining Predictions / Scoring

## Import Python and Snowpark Packages

In [None]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import udf


import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import datetime as dt
import numpy as np
import seaborn as sns

import json
import ast

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
#from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, RepeatedStratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline



from sklearn.datasets import make_classification
from snowflake.snowpark import Session, DataFrame
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV
from snowflake.snowpark.functions import udf, col, lit, translate, is_null, iff
from snowflake.snowpark.version import VERSION
# setup pipeline

#transformations
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

#snowpark_ml
from snowflake.ml._internal.utils import identifier
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.registry import model_registry
from snowflake.ml.registry import Registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col
from snowflake.ml.modeling.preprocessing import MinMaxScaler
from snowflake.ml.modeling.preprocessing import OrdinalEncoder
from snowflake.ml.modeling.pipeline import Pipeline

#Classifier
from sklearn.ensemble import RandomForestClassifier

#Pipeline
from sklearn.pipeline import make_pipeline

#Model Accuracy
from sklearn.metrics import balanced_accuracy_score

# to save the trained scaler class
import joblib

## Connect to Snowflake Account and Create a Session

In [None]:

# Read the connection properties from creds.json and open a Snowpark session.
with open("creds.json", "r") as creds_file:
    snowflake_conn_prop = json.load(creds_file)

session = Session.builder.configs(snowflake_conn_prop).create()
session.sql_simplifier_enabled = True

snowpark_version = VERSION
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()

# Current Environment Details: build every report line first, then print them
# in the same order as before.
environment_report = (
    '\nConnection Established with the following parameters:',
    'User                        : {}'.format(snowflake_environment[0][0]),
    'Role                        : {}'.format(session.get_current_role()),
    'Database                    : {}'.format(session.get_current_database()),
    'Schema                      : {}'.format(session.get_current_schema()),
    'Warehouse                   : {}'.format(session.get_current_warehouse()),
    'Snowflake version           : {}'.format(snowflake_environment[0][1]),
    'Snowpark for Python version : {}.{}.{}'.format(*snowpark_version[:3]),
)
for report_line in environment_report:
    print(report_line)

In [None]:
%%time
# Reference the TRAIN_CHURN_DATASET_BIN table through the session and preview five rows.
final_data = session.table('TRAIN_CHURN_DATASET_BIN')
final_data.show(5)

In [None]:
# Keep the CUSTOMERID column on its own and preview five rows of it.
# NOTE(review): CUSTOMERID is not referenced again in the visible cells — confirm it is still needed.
CUSTOMERID = final_data.select("CUSTOMERID")
CUSTOMERID.show(5)

In [None]:
# Remove the CUSTOMERID column from the working dataset, then preview five rows.
# NOTE(review): the trailing "TENUREMONTHS" note looks like a leftover drop candidate — confirm.
final_data = final_data.drop("CUSTOMERID") # "TENUREMONTHS"
final_data.show(5)

In [None]:
# Display the column names that remain in final_data.
final_data.columns

In [None]:
# Columns passed to the imputer as input and output columns.
# NOTE(review): the names suggest one-hot indicator columns produced upstream — confirm.
cat_cols = ['PHONESERVICE_YES',
 'MULTIPLELINES_NO_PHONE_SERVICE',
 'MULTIPLELINES_YES',
 'INTERNETSERVICE_FIBER_OPTIC',
 'INTERNETSERVICE_NO',
 'ONLINESECURITY_NO_INTERNET_SERVICE',
 'ONLINESECURITY_YES',
 'ONLINEBACKUP_NO_INTERNET_SERVICE',
 'ONLINEBACKUP_YES',
 'DEVICEPROTECTION_NO_INTERNET_SERVICE',
 'DEVICEPROTECTION_YES',
 'TECHSUPPORT_NO_INTERNET_SERVICE',
 'TECHSUPPORT_YES',
 'STREAMINGTV_NO_INTERNET_SERVICE',
 'STREAMINGTV_YES',
 'STREAMINGMOVIES_NO_INTERNET_SERVICE',
 'STREAMINGMOVIES_YES',
 'CONTRACT_ONE_YEAR',
 'CONTRACT_TWO_YEAR',
 'PAPERLESSBILLING_YES',
 'PAYMENTMETHOD_CREDIT_CARD',
 'PAYMENTMETHOD_ELECTRONIC_CHECK',
 'PAYMENTMETHOD_MAILED_CHECK',
 'MONTHLYCHARGESBIN_LOW',
 'MONTHLYCHARGESBIN_MEDIUM',
 'TOTALCHARGESBIN_LOW',
 'TOTALCHARGESBIN_MEDIUM',]

# Numeric column names.
# NOTE(review): num_cols is not used by any visible cell — confirm whether it is still needed.
num_cols = ['TENUREMONTHS',
 'MONTHLYCHARGES',
 'TOTALCHARGES']

In [None]:
# Impute the cat_cols columns in place (same input and output names) using the
# most frequent value of each column.
impute_cat = SimpleImputer(
    strategy="most_frequent",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)

# Fit on the full dataset, then replace final_data with the transformed frame.
fitted_imputer = impute_cat.fit(final_data)
final_data = fitted_imputer.transform(final_data)
final_data.show()

In [None]:
# Re-alias every column to the upper-cased version of its name. The loop
# variable is named column_name so it does not reuse the imported `col`.
new_columns = [
    final_data[column_name].alias(column_name.upper())
    for column_name in final_data.columns
]
final_data = final_data.select(*new_columns)

# Preview the re-aliased DataFrame
final_data.show()

# Separate dataset into train and test

It is important to separate our data into training and testing sets.

When we engineer features, some techniques learn parameters from data. It is important to learn these parameters only from the train set. This is to avoid over-fitting.

In [None]:
# Split final_data into train and test sets with weights 0.8 / 0.2.
# The split uses the DataFrame's own random_split (not an sklearn function);
# a fixed seed (25) is passed — presumably to keep the split reproducible.
train_df, test_df = final_data.random_split(weights=[0.8, 0.2], seed=25)


In [None]:
# Preview five rows of the training split.
train_df.show(5)

In [None]:
# Preview five rows of the test split.
test_df.show(5)

In [None]:
# Hyperparameter grid handed to the grid search: every combination of the
# values below is a candidate (5 * 5 * 3 * 5 = 375 combinations).
parameters = dict(
    n_estimators=[100 * step for step in range(1, 6)],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3, 4, 5],
)

In [None]:
# Display the hyperparameter grid.
parameters

In [None]:
# Resize the session's current warehouse to LARGE.
# [1:-1] drops the first and last character of the reported warehouse name
# (presumably the surrounding double quotes — confirm).
warehouse_name = session.get_current_warehouse()[1:-1]
scale_up_statement = f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE=LARGE;"
session.sql(scale_up_statement).collect()

In [None]:
# Every column of the training split except the CHURNVALUE label is a feature.
feature_cols = train_df.drop("CHURNVALUE").columns

# Grid search over `parameters` with an XGBClassifier, scored by accuracy.
# Predictions are written to PRED_CHURNVALUE.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    scoring="accuracy",
    n_jobs=-1,
    input_cols=feature_cols,
    label_cols="CHURNVALUE",
    output_cols="PRED_CHURNVALUE",
)

# Fit on the training split
grid_search.fit(train_df)

In [None]:
# Resize the session's current warehouse to XSMALL.
# [1:-1] drops the first and last character of the reported warehouse name
# (presumably the surrounding double quotes — confirm).
warehouse_name = session.get_current_warehouse()[1:-1]
scale_down_statement = f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE=XSMALL;"
session.sql(scale_down_statement).collect()

In [None]:
# Run the fitted grid search on the test split and keep the resulting frame.
result = grid_search.predict(test_df)

In [None]:
# Compare the CHURNVALUE column with the PRED_CHURNVALUE column of `result`
# and report the accuracy score.
accuracy = accuracy_score(
    df=result, y_true_col_names="CHURNVALUE", y_pred_col_names="PRED_CHURNVALUE"
)

print(f"Accuracy: {accuracy}")

In [None]:
# Tabulate every hyperparameter combination next to its mean test score.
results = grid_search.to_sklearn().cv_results_
param_sets = results["params"]

# Parameter names in order of first appearance across the combinations.
param_names = list(dict.fromkeys(name for combo in param_sets for name in combo))

data = {"accuracy": results["mean_test_score"]}
# One column per parameter; a combination that lacks a parameter gets None.
data.update(
    {name: [combo.get(name) for combo in param_sets] for name in param_names}
)

# Create DataFrame, best accuracy first (original row labels are kept)
hp_df = pd.DataFrame(data).sort_values(by="accuracy", ascending=False)
hp_df.head()

# Model Registry

In [None]:
# Take the best estimator found by the grid search and read back the
# hyperparameter values it was fitted with.
optimal_model = grid_search.to_sklearn().best_estimator_
optimal_n_estimators = optimal_model.n_estimators
optimal_learning_rate = optimal_model.learning_rate
optimal_max_depth = optimal_model.max_depth
optimal_min_child_weight = optimal_model.min_child_weight
# hp_df is sorted by accuracy (descending) but keeps its original row labels,
# so hp_df["accuracy"][0] would return the score of grid combination #0, not
# the best one. Positional access returns the top row.
optimal_accuracy = hp_df["accuracy"].iloc[0]

In [None]:
# create function to add one to our model number if it already exists

def check_and_update(df, model_name):
    """Return the version name to use for the next logged ``model_name``.

    Args:
        df: pandas DataFrame listing registered models. It must have a
            ``name`` column and a ``versions`` column whose cells hold the
            model's version names as a string-encoded list
            (e.g. ``'["V_1", "V_2"]'``).
        model_name: Name of the model that is about to be logged.

    Returns:
        ``"V_1"`` when the model is not listed or has no ``<prefix>_<n>``
        style versions; otherwise the highest-numbered existing version with
        its number incremented by one (e.g. ``"V_3"``).
    """
    if df.empty:
        return "V_1"

    model_rows = df[df["name"] == model_name]
    if model_rows.empty:
        return "V_1"

    # Use the row of the requested model, not whichever model sits in row 0.
    raw_versions = model_rows["versions"].iloc[0]
    if isinstance(raw_versions, str):
        versions = ast.literal_eval(raw_versions)
    else:
        versions = list(raw_versions)

    # Compare the numeric suffixes as integers: a plain string sort places
    # "V_10" before "V_2" and would hand out an already-used version name.
    numbered = []
    for version in versions:
        prefix, separator, number = str(version).rpartition("_")
        if separator and number.isdecimal():
            numbered.append((int(number), prefix))

    if not numbered:
        return "V_1"

    highest, prefix = max(numbered)
    return f"{prefix}_{highest + 1}"

In [None]:
# Sample input for the registry logging call: up to 100 training rows without
# the CHURNVALUE label column.
X = train_df.drop("CHURNVALUE").limit(100)

# Create a registry and log the model
# You can specify a different DB and Schema if you'd like
# otherwise it uses the session context
reg = Registry(session=session)

reg_df = reg.show_models()

# Define model name and version
model_name = "CHURN_MODEL"

# Pick the next version name based on what the registry already lists.
model_version = check_and_update(reg_df, model_name)

churn_model = reg.log_model(
    model_name=model_name,
    version_name=model_version,
    model=optimal_model,
    sample_input_data=X,
)

# Add evaluation metric.
# hp_df is sorted by accuracy (descending) but keeps its original row labels,
# so hp_df["accuracy"][0] would log the score of grid combination #0 instead
# of the best score. Positional access returns the top row.
churn_model.set_metric(
    metric_name="accuracy",
    value=hp_df["accuracy"].iloc[0],
)


In [None]:
# List the models in the registry to confirm the new entry is there.
reg.show_models()

In [None]:
# Record the estimator's explicitly set hyperparameters as a metric on the
# logged model version. Only unset (None) values are skipped: a truthiness
# test would also drop legitimate settings such as 0, 0.0 or False.
# "missing" is excluded by name — presumably because its value cannot be
# stored with the metric (confirm).
hyperparameters = {
    name: value
    for name, value in optimal_model.get_params().items()
    if value is not None and name != "missing"
}
churn_model.set_metric(metric_name="hyperparameters", value=hyperparameters)

In [None]:
# Allow up to 500 characters per cell when pandas renders a frame, then list
# the versions of the model.
pd.options.display.max_colwidth = 500
reg.get_model(model_name).show_versions()

In [None]:
# Rank the model's versions by the "accuracy" metric stored in their metadata.
reg_df = reg.get_model(model_name).show_versions()
# A version logged without an accuracy metric (or without any metrics) gets
# NaN instead of raising KeyError; to_numeric keeps the column numeric so the
# sort places such versions last.
reg_df["accuracy"] = pd.to_numeric(
    reg_df["metadata"].apply(
        lambda meta: (json.loads(meta).get("metrics") or {}).get("accuracy")
    ),
    errors="coerce",
)
best_model = reg_df.sort_values(by="accuracy", ascending=False)

In [None]:
# best_model is sorted by accuracy (descending), so its first row is the
# highest-accuracy version; take that row's name and display it.
deployed_version = best_model["name"].iloc[0]
deployed_version

In [None]:
# Make the selected version the model's default, read the default back and
# display its version name.
m = reg.get_model(model_name)
m.default = deployed_version
mv = m.default
mv.version_name

In [None]:
# Call the default version's predict_proba function on the test split and
# show the result.
remote_prediction = mv.run(test_df, function_name="predict_proba")
remote_prediction.show()

In [None]:
# Write the test split to the TEST_DATA table (overwrite mode) so the model
# can also be tested from SQL.

test_df.write.mode("overwrite").save_as_table("TEST_DATA")

## Add images to stage for Streamlit App

In [None]:
#session.file.put("../streamlit_images/*", "@ML_DATA")

## Calling model from a new notebook

In [None]:
# Open the registry through the current session.
reg = Registry(session=session)

# Fetch the default version of CHURN_MODEL (the best-accuracy version in our case).
churn_model_entry = reg.get_model("CHURN_MODEL")
mv = churn_model_entry.default

# Score the test split, drop the first output column and rename the second
# to pred_churnvalue before displaying.
# NOTE(review): presumably output_feature_1 is the churn-class probability — confirm.
remote_prediction = mv.run(test_df, function_name="predict_proba")
without_first_output = remote_prediction.drop('"output_feature_0"')
renamed_prediction = without_first_output.with_column_renamed(
    '"output_feature_1"', "pred_churnvalue"
)
renamed_prediction.show()

## To delete your model and all of its versions

In [None]:
# Delete the CHURN_MODEL entry from the registry.
# NOTE(review): per the section heading this removes every version — run deliberately.
reg.delete_model("CHURN_MODEL")

In [None]:
# Close the Snowpark session.
session.close()