In [2]:
# This is the notebook version of build-churn-model.py. It is intended for use within the Skafos for Creators framework. 

In [1]:
from skafossdk import *
import logging
import random
import pickle
import datetime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

from common.data import *
from common.modeling import *

In [3]:
# Initialize the Skafos SDK object. The rest of the notebook uses it for
# logging (ska.log) and for persisting the trained model (ska.engine.save_model).
# NOTE(review): the recorded output of this cell shows a DataEngine connection
# being opened, so construction presumably requires Skafos connectivity — confirm.
ska = Skafos()

2018-12-31 17:30:57,902 - skafossdk.data_engine - INFO - Connecting to DataEngine
2018-12-31 17:30:58,029 - skafossdk.data_engine - INFO - DataEngine Connection Opened


In [5]:
# Model inputs are the features selected in the modeling.py file.
features = MODEL_INPUT_FEATURES
ska.log(f"List of model input: {features}", labels=["features"], level=logging.INFO)

# Columns to request from the data source, in order: ID, features, label.
# Built as a fresh list so MODEL_INPUT_FEATURES itself is never mutated.
csvCols = [UNIQUE_ID, *features, TARGET_VARIABLE]


2018-12-31 17:34:22,087 - skafossdk.skafos - INFO - [features] - List of model input: ['internet_service', 'monthly_charges', 'tenure', 'dependents', 'total_charges']


In [6]:
# Load the "training" dataset, requesting only the columns listed in csvCols
# (ID, model features, target). Per the original note, the data comes from a
# public S3 bucket.
# NOTE(review): get_data is not defined in this notebook; it presumably comes
# from one of the `common` wildcard imports — confirm its return type is a
# pandas DataFrame, as the following cells index it by column.
df = get_data(csvCols, "training")

In [7]:
# Design matrix: convert the categorical model inputs to dummy variables.
feature_frame = df[features]
xVars = dummify_columns(feature_frame, features)

# Binary target: 1 where the label equals "Yes", 0 for every other value.
yVar = (df[TARGET_VARIABLE] == "Yes").astype("int64")

# Train/test split with a fixed seed so the partition is reproducible.
# No test_size is given, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    xVars,
    yVar,
    random_state=10,
)

In [10]:
# Fit a logistic regression classifier on the training split.
lr = LogisticRegression(solver='liblinear', C=1.0)
lr.fit(X_train, y_train)

# scikit-learn's fit() returns the estimator itself, so fittedModel is the
# same (now trained) object as lr; later cells refer to it by this name.
fittedModel = lr

In [12]:
# Evaluate the fitted model on the held-out test split (X_test / y_test):
# compare predictions to actual values and compute accuracy and ROC AUC.
y_preds = fittedModel.predict(X_test)

# predict_proba yields one row of class probabilities per sample; index 1 is
# the second column, which for the 0/1 target built earlier is the score for
# label 1.
y_scores = [p[1] for p in fittedModel.predict_proba(X_test)]

model_accuracy = accuracy_score(y_test, y_preds)
model_auc = roc_auc_score(y_test, y_scores)

# These numbers come from the test split, not the training data, so the log
# messages say "Test" (they previously mislabeled the metrics as "Training").
ska.log(f"Test accuracy: {model_accuracy}", labels=["Metrics"], level=logging.INFO)
ska.log(f"Test ROC_AUC: {model_auc}", labels=["Metrics"], level=logging.INFO)

2018-12-31 17:42:59,023 - skafossdk.skafos - INFO - [Metrics] - Training accuracy: 0.799115603284902
2018-12-31 17:42:59,025 - skafossdk.skafos - INFO - [Metrics] - Training ROC_AUC: 0.8173776670261961


In [15]:
# Serialize the fitted model and save it to Cassandra using the Skafos Data
# Engine, tagged with the model type and "latest".
pickledModel = pickle.dumps(fittedModel)
model_tags = [MODEL_TYPE, "latest"]
save_request = ska.engine.save_model(MODEL_TYPE, pickledModel, tags=model_tags)

# NOTE(review): .result() presumably blocks until the save completes and
# returns the engine's response — confirm against the Skafos SDK.
saved_model = save_request.result()
ska.log(f"Model saved to Cassandra: {saved_model} \n", labels=["model saving"], level=logging.INFO)

2018-12-31 17:45:49,153 - skafossdk.skafos - INFO - [modelsaving] - Model saved to Cassandra: {'data': {'name': 'logisticregression', 'version': '1546278349132', 'tags': ['LogisticRegression', 'latest'], 'deployment_id': 'faa9eb05-67de-40cf-a836-c99338c5fee7', 'job_id': '', 'project_token': '91633e3d419e23dc7a2da419', 'inserted_at': '2018-12-31T17:45:49Z'}, 'success': True, 'final': True} 

