In [None]:
!pip install -U pandasql
!pip install zeno_etl_libs_v3==1.0.1

In [None]:
"""
Author:shubham.gupta@zeno.health
Purpose: Churn prediction
"""
import json
import os
from datetime import datetime as dt
from pickle import load, dump
import argparse
import sys

In [None]:
sys.path.append('../../../..')

from zeno_etl_libs.helper.aws.s3 import S3
from zeno_etl_libs.db.db import DB
from zeno_etl_libs.helper import helper
from zeno_etl_libs.logger import get_logger
from zeno_etl_libs.helper.email.email import Email

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, plot_roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Run parameters for the notebook; every later cell reads these names.
env = "dev"  # exported to os.environ['env'] before the logger is created
email_to = ["shubham.gupta@zeno.health"]  # recipients of the completion email
re_train = 1  # truthy: retrain the model and then score; falsy: score only
features = []  # model feature columns for score-only runs ('patient-id' is prepended later)
schema = "prod2-generico"  # schema used for the table lookup and the final upload
prod_write = 1  # forwarded as DB(read_only=prod_write) when the write connection is built

In [None]:
# params ( retrain or just predict from trained model )
# Retraining pulls one extra month of bills: dataframe_creator holds the most recent
# month back as the churn-label window, leaving six months for the features.
run_date = dt.today().date()  # read the clock once so start/end cannot straddle midnight
lookback_months = 7 if re_train else 6
end = str(run_date)
start = str(run_date - relativedelta(months=lookback_months))

# Score-only runs need the patient key next to the model features. The guard keeps
# this cell idempotent: re-running it must not prepend 'patient-id' a second time.
if not re_train and 'patient-id' not in features:
    features = ['patient-id'] + list(features)

In [None]:
# Publish the target environment through the process environment before the
# logger is created, then record which environment this run uses.
os.environ.update({'env': env})
logger = get_logger()
logger.info("env: {}".format(env))

In [None]:
#table_name = 'customer-behaviour-segment'

# Connection used for reads (the bill-level query is run through rs_db.get_df).
rs_db = DB()
rs_db.open_connection()

# Target table for the churn predictions.
table_name = 'consumer-churn'
# NOTE(review): prod_write (1 by default) is passed as `read_only`. If DB treats a truthy
# read_only as "no writes allowed", the DELETE and upload below cannot work — confirm
# against DB.__init__ whether this should be `not prod_write`.
rs_db_write = DB(read_only=prod_write)
rs_db_write.open_connection()

# NOTE(review): neither connection is closed anywhere in this notebook — confirm whether
# DB requires an explicit close once the upload is done.
s3 = S3(bucket_name='datascience-manager')

In [None]:
def seek():
    """Placeholder for the data-fetching step; does nothing and returns None."""
    return None
def run_fun(rs_db, s3):
    """Placeholder for the job logic; ignores both arguments and returns None."""
    return None

In [None]:
# Fetch the target table's metadata through the write connection and log it.
table_lookup = dict(db=rs_db_write, table_name=table_name, schema=schema)
table_info = helper.get_table_info(**table_lookup)
logger.info(table_info)

In [None]:
# Schema the source data is read from (used by the data-fetch query).
read_schema = 'prod2-generico'

if table_info is None:
    print(f"table: {table_name} do not exist")
else:
    # Remove rows already written today so re-running the notebook does not duplicate
    # them. The statement targets the same schema/table that was checked above and that
    # the upload writes to, rather than the read schema and a hard-coded table name.
    truncate_query = f"""
                    DELETE
                        FROM
                    "{schema}"."{table_name}"
                    WHERE
                        DATE("created-at") = '{dt.now().date()}';"""
    logger.info(truncate_query)
    rs_db_write.execute(truncate_query)

In [None]:
# data_fetching
# Pull bill-level rows from retention-master for the configured window.
# BETWEEN is inclusive, so bills dated exactly `start` or `end` are included.

data_q = f"""
        select
            rm."patient-id"  ,
            rm.id as "bill-id",
            rm."bill-date",
            rm."created-at" as "bill-created-at",
            rm."total-spend",
            rm."spend-generic", 
            rm."value-segment", 
            rm."system-age-days" as "system-age"
        from
            "{read_schema}"."retention-master" rm 
        where
            rm."bill-date" between '{start}' and '{end}';
        """

# Run the query on the read connection; the result is the input to dataframe_creator.
data = rs_db.get_df(data_q)

In [None]:
def dataframe_creator(bill_data, re_train, features=None):
    """Build one row of churn-model features per patient from bill-level data.

    Parameters
    ----------
    bill_data : pandas.DataFrame
        Bill-level rows with the columns 'patient-id', 'bill-id', 'bill-date',
        'bill-created-at', 'total-spend', 'spend-generic', 'value-segment' and
        'system-age'. The frame is copied and never modified.
    re_train : bool or int
        Truthy: the most recent month of bills is held out as the label window.
        Features are built from bills before that month, and a 'churn' column is
        added (0 if the patient has a bill inside the label window, else 1).
        Falsy: all bills before today are used and no label is produced.
    features : sequence of str, optional
        Columns to keep when ``re_train`` is falsy. Ignored when ``re_train`` is truthy.

    Returns
    -------
    pandas.DataFrame
        One row per patient; rows containing any missing value are dropped (this
        includes patients whose 'value-segment' is not one of the mapped labels).
    """
    import io  # local import: only needed to capture DataFrame.info() for the log

    feature_cols = [] if features is None else list(features)

    # Work on a copy of the argument. The previous version ignored `bill_data` and
    # converted the 'bill-date' column of the notebook-global `data` in place.
    bills = bill_data.copy()
    bills['bill-date'] = pd.to_datetime(bills['bill-date'])

    today = pd.Timestamp(dt.today().date())
    # First day that is NOT part of the feature window.
    cutoff_date = (today - pd.DateOffset(months=1)) if re_train else today

    df = bills.loc[bills['bill-date'] < cutoff_date,
                   ['patient-id', 'bill-id', 'bill-date', 'bill-created-at', 'total-spend',
                    'spend-generic', 'value-segment', 'system-age']].copy()

    # DataFrame.info() prints and returns None, so capture its text for the logger.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    logger.info(info_buffer.getvalue())

    # data type correction
    df['total-spend'] = df['total-spend'].astype(float)
    df['spend-generic'] = df['spend-generic'].astype(float)

    # number of bills, average bill value, total spend and generic share in the window
    spend = df.groupby('patient-id', as_index=False).agg(**{
        'nob': ('bill-id', 'nunique'),
        'abv': ('total-spend', 'mean'),
        'total-spend': ('total-spend', 'sum'),
        'generic-spend': ('spend-generic', 'sum'),
    })
    spend['generic-pc'] = spend['generic-spend'] / spend['total-spend']
    spend = spend.drop(columns=['generic-spend'])

    # attributes of each patient's most recently created bill
    latest = (df.sort_values('bill-created-at', ascending=False)
                .groupby('patient-id', as_index=False)
                .head(1)
                .copy())
    latest['value-segment'] = latest['value-segment'].map({'platinum': 4,
                                                           'gold': 3,
                                                           'silver': 2,
                                                           'others': 1})
    latest['recency'] = (cutoff_date - latest['bill-date']).dt.days
    latest = latest[['patient-id', 'recency', 'system-age', 'value-segment']]

    # days between consecutive bills of the same patient
    intervals = df[['bill-id', 'patient-id', 'bill-date']].sort_values('bill-date')
    previous_bill = intervals.groupby('patient-id')['bill-date'].shift(1)
    intervals['bill-diff'] = (intervals['bill-date'] - previous_bill).dt.days
    # A patient's first bill has no predecessor; it inherits the following gap.
    intervals['bill-diff'] = intervals.groupby('patient-id')['bill-diff'].bfill()
    intervals = intervals.groupby('patient-id', as_index=False).agg(**{
        'mean-purchase-interval': ('bill-diff', 'mean'),
        'std-purchase-interval': ('bill-diff', 'std'),
    })
    # Single-bill patients have no interval (NaN mean/std); treat that as 0.
    intervals = intervals.fillna(0)

    final_df = spend.merge(latest, on='patient-id', how='left')
    final_df = final_df.merge(intervals, on='patient-id', how='left')

    for column in final_df.select_dtypes(include=['float64']).columns:
        final_df[column] = final_df[column].round(4)

    if re_train:
        # Patients who billed inside the held-out month did not churn.
        retained = bills.loc[bills['bill-date'] >= cutoff_date, ['patient-id']].drop_duplicates()
        retained['churn'] = 0

        final_df = final_df.merge(retained, on='patient-id', how='left')
        final_df['churn'] = final_df['churn'].fillna(1)
        final_df = final_df.drop_duplicates(subset='patient-id')
    else:
        final_df = final_df.drop_duplicates(subset='patient-id')
        final_df = final_df[feature_cols]

    return final_df.dropna()

In [None]:
def find_optimal_cutoff(target, predicted):
    """Pick the probability threshold where the ROC curve balances TPR and TNR.

    Parameters
    ----------
    target : true labels, one per observation
    predicted : predicted scores/probabilities, one per observation

    Returns
    -------
    list
        Single-element list holding the threshold at which
        ``tpr - (1 - fpr)`` is closest to zero.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(target, predicted)
    positions = np.arange(len(true_pos_rate))

    # 'tf' is zero where sensitivity equals specificity.
    balance = pd.Series(true_pos_rate - (1 - false_pos_rate), index=positions)
    roc_table = pd.DataFrame({'tf': balance,
                              'threshold': pd.Series(cutoffs, index=positions)})

    closest_first = (roc_table.tf - 0).abs().argsort()
    best_row = roc_table.iloc[closest_first[:1]]
    return list(best_row['threshold'])

In [None]:
# Build the modelling frame. Branch on the truthiness of re_train, like every other
# cell does, so a truthy value other than 1 cannot send the cells down different paths.
if re_train:
    final_df = dataframe_creator(data, re_train=1, features=[])
else:
    final_df = dataframe_creator(data, re_train=0, features=features)

In [None]:
if re_train:
    # 1-d target: scikit-learn expects y of shape (n_samples,); a one-column DataFrame
    # triggers a column-vector conversion warning on every fit.
    y = final_df['churn']
    X = final_df.drop(columns=['churn', 'patient-id'])

    # train / validation / test split (64% / 16% / 20%), stratified on the label
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.20, random_state=0, stratify=y_train)

    # Baseline DecisionTree Model (seeded so a Restart & Run All reproduces the model)
    dtc = DecisionTreeClassifier(random_state=0)
    dtc.fit(X_train, y_train)
    dtc_report = classification_report(y_val, dtc.predict(X_val))
    logger.info('decision_tree baseline model classification report validation data : '
                '{}'.format(dtc_report))

    # candidate pruning strengths from the cost-complexity pruning path
    alphas = dtc.cost_complexity_pruning_path(X_train, y_train)['ccp_alphas']
    # choose the alpha with the best cross-validated ROC-AUC
    g_search = GridSearchCV(dtc,
                            param_grid={'ccp_alpha': sorted(set(alphas.round(6)))},
                            cv=5,
                            scoring='roc_auc',
                            n_jobs=-1,
                            verbose=0)
    g_search.fit(X_train, y_train)
    # refit=True (the default) has already refit the best tree on the full training
    # split, so reuse it instead of fitting the same model a second time.
    dtc = g_search.best_estimator_

    # bp = best params
    dtc_report_bp_train = classification_report(y_train, dtc.predict(X_train))
    dtc_report_bp_val = classification_report(y_val, dtc.predict(X_val))
    logger.info('decision_tree tuned model classification report train : '
                '{}'.format(dtc_report_bp_train))
    logger.info('decision_tree tuned model classification report validation : '
                '{}'.format(dtc_report_bp_val))

    # feature importances, largest first; column 1 is the running total
    ft_imp = pd.DataFrame(data=dtc.feature_importances_,
                          index=X_train.columns).sort_values(0, ascending=False)
    ft_imp[1] = ft_imp[0].cumsum()

    # feature selection: features covering <90% cumulative importance, but never
    # fewer than the five most important ones
    feat_selection = ft_imp[ft_imp[1] < 0.90]
    if len(feat_selection) <= 5:
        feat = ft_imp.index[:5]
    else:
        feat = feat_selection.index

    X_train = X_train[feat]
    X_test = X_test[feat]
    X_val = X_val[feat]
    X = X[feat]

    logger.info('feature selected : {}'.format(feat))

    # Forest depth candidates span half to full depth of the tuned tree. max_depth must
    # be a positive integer, so cast, clamp to >= 1 (a heavily pruned tree can have
    # depth 0 or 1) and drop the duplicates that rounding produces.
    tree_depth = dtc.get_depth()
    depth = sorted({max(1, int(d)) for d in np.linspace(tree_depth / 2, tree_depth, 5).round()})
    alpha = dtc.ccp_alpha

    # Create the parameter grid based on the results of best decision tree
    param_grid = {
        'bootstrap': [True],
        'max_depth': depth,
        'max_features': ["sqrt", "log2"],
        'ccp_alpha': [alpha],
        'n_estimators': [25, 50, 75, 100, 150, 200, 250]
    }

    # Grid-search the forest; the best estimator is refit on the full training split.
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                               param_grid=param_grid,
                               cv=5, n_jobs=-1, verbose=0, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    rf = grid_search.best_estimator_

    # classification report
    rf_report_bp_train = classification_report(y_train, rf.predict(X_train))
    rf_report_bp_val = classification_report(y_val, rf.predict(X_val))
    rf_report_bp_test = classification_report(y_test, rf.predict(X_test))
    logger.info('random_forest tuned model classification report train : '
                '{}'.format(rf_report_bp_train))
    logger.info('random_forest tuned model classification report validation : '
                '{}'.format(rf_report_bp_val))
    logger.info('random_forest tuned model classification report test : '
                '{}'.format(rf_report_bp_test))

    # NOTE(review): the cutoff is tuned on the data the forest was fit on, where its
    # probabilities are optimistic; consider deriving it from (X_val, y_val) instead.
    cutoff = find_optimal_cutoff(y_train, rf.predict_proba(X_train)[:, 1])[0]
    logger.info('optimal cutoff  value : {}'.format(round(cutoff, 3)))

    # TODO: ROC-curve plots and model persistence (pickling `rf` and uploading it to S3)
    # from the original job are not wired up in this notebook, so a score-only run
    # (re_train = 0) has no saved model or cutoff to load.

In [None]:
if re_train:
    # Re-build the features on the full window (no held-out month) for scoring. Keep
    # 'patient-id' alongside the model features: without it the predictions cannot be
    # tied back to a patient.
    model_features = list(feat)
    final_df = dataframe_creator(data, re_train=0, features=['patient-id'] + model_features)
else:
    # Score-only mode relies on a previously trained model and cutoff. Loading them from
    # S3 has not been implemented here, so fail with a clear message instead of a
    # NameError halfway through the cell.
    missing = [name for name in ('rf', 'cutoff') if name not in globals()]
    if missing:
        raise RuntimeError(
            'score-only run (re_train=0) needs a trained model and cutoff, but model '
            'loading is not implemented in this notebook; missing: {}'.format(missing))
    model_features = [column for column in final_df.columns if column != 'patient-id']

# Probability of the positive class (churn = 1), thresholded at the tuned cutoff.
final_df['churn-prob'] = rf.predict_proba(final_df[model_features])[:, 1]
final_df['churn-prediction'] = np.where(final_df['churn-prob'] >= cutoff, 1, 0)
final_df['created-at'] = dt.now().date()
final_df['re-trained'] = 1 if re_train else 0

In [None]:
# Write to csv: the file name carries the full current timestamp.
csv_name = f'consumer_churn_prediction_{dt.today()}.csv'
s3.save_df_to_s3(df=final_df, file_name=csv_name, index=False)

In [None]:
# data type correction
# Cast only the columns that are present: 'value-segment' exists only when it was
# selected as a model feature, and 'churn' is never produced by the scoring frame.
# NOTE(review): 'churn' may have been meant as 'churn-prediction' (already an integer
# column) — confirm against the consumer-churn table definition.
for int_column in ('churn', 'value-segment'):
    if int_column in final_df.columns:
        final_df[int_column] = final_df[int_column].astype(int)

In [None]:
# upload to db: send only the columns listed in the target table's metadata
upload_columns = table_info['column_name']
upload_frame = final_df[upload_columns]
s3.write_df_to_db(df=upload_frame, table_name=table_name, db=rs_db_write, schema=schema)

In [None]:
# Notify the configured recipients that the run reached the end. This cell only runs
# if every earlier cell succeeded.
email = Email()

subject = "Task status churn calculation"  # fixed typo: was "calcualtion"
mail_body = "Churn data upload succeeded"

file_uris = []  # no attachments

email.send_email_file(subject=subject,
                      mail_body=mail_body,
                      to_emails=email_to,
                      file_uris=file_uris,
                      file_paths=[])

In [None]:
# for action_dict in actions_list:
#     if action_dict['category'] == 'EML':
#         to_emails = action_dict['email_to']
#         subject = 'churn prediction algo status : {}'.format(status)
#         mail_body = 'Table fetch from  {} to {} '.format(start, end)
#         if job_data_params['re_train']:
#             file_paths = [output_dir_path + 'debug_{}.txt'.format(script_manager_obj.job_id),
#                           output_dir_path + 'roc_curve_Train.png',
#                           output_dir_path + 'roc_curve_Val.png',
#                           output_dir_path + 'roc_curve_Test.png']
#         else:
#             file_paths = [output_dir_path + 'debug_{}.txt'.format(script_manager_obj.job_id)]
#         send_email_file(subject, mail_body, to_emails, file_paths)