In [49]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import pickle
from dateutil.relativedelta import relativedelta
import gc
from re import finditer

## Find DD DU
from helper import preprocess_data
from patterns import find_drawdowns, find_drawups

## MATCHING
import name_matching
from name_matching import name_match
import transaction_matching
from transaction_matching import transaction_match

## CONSOLIDATION
from consolidation import combine_matches, consolidate_matches, find_attritions, find_new_accounts, get_attrition_status, get_new_account_status

In [50]:
start_date = dataiku.get_custom_variables()['start_date']
end_date = dataiku.get_custom_variables()['end_date']

consistency = int(dataiku.get_custom_variables()['consistency'])
drawdown_period_average = int(dataiku.get_custom_variables()['drawdown_period_average'])
drawdown = int(dataiku.get_custom_variables()['drawdown'])
drawdown_fwd_check = int(dataiku.get_custom_variables()['drawdown_fwd_check'])
drawdown_lookback_period = int(dataiku.get_custom_variables()['drawdown_lookback_period'])
drawup_lookfwd_period = int(dataiku.get_custom_variables()['drawup_lookfwd_period'])
statistics_period = int(dataiku.get_custom_variables()['statistics_period'])
inactive_period = int(dataiku.get_custom_variables()['inactive_period'])

## MATCHING VARIABLES
month_diff_h = int(dataiku.get_custom_variables()['month_diff_h'])
month_diff_l = int(dataiku.get_custom_variables()['month_diff_l'])
sd_mul = int(dataiku.get_custom_variables()['sd_mul'])
max_city_distance = int(dataiku.get_custom_variables()['max_city_distance'])
threshold_score_step1 = int(dataiku.get_custom_variables()['threshold_score_step1'])
threshold_score_step2 = int(dataiku.get_custom_variables()['threshold_score_step2'])

## RUN TYPE
run = dataiku.get_custom_variables()['run_type']

In [51]:
def date_tz_naive(pd_s):
    return pd.to_datetime(pd_s).apply(lambda x:x.tz_localize(None))

In [52]:
# Read recipe inputs
NAFCUSTOMER_C360_ACCOUNTS = dataiku.Dataset("NAFCUSTOMER_C360_ACCOUNTS")
NAFCUSTOMER_C360_ACCOUNTS_df = NAFCUSTOMER_C360_ACCOUNTS.get_dataframe()
print(len(NAFCUSTOMER_C360_ACCOUNTS_df))

NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED = dataiku.Dataset("NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED")
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED.get_dataframe()
print(len(NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df))

844417
21434


In [53]:
df_v = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df

print(len(df_v))
df_v['REVENUE_DATE'] = df_v.REVENUE_MONTH.astype(str) + "/01/" + df_v.REVENUE_YEAR.astype(str)
df_v['REVENUE_DATE'] = date_tz_naive(df_v['REVENUE_DATE'])
print(len(df_v))
df_v.head()

21434
21434


Unnamed: 0,CUSTOMER_ACCOUNT_ID,CUSTOMER_ACCOUNT_NAME,CUSTOMER,ACCOUNTSTATUS,SETUP_DATE,ACCOUNT_CLOSED_DATE,REVENUE_YEAR,REVENUE_QUARTER,REVENUE_MONTH,ACCOUNT_CITY,ACCOUNT_STATE,CUSTOMER_SOURCE_SYSTEM_CODE,CUSTOMER_BUSINESS_PROGRAM_NAME,BI_CUS_PAR,BI_PRODUCT,BI_MRU,REVENUE_AMOUNT_USD,GROSS_SPEND_AMOUNT,PURCHASE_GALLONS_QTY,ACTIVE_CARD_COUNT,REVENUE_DATE
0,469007639032,SIEMENS BLDG TECH (2),SIEMENS MEDICAL,Active,2007-12-07 08:00:00+00:00,9999-12-31 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,408.59252,18391.35,7602.992,119.0,2019-01-01
1,469005701255,CALIFORNIA WATERS,CALIFORNIAWATERS,Active,2017-08-24 07:00:00+00:00,9999-12-31 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,2145.328235,114502.95,32432.799,446.0,2019-01-01
2,469005701172,HARTUNG BROTHERS,HARTUNG BROTHERS,Terminated,2017-07-03 07:00:00+00:00,2022-07-12 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,199.400141,2145.55,616.969,11.0,2019-01-01
3,469005700208,CONSTELLATION BRANDS (3CRW),CONSTELLATION BRANDS (3CRW),Terminated,2015-02-18 08:00:00+00:00,2021-03-10 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,1139.458388,55804.33,21944.178,344.0,2019-01-01
4,469008059644,BALFOUR BEATTY EQUIPMENT ONLY,BALFOUR EQUIP,Active,2009-08-20 07:00:00+00:00,9999-12-31 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,701.569042,49047.39,18068.041,110.0,2019-01-01


In [54]:
print(len(df_v))
df_v = df_v[df_v['REVENUE_DATE'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))].copy()
df_v = df_v.dropna(subset=['CUSTOMER_ACCOUNT_ID'])
df_v = df_v[df_v['CUSTOMER_SOURCE_SYSTEM_CODE'].isin(['TANDEM', 'SIEBEL'])]
print(len(df_v))

21434
21434


In [55]:
df_v['CUSTOMER_ACCOUNT_ID'] = df_v['CUSTOMER_ACCOUNT_ID'].astype('int64')
df_v['REVENUE_DATE'] = pd.to_datetime(df_v['REVENUE_DATE'])

In [56]:
states = list(df_v['ACCOUNT_STATE'].unique())
states_dict = {s:s.upper() for s in states}

In [57]:
df_v['ACCOUNT_STATE'] = df_v['ACCOUNT_STATE'].map(states_dict)

In [58]:
## remove the unneccesary columns
remove_cols=['REVENUE_MONTH','REVENUE_YEAR', 'REVENUE_QUARTER']
df_v = df_v.drop([x for x in remove_cols if x in df_v.columns], axis=1)

In [59]:
df_v.sort_values(['REVENUE_DATE'], inplace=True)

In [60]:
seen_accounts = df_v[df_v['PURCHASE_GALLONS_QTY'] > 0].groupby(['CUSTOMER_ACCOUNT_ID'], as_index=False)[['REVENUE_DATE']].first()
seen_accounts['FIRST_DATE'] = seen_accounts['REVENUE_DATE'] - pd.DateOffset(months=1)

In [61]:
df_v.REVENUE_DATE.value_counts(dropna=False)
print(len(df_v))

21434


In [62]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from helper import *

#---------------------
# input vars
df = df_v
period_end_date = end_date
match_type = 'program_flip'
period_start_date=None
split=None
#------------------------

drawdown = (100 - drawdown)/100
drawdown_fwd_check /= 100

inactive_date_start = pd.to_datetime(period_end_date) + relativedelta(months=-inactive_period)

if match_type == 'conversion':
    df = df[df['CUSTOMER_SOURCE_SYSTEM_CODE'] == 'TANDEM'].copy()

df = df[df['REVENUE_DATE'] <= period_end_date].copy()

if period_start_date:
    period_start_date = pd.to_datetime(period_start_date)
    df = df[df['REVENUE_DATE'] >= period_start_date].copy()

all_account_ids = list(df['CUSTOMER_ACCOUNT_ID'].unique())

if not split:
    split=1

all_account_ids_n = list(split_list(all_account_ids, split))

drop_df = pd.DataFrame()

for sublist in tqdm(all_account_ids_n):

    dd_find = df[df['CUSTOMER_ACCOUNT_ID'].isin(sublist)].copy()

    ## Find consistent customers
    consistent_customers_dd = find_consistent_customers(dd_find, consecutive=consistency)
    if len(consistent_customers_dd) == 0:
        continue

  0%|          | 0/1 [00:00<?, ?it/s]


KeyError: "None of [Index(['customer_account_id', 'revenue_date', 'purchase_gallons_qty'], dtype='object')] are in the [columns]"

In [0]:
# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.



#CALCULATED_DRAW_DOWNS_df = NAFCUSTOMER_C360_ACCOUNTS_df # For this sample code, simply copy input to output


# Write recipe outputs
#CALCULATED_DRAW_DOWNS = dataiku.Dataset("CALCULATED_DRAW_DOWNS")
#CALCULATED_DRAW_DOWNS.write_with_schema(CALCULATED_DRAW_DOWNS_df)