In [1]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import pickle
from dateutil.relativedelta import relativedelta
import gc
from re import finditer

## Find drawdowns (DD) and draw-ups (DU)
from helper import preprocess_data
from patterns import find_drawdowns, find_drawups

## MATCHING
import name_matching
from name_matching import name_match
import transaction_matching
from transaction_matching import transaction_match

## CONSOLIDATION
from consolidation import combine_matches, consolidate_matches, find_attritions, find_new_accounts, get_attrition_status, get_new_account_status


In [2]:
# Fetch the custom variables once and reuse the resulting dict for every
# lookup below, instead of calling dataiku.get_custom_variables() per variable.
# All values are read from the same snapshot.
custom_variables = dataiku.get_custom_variables()

# Analysis window bounds (kept as-is; parsed with pd.to_datetime where used).
start_date = custom_variables['start_date']
end_date = custom_variables['end_date']

## DRAWDOWN / DRAW-UP VARIABLES (all cast to int)
consistency = int(custom_variables['consistency'])
drawdown_period_average = int(custom_variables['drawdown_period_average'])
drawdown = int(custom_variables['drawdown'])
drawdown_fwd_check = int(custom_variables['drawdown_fwd_check'])
drawdown_lookback_period = int(custom_variables['drawdown_lookback_period'])
drawup_lookfwd_period = int(custom_variables['drawup_lookfwd_period'])
statistics_period = int(custom_variables['statistics_period'])
inactive_period = int(custom_variables['inactive_period'])

## MATCHING VARIABLES
month_diff_h = int(custom_variables['month_diff_h'])
month_diff_l = int(custom_variables['month_diff_l'])
# NOTE(review): sd_mul is cast with int(); if it is meant to allow a
# fractional multiplier this cast would reject or truncate it - confirm.
sd_mul = int(custom_variables['sd_mul'])
max_city_distance = int(custom_variables['max_city_distance'])
threshold_score_step1 = int(custom_variables['threshold_score_step1'])
threshold_score_step2 = int(custom_variables['threshold_score_step2'])

## RUN TYPE
run = custom_variables['run_type']

In [5]:
# Read recipe inputs: load the revenue-by-customer dataset into a DataFrame
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED = dataiku.Dataset("NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED")
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED.get_dataframe()

# Sanity check: total row count, then number of distinct CUSTOMER values
print(
    len(NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df),
    len(NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df['CUSTOMER'].unique()),
    sep="\n",
)
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df.head()

21965
692


Unnamed: 0,CUSTOMER_ACCOUNT_ID,CUSTOMER_ACCOUNT_NAME,CUSTOMER,ACCOUNTSTATUS,SETUP_DATE,ACCOUNT_CLOSED_DATE,REVENUE_YEAR,REVENUE_QUARTER,REVENUE_MONTH,ACCOUNT_CITY,ACCOUNT_STATE,CUSTOMER_SOURCE_SYSTEM_CODE,CUSTOMER_BUSINESS_PROGRAM_NAME,BI_CUS_PAR,BI_PRODUCT,BI_MRU,REVENUE_AMOUNT_USD,GROSS_SPEND_AMOUNT,PURCHASE_GALLONS_QTY,ACTIVE_CARD_COUNT,OUTSTANDING_CARD_COUNT
0,469005700349,GENERAL MILLS (3GMS),GENERAL MILLS,Terminated,2015-07-09 07:00:00+00:00,2021-08-17 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,4867.218696,235717.32,99603.129,1398.0,1440.0
1,469009225020,WHEELS 996,WHEELS 996,Terminated,2014-05-06 07:00:00+00:00,2021-01-14 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,127.086359,5513.53,2376.902,39.0,45.0
2,469005700612,JONES LANG LASALLE (0469),JONES LANG LASALLE (0469),Terminated,2015-12-30 08:00:00+00:00,2022-02-07 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,234.595982,22961.95,8789.132,63.0,64.0
3,469005131156,HILL MECHANICAL,HILL MECHANICAL,Terminated,2003-03-06 08:00:00+00:00,2021-04-01 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,2.0,0.0,0.0,0.0,1.0
4,469005701479,JOHNSON & JOHNSON SERVICES (3),JOHNSON JOHNSON,Terminated,2017-12-18 08:00:00+00:00,2021-10-21 00:00:00.000,2019,1,1,Des Plaines,IL,TANDEM,Wheels old,33200,10200,3100,13549.952117,687391.16,293662.118,3745.0,4155.0


In [6]:
# Working handle on the input frame. This is a plain assignment (no copy),
# so both names refer to the same DataFrame object at this point.
df_v = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df
print(df_v.shape[0])

21965


In [7]:
def date_tz_naive(pd_s):
    """
    Parse ``pd_s`` as datetimes and strip any timezone information.

    The wall-clock values are kept as parsed; only the tz attribute is
    removed (``tz_localize(None)``), no conversion to another zone is done.

    Parameters
    ----------
    pd_s : Series, list-like or scalar
        Anything accepted by ``pd.to_datetime``.

    Returns
    -------
    Series, DatetimeIndex or Timestamp
        Timezone-naive datetimes; the container type is whatever
        ``pd.to_datetime`` returns for the given input.
    """
    parsed = pd.to_datetime(pd_s)

    if isinstance(parsed, pd.Series):
        if pd.api.types.is_object_dtype(parsed.dtype):
            # Not a datetime64 column (no .dt accessor available):
            # fall back to stripping the timezone element by element.
            return parsed.apply(lambda ts: ts.tz_localize(None))
        # Vectorised path: works for both tz-aware and tz-naive columns.
        return parsed.dt.tz_localize(None)

    # DatetimeIndex / Timestamp expose tz_localize directly.
    return parsed.tz_localize(None)

In [8]:
print(len(df_v))
# Build a first-of-month date per row from REVENUE_MONTH / REVENUE_YEAR
# ("<month>/01/<year>" text), then parse it into timezone-naive datetimes.
month_text = df_v['REVENUE_MONTH'].astype(str)
year_text = df_v['REVENUE_YEAR'].astype(str)
df_v['REVENUE_DATE'] = date_tz_naive(month_text + "/01/" + year_text)
print(len(df_v))

21965
21965


In [9]:
print(len(df_v))
# Keep rows whose REVENUE_DATE lies between start_date and end_date
# (Series.between includes both bounds by default), then drop rows
# that have no CUSTOMER value.
window_start = pd.to_datetime(start_date)
window_end = pd.to_datetime(end_date)
in_window = df_v['REVENUE_DATE'].between(window_start, window_end)
df_v = df_v[in_window].copy().dropna(subset=['CUSTOMER'])
print(len(df_v))

21965
21434


In [10]:
# Coerce REVENUE_DATE to datetime (presumably already datetime after the
# parsing step that created the column; this acts as a safeguard).
df_v['REVENUE_DATE'] = pd.to_datetime(df_v.REVENUE_DATE)

In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from helper import *

def split_list(lst, n):
    '''
    Partition ``lst`` into ``n`` consecutive chunks of near-equal size.

    Chunk lengths differ by at most one; the first ``len(lst) % n`` chunks
    receive the extra element. When ``n`` exceeds ``len(lst)`` the trailing
    chunks are empty.
    '''
    base_size, remainder = divmod(len(lst), n)
    chunks = []
    start = 0
    for position in range(n):
        # The leading `remainder` chunks are one element longer.
        stop = start + base_size + (1 if position < remainder else 0)
        chunks.append(lst[start:stop])
        start = stop
    return chunks



In [12]:
# Parameters for the draw-up search in the cells below.
match_type = "program_flip"
# Lower bound of the analysis period (rows before it are filtered out later).
period_start_date = start_date
# None means no upper-bound filter is applied on REVENUE_DATE.
period_end_date = None
drawup_window = drawup_lookfwd_period
# statistics_period is already set in the configuration cell; the former
# `statistics_period = statistics_period` self-assignment was a no-op and
# has been removed.
# Number of customer batches; None is treated as a single batch downstream.
split = None

In [14]:
# Restrict the data to the analysis period. The lower bound always applies.
period_start_date = pd.to_datetime(period_start_date)
df_v = df_v[df_v['REVENUE_DATE'] >= period_start_date].copy()

# Optional inclusive upper bound, only applied when period_end_date is set.
if period_end_date:
    period_end_date = pd.to_datetime(period_end_date)
    # Fixed: the column is 'REVENUE_DATE'; the previous lowercase
    # 'revenue_date' lookup raised KeyError whenever an end date was given.
    df_v = df_v[df_v['REVENUE_DATE'] <= period_end_date].copy()

# Distinct customers remaining after the period filter.
all_account_ids = list(df_v['CUSTOMER'].unique())

# A missing/zero split means "process everything as one batch".
if not split:
    split = 1

# split_list already returns a list of sub-lists, one per batch.
all_account_ids_n = split_list(all_account_ids, split)

# Empty accumulator frame (not filled in by this cell).
rise_df = pd.DataFrame()

In [20]:
for sublist in tqdm(all_account_ids_n):

    ## Rows of this batch of customers with at least one active card,
    ## ordered by revenue date
    batch_rows = df_v[df_v['CUSTOMER'].isin(sublist)].copy()
    du_find = batch_rows[batch_rows['ACTIVE_CARD_COUNT'] > 0]
    du_find = du_find.sort_values(['REVENUE_DATE'])

    ## Earliest active REVENUE_DATE per customer; a customer is flagged as a
    ## draw-up when that date is strictly after the period start. DU_DATE is
    ## that earliest date shifted back by one month.
    du_agg = (
        du_find.groupby(['CUSTOMER'], as_index=False)[['REVENUE_DATE']].min()
        .assign(DU_INDICATOR=lambda frame: frame['REVENUE_DATE'] > period_start_date)
        .rename(columns={'REVENUE_DATE': 'DU_DATE'})
        .assign(DU_DATE=lambda frame: frame['DU_DATE'] - pd.DateOffset(months=1))
    )
    du_agg = du_agg[du_agg['DU_INDICATOR']].drop_duplicates(['CUSTOMER'])

    ## list of customers who are drawing up
    du_customers = du_agg['CUSTOMER'].tolist()

    ## Nothing to do for this batch when no customer is drawing up
    if not du_customers:
        continue

100%|██████████| 1/1 [00:00<00:00, 41.78it/s]


In [0]:
#CALCULATED_CARD_DRAW_UPS_df = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df # For this sample code, simply copy input to output

# Write recipe outputs
#CALCULATED_CARD_DRAW_UPS = dataiku.Dataset("CALCULATED_CARD_DRAW_UPS")
#CALCULATED_CARD_DRAW_UPS.write_with_schema(CALCULATED_CARD_DRAW_UPS_df)