In [9]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import time

import pickle
from dateutil.relativedelta import relativedelta
import gc
from re import finditer

## Find DD DU
from helper import preprocess_data
from patterns import find_drawdowns, find_drawups

## MATCHING
import name_matching
from name_matching import name_match
import transaction_matching
from transaction_matching import transaction_match

## CONSOLIDATION
from consolidation import combine_matches, consolidate_matches, find_attritions, find_new_accounts, get_attrition_status, get_new_account_status


In [10]:
# All tunables for this notebook come from the Dataiku custom variables.
# Fetch the dictionary once and index it, rather than calling
# dataiku.get_custom_variables() again for every single key.
custom_vars = dataiku.get_custom_variables()

## ANALYSIS WINDOW (kept exactly as stored in the variables, no conversion)
start_date = custom_vars['start_date']
end_date = custom_vars['end_date']

## DRAWDOWN / DRAWUP VARIABLES (converted to int)
consistency = int(custom_vars['consistency'])
drawdown_period_average = int(custom_vars['drawdown_period_average'])
drawdown = int(custom_vars['drawdown'])
drawdown_fwd_check = int(custom_vars['drawdown_fwd_check'])
drawdown_lookback_period = int(custom_vars['drawdown_lookback_period'])
drawup_lookfwd_period = int(custom_vars['drawup_lookfwd_period'])
statistics_period = int(custom_vars['statistics_period'])
inactive_period = int(custom_vars['inactive_period'])

## MATCHING VARIABLES
month_diff_h = int(custom_vars['month_diff_h'])
month_diff_l = int(custom_vars['month_diff_l'])
sd_mul = int(custom_vars['sd_mul'])
max_city_distance = int(custom_vars['max_city_distance'])
# NOTE(review): int() fails on a value such as "0.8" -- confirm the thresholds are whole numbers.
threshold_score_step1 = int(custom_vars['threshold_score_step1'])
threshold_score_step2 = int(custom_vars['threshold_score_step2'])

## RUN TYPE
run = custom_vars['run_type']

print("start_date", start_date)
print("end_date", end_date)

start_date 2019-01-01
end_date 2022-10-01


In [11]:
# Read recipe inputs
NAFCUSTOMER_ACTIVE_CARDS_FULL = dataiku.Dataset("NAFCUSTOMER_ACTIVE_CARDS_FULL")
NAFCUSTOMER_ACTIVE_CARDS_FULL_df = NAFCUSTOMER_ACTIVE_CARDS_FULL.get_dataframe()
print(len(NAFCUSTOMER_ACTIVE_CARDS_FULL_df))
NAFCUSTOMER_ACTIVE_CARDS_FULL_df.head()

16505529


Unnamed: 0,CUSTOMER,REVENUE_YEAR,REVENUE_MONTH,ACTIVE_CARD_COUNT
0,ENCORE DERMATOLOGY INC,2019,1,32.0
1,BRADANICK CONSTRUCTION SERVICES INC,2019,1,3.0
2,PEREZ INDUSTRIES INC,2019,1,2.0
3,A S CAR CLEAN LLC,2019,1,0.0
4,M SULLIVAN AND SON LIMITED,2019,1,20.0


In [12]:
def date_tz_naive(pd_s):
    """Parse a Series of date-like values and strip any timezone information.

    Parameters
    ----------
    pd_s : pandas.Series
        Values that ``pd.to_datetime`` can parse (date strings, datetimes, ...).

    Returns
    -------
    pandas.Series
        The parsed datetimes with no timezone attached.
    """
    # Use the vectorised .dt accessor instead of a per-element apply():
    # same result for a Series, without one Python call per row.
    return pd.to_datetime(pd_s).dt.tz_localize(None)

In [13]:
# Distinct CUSTOMER values from the full dataset; this is the list that gets paged through.
customer_list_full = NAFCUSTOMER_ACTIVE_CARDS_FULL_df["CUSTOMER"].unique()
print(len(customer_list_full))

624069


In [14]:
# Walk through the customers in fixed-size pages so that each iteration only
# handles the rows belonging to one batch of customers.
page_size = 10000
idx = 0
current_page = 0
max_pages = 5  # stop after this many pages; 0 or less means process every page

drop_df = pd.DataFrame()
t0 = time.time()

# The customer count does not change while looping, so compute it once.
total_customers = len(customer_list_full)
total_pages = total_customers/page_size

while idx<total_customers:

    current_page+=1
    print("page", current_page)

    # The slice end is exclusive, so clamp to the full length (not length-1):
    # otherwise the last customer is silently dropped from the final page.
    to_range = min(idx+page_size, total_customers)
    current_set = customer_list_full[idx:to_range]

    #==============================================

    # .copy() gives an independent frame, so adding REVENUE_DATE below is not
    # a chained assignment on a slice of the full dataset.
    df_v = NAFCUSTOMER_ACTIVE_CARDS_FULL_df[NAFCUSTOMER_ACTIVE_CARDS_FULL_df.CUSTOMER.isin(current_set)].copy()
    print("processing", len(df_v.CUSTOMER.unique()), "customers")
    print(len(df_v), "data frame records")

    # Build "<month>/01/<year>" strings and parse them into tz-naive datetimes
    # (first day of each revenue month).
    df_v['REVENUE_DATE'] = date_tz_naive(
        df_v.REVENUE_MONTH.astype(str) + "/01/" + df_v.REVENUE_YEAR.astype(str)
    )

    #==============================================

    # Progress report: average minutes per page so far, projected over the
    # pages that are still to come.
    pages_remaining = total_pages-current_page

    t1 = time.time()
    avg_duration = (((t1-t0)/current_page)/60.0)
    print(round(avg_duration,2), "avg mins per iteration")
    print(round(pages_remaining,2), "pages remaining")
    print(round(avg_duration*pages_remaining,2), "estimated minutes remaining")
    print()

    #=====================================

    idx+=page_size

    # Optional cap on the number of pages (useful for quick trial runs).
    if max_pages>0 and current_page>=max_pages:
        break
    

page 1
0.0 avg mins per iteration
61.41 pages remaining
0.0 estimated minutes remaining

page 2
0.0 avg mins per iteration
60.41 pages remaining
0.0 estimated minutes remaining

page 3
0.0 avg mins per iteration
59.41 pages remaining
0.0 estimated minutes remaining

page 4
0.0 avg mins per iteration
58.41 pages remaining
0.0 estimated minutes remaining

page 5
0.0 avg mins per iteration
57.41 pages remaining
0.0 estimated minutes remaining



In [0]:
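# Write recipe outputs
# NOTE: the write is currently disabled. CALCULATED_DRAW_UPS_df is not defined in
# any of the cells shown above, so build it before re-enabling the two lines below.
#CALCULATED_DRAW_UPS = dataiku.Dataset("CALCULATED_DRAW_UPS")
#CALCULATED_DRAW_UPS.write_with_schema(CALCULATED_DRAW_UPS_df)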