In [2]:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import pickle
from dateutil.relativedelta import relativedelta
import gc
from re import finditer

## Find DD DU
from helper import preprocess_data
from patterns import find_drawdowns, find_drawups

## MATCHING
import name_matching
from name_matching import name_match
import transaction_matching
from transaction_matching import transaction_match

## CONSOLIDATION
from consolidation import combine_matches, consolidate_matches, find_attritions, find_new_accounts, get_attrition_status, get_new_account_status


In [3]:
# Fetch the Dataiku custom variables once and reuse the mapping below,
# instead of calling the API again for every single parameter.
_custom_variables = dataiku.get_custom_variables()

# Analysis window bounds (kept as returned; parsed with pd.to_datetime where used).
start_date = _custom_variables['start_date']
end_date = _custom_variables['end_date']

# Numeric parameters are cast with int()
# (presumably the variables are stored as strings -- TODO confirm).
consistency = int(_custom_variables['consistency'])
drawdown_period_average = int(_custom_variables['drawdown_period_average'])
drawdown = int(_custom_variables['drawdown'])
drawdown_fwd_check = int(_custom_variables['drawdown_fwd_check'])
drawdown_lookback_period = int(_custom_variables['drawdown_lookback_period'])
drawup_lookfwd_period = int(_custom_variables['drawup_lookfwd_period'])
statistics_period = int(_custom_variables['statistics_period'])
inactive_period = int(_custom_variables['inactive_period'])

## MATCHING VARIABLES
month_diff_h = int(_custom_variables['month_diff_h'])
month_diff_l = int(_custom_variables['month_diff_l'])
sd_mul = int(_custom_variables['sd_mul'])
max_city_distance = int(_custom_variables['max_city_distance'])
threshold_score_step1 = int(_custom_variables['threshold_score_step1'])
threshold_score_step2 = int(_custom_variables['threshold_score_step2'])

## RUN TYPE
run = _custom_variables['run_type']

In [5]:
# Load the recipe's input dataset into a pandas DataFrame
NAFCUSTOMER_ACTIVE_CARDS_FULL = dataiku.Dataset("NAFCUSTOMER_ACTIVE_CARDS_FULL")
NAFCUSTOMER_ACTIVE_CARDS_FULL_df = NAFCUSTOMER_ACTIVE_CARDS_FULL.get_dataframe()

# Quick size check: total row count, then number of distinct CUSTOMER values
print(NAFCUSTOMER_ACTIVE_CARDS_FULL_df.shape[0])
print(NAFCUSTOMER_ACTIVE_CARDS_FULL_df['CUSTOMER'].nunique(dropna=False))
NAFCUSTOMER_ACTIVE_CARDS_FULL_df.head()

16855726
657100


Unnamed: 0,CUSTOMER,REVENUE_YEAR,REVENUE_MONTH,ACTIVE_CARD_COUNT
0,HODA KARAM,2019,1,1.0
1,REIGH SERVICES INC,2019,1,2.0
2,STEPLIN CONSTRUCTION CORP,2019,1,1.0
3,REX A HYMEL CO INC,2019,1,1.0
4,HOMESTEAD DESIGN LLC,2019,1,1.0


In [7]:
# Shorter working name for the input frame (same object, not a copy)
df_v = NAFCUSTOMER_ACTIVE_CARDS_FULL_df
print(df_v.shape[0])

16855726


In [8]:
def date_tz_naive(pd_s):
    """Parse a Series of date-like values into timezone-naive datetimes.

    Parameters
    ----------
    pd_s : pandas.Series
        Values that ``pd.to_datetime`` can parse (strings or datetimes,
        timezone-aware or not).

    Returns
    -------
    pandas.Series
        Datetime Series with any timezone information removed; the
        wall-clock time of each value is preserved.
    """
    # Vectorised .dt accessor instead of a per-element Python lambda:
    # same result, but far cheaper on frames with millions of rows.
    return pd.to_datetime(pd_s).dt.tz_localize(None)

In [9]:
print(df_v.shape[0])
# Compose "<month>/01/<year>" text from the month and year columns and parse
# it straight into a timezone-naive datetime column (first day of the month).
df_v['REVENUE_DATE'] = date_tz_naive(
    df_v['REVENUE_MONTH'].astype(str) + "/01/" + df_v['REVENUE_YEAR'].astype(str)
)
print(df_v.shape[0])

16855726
16855726


In [10]:
print(df_v.shape[0])
# Keep rows whose REVENUE_DATE lies in [start_date, end_date] (both inclusive),
# then discard rows that have no CUSTOMER value.
in_window = df_v['REVENUE_DATE'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
df_v = df_v[in_window].copy().dropna(subset=['CUSTOMER'])
print(df_v.shape[0])

16855726
16453212


In [11]:
df_v['REVENUE_DATE'] = pd.to_datetime(df_v['REVENUE_DATE'])

# Only these three columns are needed from here on
df_v = df_v[['CUSTOMER', 'REVENUE_DATE', 'ACTIVE_CARD_COUNT']]

# Highest ACTIVE_CARD_COUNT observed per customer, one row per CUSTOMER
df_v_max = df_v[['CUSTOMER', 'ACTIVE_CARD_COUNT']]
df_max = (
    df_v_max
    .groupby('CUSTOMER', as_index=False)['ACTIVE_CARD_COUNT']
    .max()
    .rename(columns={'ACTIVE_CARD_COUNT': 'ACTIVE_CARD_MAX'})
)

In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from helper import *

def split_list(lst, n):
    '''
    Partition ``lst`` into ``n`` consecutive slices of near-equal length.

    The first ``len(lst) % n`` slices hold one extra element; when ``n``
    exceeds ``len(lst)`` the trailing slices are empty.
    '''
    base_size, remainder = divmod(len(lst), n)
    parts = []
    begin = 0
    for position in range(n):
        # Slices at the front absorb the leftover elements, one each
        finish = begin + base_size + (1 if position < remainder else 0)
        parts.append(lst[begin:finish])
        begin = finish
    return parts

In [13]:
# Parameters consumed by the draw-up detection cell that follows.
match_type = "program_flip"            # not referenced in the visible cells -- presumably used elsewhere; verify
period_start_date = start_date         # lower bound on REVENUE_DATE; also the cut-off for flagging a draw-up
period_end_date = None                 # None -> no upper bound is applied
drawup_window = drawup_lookfwd_period  # months between the draw-up date and the start of the statistics window
# statistics_period (read from the custom variables) is used unchanged; the
# former self-assignment `statistics_period = statistics_period` was a no-op.
split = None                           # None -> treated as 1, i.e. all customers in a single chunk

In [14]:
# Detect "draw-up" customers: customers whose first month with a positive
# ACTIVE_CARD_COUNT falls strictly after period_start_date. For each of them,
# compute the mean / std of ACTIVE_CARD_COUNT over a statistics window that
# starts `drawup_window` months after the draw-up date and spans
# `statistics_period` months.
period_start_date = pd.to_datetime(period_start_date)
df_v = df_v[df_v['REVENUE_DATE'] >= period_start_date].copy()

if period_end_date:
    period_end_date = pd.to_datetime(period_end_date)
    # The column is 'REVENUE_DATE' (upper case); the previous lower-case
    # spelling raised KeyError as soon as an end date was supplied.
    df_v = df_v[df_v['REVENUE_DATE'] <= period_end_date].copy()

all_account_ids = list(df_v['CUSTOMER'].unique())

if not split:
    split = 1

# Process customers in `split` chunks to bound peak memory usage
all_account_ids_n = list(split_list(all_account_ids, split))

# Per-chunk results are collected here and concatenated once after the loop
rise_frames = []

for sublist in tqdm(all_account_ids_n):

    du_find = df_v[df_v['CUSTOMER'].isin(sublist)].copy()

    ## Filter Non-Zero Records and find the first non zero transaction date
    du_find = du_find[du_find['ACTIVE_CARD_COUNT'] > 0].sort_values(['REVENUE_DATE'])

    du_agg = du_find.groupby(['CUSTOMER'], as_index=False)[['REVENUE_DATE']].min()

    # A customer is drawing up when its first active month is after the period start
    du_agg['DU_INDICATOR'] = du_agg['REVENUE_DATE'] > period_start_date
    du_agg = du_agg.rename(columns={'REVENUE_DATE': 'DU_DATE'})
    # The draw-up date is set to the month before the first active month
    du_agg['DU_DATE'] -= pd.DateOffset(months=1)
    du_agg = du_agg[du_agg['DU_INDICATOR']].drop_duplicates(['CUSTOMER'])

    ## list of customers who are drawing up
    du_customers = list(du_agg['CUSTOMER'])

    if len(du_customers) == 0:
        continue

    du_find = du_find[du_find['CUSTOMER'].isin(du_customers)]

    # Drop each customer's last (latest-dated) record. This is the vectorised
    # equivalent of groupby('CUSTOMER').apply(lambda g: g.iloc[:-1, 1:]) and
    # does not depend on column positions or on how groupby.apply treats the
    # grouping column.
    # NOTE(review): presumably the latest month is excluded on purpose -- confirm.
    rows_after = du_find.groupby('CUSTOMER').cumcount(ascending=False)
    du_find = du_find[(rows_after > 0).to_numpy()]

    du_find = du_find.merge(du_agg, on='CUSTOMER')

    # Statistics window: [DU_DATE + drawup_window, DU_DATE + drawup_window + statistics_period - 1] months
    du_find['DU_AVG_START'] = du_find['DU_DATE'] + pd.DateOffset(months=drawup_window)
    du_find['DU_AVG_END'] = du_find['DU_DATE'] + pd.DateOffset(months=drawup_window + statistics_period - 1)

    du_find_12 = du_find[du_find['REVENUE_DATE'].between(du_find['DU_AVG_START'], du_find['DU_AVG_END'])].copy()

    du_stat = du_find_12.groupby(['CUSTOMER'], as_index=False).agg({'ACTIVE_CARD_COUNT': ['mean', 'std']})

    du_stat.columns = ['CUSTOMER', 'mean_du', 'std_du']

    # Left merge: draw-up customers with no record in the window keep NaN statistics
    rise_df_ = du_agg.merge(du_stat, on='CUSTOMER', how='left')

    rise_frames.append(rise_df_)

if rise_frames:
    rise_df = pd.concat(rise_frames, ignore_index=True)
else:
    # No draw-ups found: keep the expected columns so the selection below still works
    rise_df = pd.DataFrame(columns=['CUSTOMER', 'DU_DATE', 'DU_INDICATOR', 'mean_du', 'std_du'])

rise_df = rise_df.rename(columns={'DU_DATE': 'DRAW_UP_DATE',
                                  'mean_du': 'MEAN_DU',
                                  'std_du': 'STD_DU'})

rise_df = rise_df[['CUSTOMER', 'DRAW_UP_DATE', 'MEAN_DU', 'STD_DU']]
rise_df.head()


100%|██████████| 1/1 [05:37<00:00, 337.45s/it]


Unnamed: 0,CUSTOMER,DRAW_UP_DATE,MEAN_DU,STD_DU
0,AIR CONDITION CO,2021-06-01,1.909091,0.301511
1,1109,2021-04-01,1.0,0.0
2,3D FASTENERS PLUS INC,2019-12-01,3.0,0.0
3,94775 J AND J SNACK FOODS AA,2022-04-01,1.0,
4,APOSTROPHE DESIGN INC,2019-02-01,2.75,0.452267


In [15]:
# Attach each customer's maximum active-card count; the row count is printed
# before and after so an unexpected change from the join is visible.
print(rise_df.shape[0])
rise_df = rise_df.merge(df_max, on='CUSTOMER', how='left')
print(rise_df.shape[0])
rise_df

351542
351542


Unnamed: 0,CUSTOMER,DRAW_UP_DATE,MEAN_DU,STD_DU,ACTIVE_CARD_MAX
0,AIR CONDITION CO,2021-06-01,1.909091,0.301511,3.0
1,1109,2021-04-01,1.000000,0.000000,1.0
2,3D FASTENERS PLUS INC,2019-12-01,3.000000,0.000000,17.0
3,94775 J AND J SNACK FOODS AA,2022-04-01,1.000000,,1.0
4,APOSTROPHE DESIGN INC,2019-02-01,2.750000,0.452267,4.0
...,...,...,...,...,...
351537,ZZR CONSTRUCTION CO LLC,2022-02-01,,,1.0
351538,ZZSTRANSPORT,2019-02-01,,,1.0
351539,ZZZ TRANSPORTATION INC,2022-07-01,,,1.0
351540,ZZZ TRUCKING LLC,2020-09-01,1.000000,0.000000,3.0


In [0]:
# The recipe output is the draw-up table built above
CALCULATED_CARD_DRAW_UPS_FULL_df = rise_df

# Write the DataFrame to the recipe's output dataset
CALCULATED_CARD_DRAW_UPS_FULL = dataiku.Dataset("CALCULATED_CARD_DRAW_UPS_FULL")
CALCULATED_CARD_DRAW_UPS_FULL.write_with_schema(CALCULATED_CARD_DRAW_UPS_FULL_df)