In [37]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

import pickle
from dateutil.relativedelta import relativedelta
import gc
from re import finditer

## Find DD DU
from helper import preprocess_data
from patterns import find_drawdowns, find_drawups

## MATCHING
import name_matching
from name_matching import name_match
import transaction_matching
from transaction_matching import transaction_match

## CONSOLIDATION
from consolidation import combine_matches, consolidate_matches, find_attritions, find_new_accounts, get_attrition_status, get_new_account_status

In [38]:
start_date = dataiku.get_custom_variables()['start_date']
end_date = dataiku.get_custom_variables()['end_date']

consistency = int(dataiku.get_custom_variables()['consistency'])
drawdown_period_average = int(dataiku.get_custom_variables()['drawdown_period_average'])
drawdown = int(dataiku.get_custom_variables()['drawdown'])
drawdown_fwd_check = int(dataiku.get_custom_variables()['drawdown_fwd_check'])
drawdown_lookback_period = int(dataiku.get_custom_variables()['drawdown_lookback_period'])
drawup_lookfwd_period = int(dataiku.get_custom_variables()['drawup_lookfwd_period'])
statistics_period = int(dataiku.get_custom_variables()['statistics_period'])
inactive_period = int(dataiku.get_custom_variables()['inactive_period'])

## MATCHING VARIABLES
month_diff_h = int(dataiku.get_custom_variables()['month_diff_h'])
month_diff_l = int(dataiku.get_custom_variables()['month_diff_l'])
sd_mul = int(dataiku.get_custom_variables()['sd_mul'])
max_city_distance = int(dataiku.get_custom_variables()['max_city_distance'])
threshold_score_step1 = int(dataiku.get_custom_variables()['threshold_score_step1'])
threshold_score_step2 = int(dataiku.get_custom_variables()['threshold_score_step2'])

## RUN TYPE
run = dataiku.get_custom_variables()['run_type']

In [39]:
def date_tz_naive(pd_s):
    return pd.to_datetime(pd_s).apply(lambda x:x.tz_localize(None))

In [40]:
# Read recipe inputs
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED = dataiku.Dataset("NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED")
NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED.get_dataframe()

In [42]:
df_v = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df

print(len(df_v))
df_v['REVENUE_DATE'] = df_v.REVENUE_MONTH.astype(str) + "/01/" + df_v.REVENUE_YEAR.astype(str)
df_v['REVENUE_DATE'] = date_tz_naive(df_v['REVENUE_DATE'])
print(len(df_v))

df_v = df_v[['CUSTOMER','REVENUE_DATE', 'ACTIVE_CARD_COUNT']]

df_cust =  df_v.groupby(by=["CUSTOMER","REVENUE_DATE"]).sum().reset_index()
df_cust.head()

21965
21965


Unnamed: 0,CUSTOMER,REVENUE_DATE,ACTIVE_CARD_COUNT
0,3GPP,2020-09-01,16.0
1,3GPP,2020-10-01,22.0
2,3GPP,2020-11-01,19.0
3,3GPP,2020-12-01,27.0
4,3GPP,2021-01-01,25.0


In [43]:
print(len(df_v))
df_v = df_v[df_v['REVENUE_DATE'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))].copy()
df_v = df_v.dropna(subset=['CUSTOMER'])
print(len(df_v))

21965
21434


In [44]:
df_v['REVENUE_DATE'] = pd.to_datetime(df_v['REVENUE_DATE'])

In [45]:
df_v.sort_values(['REVENUE_DATE'], inplace=True)

In [47]:
df_v.head()

Unnamed: 0,CUSTOMER,REVENUE_DATE,ACTIVE_CARD_COUNT
0,GENERAL MILLS,2019-01-01,1398.0
297,VISTAWALL,2019-01-01,1.0
296,VOLVO CE,2019-01-01,0.0
295,BAYER (3M33),2019-01-01,1748.0
294,CARGILL INC,2019-01-01,203.0


In [48]:
seen_accounts = df_v[df_v['ACTIVE_CARD_COUNT'] > 0].groupby(['CUSTOMER'], as_index=False)[['REVENUE_DATE']].first()
seen_accounts['FIRST_DATE'] = seen_accounts['REVENUE_DATE'] - pd.DateOffset(months=1)

In [49]:
df_v.REVENUE_DATE.value_counts(dropna=False)
print(len(df_v))

21434


In [50]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from helper import *

In [51]:
df = df_v
period_end_date = end_date
match_type = 'program_flip'
period_start_date=None
split=None

In [53]:
drawdown = (100 - drawdown)/100
drawdown_fwd_check /= 100

inactive_date_start = pd.to_datetime(period_end_date) + relativedelta(months=-inactive_period)

df = df[df['REVENUE_DATE'] <= period_end_date].copy()

if period_start_date:
    period_start_date = pd.to_datetime(period_start_date)
    df = df[df['REVENUE_DATE'] >= period_start_date].copy()

all_account_ids = list(df['CUSTOMER'].unique())

if not split:
    split=1

all_account_ids_n = list(split_list(all_account_ids, split))

drop_df = pd.DataFrame()

In [0]:
def find_consistent_cust(df, consecutive=3):
    '''returns a list of customers who are consistent for 3 (default value) months'''
    
    ## Needs only these columns ['customer_account_name', 'revenue_month', 'purchase_gallons_qty']
    
    df = df[['CUSTOMER_ACCOUNT_ID', 'REVENUE_DATE', 'PURCHASE_GALLONS_QTY']].copy()
    df.sort_values(by=['CUSTOMER_ACCOUNT_ID', 'REVENUE_DATE'], inplace=True)
    
    z = (df.groupby(['CUSTOMER_ACCOUNT_ID'])['REVENUE_DATE'].diff(1)/np.timedelta64(1, 'M'))
    z = z.round(0)
    z = (z == 1).astype('int')
    df['CUST_CONS'] = (z * (z.groupby((z != z.shift()).cumsum()).cumcount() + 2))
    cust_cons = df.groupby('CUSTOMER_ACCOUNT_ID')['CUST_CONS'].max()
    
    return list(cust_cons[cust_cons>=consecutive].index)

In [56]:
for sublist in tqdm(all_account_ids_n):

    dd_find = df[df['CUSTOMER'].isin(sublist)].copy()
    
    consistent_customers_dd = find_consistent_cust(dd_find, consecutive=consistency)
    if len(consistent_customers_dd) == 0:
        continue


  0%|          | 0/1 [00:00<?, ?it/s]


KeyError: "['PURCHASE_GALLONS_QTY', 'CUSTOMER_ACCOUNT_ID'] not in index"

In [0]:
# Compute recipe outputs from inputs
# TODO: Replace this part by your actual code that computes the output, as a Pandas dataframe
# NB: DSS also supports other kinds of APIs for reading and writing data. Please see doc.

#CALCULATED_CARD_DRAW_DOWNS_df = NAFCUSTOMER_REVENUE_BY_CUSTOMER_LIMITED_df # For this sample code, simply copy input to output

# Write recipe outputs
#CALCULATED_CARD_DRAW_DOWNS = dataiku.Dataset("CALCULATED_CARD_DRAW_DOWNS")
#CALCULATED_CARD_DRAW_DOWNS.write_with_schema(CALCULATED_CARD_DRAW_DOWNS_df)