In [80]:
# load libraries

import numpy as np
import pandas as pd
from tqdm import tqdm

# my own custom functions
import EDA_functions as EDA
import cleaning_functions as cleaning

# visualization
import matplotlib.pyplot as plt
import seaborn as sns #, sns.set_style('whitegrid')
color = 'rebeccapurple'
%matplotlib inline

# display settings
from IPython.display import display
pd.options.display.max_columns = None

from pathlib import Path  # to make file path references relative to notebook directory

In [81]:
# import data

transcript_extended_hardcopy = Path.cwd() / "data" / "interim" / "transcript_extended.csv"
## transcript_extended.to_csv(transcript_extended_hardcopy, index=False)

# load hardcopy
transcript = pd.read_csv(transcript_extended_hardcopy)

---

In [179]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3', 'p_200', 'p_10126', 'p_10', 'p_101', 'p_4'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)
display(transcript_sample.head())

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0.0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,0.0,,
2,offer received,p_1,336,,o_5,,,,,0.0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,0.0,,,,,
4,transaction,p_1,360,0.35,,,,,,0.0,,,,,


### Measure time for existing flagging formula

In [232]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_10126'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)



"""define function for activation flagging"""

def flag_for_activation(df):
    """Highlight / flag all transactions that are potentially activating customers. 
    This are the transactions between viewing and completion or if no completion
    between viewing and end of duration.
    
    INPUT:
        - df: dataframe, output from previous flagging function
      
    RETURNS:
        - df: dataframe with updated flags
    """
    
    for col in tqdm(df.columns[-10:]):
        index_start_list = []   
        
        for index in df.index:
            if df.loc[index, 'event'] == 'offer viewed' and df.loc[index, col] == 0:
                index_start = index
                index_start_list.append(index_start)

                for index_start in index_start_list: 
                    for idx in range(index_start, len(df), 1):
                        if df.loc[idx, 'event'] == 'offer received' and df.loc[idx, col] == 0:
                            index_stop = idx 
                            break
                        elif df.loc[idx, 'event'] == 'offer completed' and df.loc[idx, col] == 0:
                            index_stop = idx 
                            break
                        else:
                            index_stop = len(df) 
 
                    if index_stop < index_start:
                        index_stop = len(df)

                print(index, index_start, index_stop)

                for i in range(index_start, index_stop, 1):
                    if df.loc[i, col] == 0:
                        df.loc[i, col] = 1
    
    return df

In [233]:
# call function, measure runtime

import time
start = time.time()

test_flagged_1 = flag_for_activation(transcript_sample)

print('Duration: {} seconds'.format(time.time() - start))




  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

6 6 11





100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 640.22it/s]

Duration: 0.015619516372680664 seconds


In [234]:
test_flagged_1.loc[test_flagged_1['person_id'] == 'p_10126']

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,transaction,p_10126,12,22.32,,,,,,,,,,,
1,offer received,p_10126,168,,o_10,,,,,,,,,,0.0
2,offer viewed,p_10126,264,,o_10,,,,,,,,,,
3,offer received,p_10126,408,,o_8,,,,,,,,0.0,,
4,transaction,p_10126,450,31.33,,,,,,,,,0.0,,
5,offer completed,p_10126,450,,o_8,,,,,,,,0.0,,
6,offer viewed,p_10126,456,,o_8,,,,,,,,1.0,,
7,transaction,p_10126,462,26.13,,,,,,,,,1.0,,
8,transaction,p_10126,474,28.11,,,,,,,,,1.0,,
9,transaction,p_10126,504,35.4,,,,,,,,,1.0,,


In [210]:
print([x for x in range(17,18,1)])

[17]


In [7]:
BREAK here

SyntaxError: invalid syntax (<ipython-input-7-dc25f03f46ca>, line 1)

# Backup: old experimentation / function creation

In [None]:
# extend transcript with an empty column for each promotion
columns_list = list(transcript.columns)+ list(portfolio['offer_id'].values)
transcript = transcript.reindex(columns=columns_list)  # reindex creates df with all NaNs for new cols

In [None]:
# check result
display(transcript.head(2))

In [None]:
# make sample dataframe
transcript_sample = transcript.loc[transcript['person_id'].isin(['p_200', 'p_10126', 'p_1'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time'])
display(transcript_sample)

In [None]:
# flag for duration

duration_dict = dict(zip(portfolio['offer_id'], portfolio['duration_hours']))

transcript_extended = pd.DataFrame(columns=transcript.columns)

for person in transcript_sample['person_id'].unique():
    t_slice = transcript_sample.loc[transcript_sample['person_id'] == person]
    
    for index, row in t_slice.iterrows():
        if t_slice.loc[index, 'event'] == 'offer received':  
            o_id = t_slice.loc[index, 'offer_id']
            o_start = t_slice.loc[index, 'time']
            o_duration = duration_dict[o_id]
            o_end = o_start + o_duration - 1
            print(person, o_id, o_start, o_duration, o_end)
            
            t_slice[o_id] = np.where((t_slice['time'] >= o_start) & (t_slice['time'] <= o_end) \
                                     & ((t_slice['amount'] >= 0) | (t_slice['offer_id'] == o_id)), \
                                                 1, t_slice[o_id])
    display(t_slice)
    
    transcript_extended = pd.concat([transcript_extended, t_slice], join='outer')

In [None]:
duration_dict

In [None]:
transcript_extended

In [None]:
transcript_extended.reset_index(inplace=True, drop=True)

In [None]:
transcript_extended

In [None]:
transcript_extended[['event', 'person_id', 'offer_id', 'o_7']]

In [113]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3'])]  # ['p_200', 'p_10126', 'p_1']
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)
display(transcript_sample)

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0.0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,0.0,,
2,offer received,p_1,336,,o_5,,,,,0.0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,0.0,,,,,
4,transaction,p_1,360,0.35,,,,,,0.0,,,,,
5,offer received,p_1,408,,o_7,,,,,,,0.0,,,
6,offer viewed,p_1,408,,o_7,,,,,,,0.0,,,
7,transaction,p_1,414,0.74,,,,,,0.0,,0.0,,,
8,transaction,p_1,444,1.89,,,,,,0.0,,0.0,,,
9,offer received,p_1,504,,o_6,,,,,,0.0,,,,


In [189]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3', 'p_200', 'p_10126', 'p_10', 'p_101', 'p_4'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)

# flag for activation

def function_v(transcript_extended):
    
    for col in transcript_extended.columns[-10:]:
        index_start_list = []   
        for index, row in transcript_extended.iterrows():

            if transcript_extended.loc[index, 'event'] == 'offer viewed' and transcript_extended.loc[index, col] == 0:
                index_start = index
                index_start_list.append(index_start)

                for index_start in index_start_list:    
                    for idx, roww in transcript_extended.loc[index_start: , :].iterrows():
                        if transcript_extended.loc[idx, 'event'] == 'offer received' and transcript_extended.loc[idx, col] == 0:
                            index_stop = idx-1
                            break
                        elif transcript_extended.loc[idx, 'event'] == 'offer completed' and transcript_extended.loc[idx, col] == 0:
                            index_stop = idx
                            break
                        else:
                            index_stop = len(transcript_extended) - 1 
                            
                    if index_stop < index_start:
                        index_stop = len(transcript_extended) -1

                print(index, index_start, index_stop)

#                 transcript_extended.loc[index_start : index_stop, col].apply(lambda x: 'v' if x == 1 else np.nan)
#                 transcript_extended.loc[index_start : index_stop, col].replace(1, 'v', inplace=True)
    
                for i, rowwww in transcript_extended.loc[index_start: index_stop].iterrows():
                    if transcript_extended.loc[i, col] == 0:
                        transcript_extended.loc[i, col] = 1

    return transcript_extended

In [190]:
import time
start = time.time()

transcript_test = function_v(transcript_extended = transcript_sample)

print('Duration: {} seconds'.format(time.time() - start))



103 103 106
26 26 109
37 37 65
84 84 91
93 93 95
33 33 46
49 49 51
77 77 77
79 79 103
109 109 109
3 3 79
10 10 15
6 6 14
17 17 70
72 72 109
1 1 55
59 59 69
24 24 53
98 98 109
Duration: 0.6101329326629639 seconds


In [96]:
transcript_test

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,v,,
2,offer received,p_1,336,,o_5,,,,,0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,v,,,,,
4,transaction,p_1,360,0.35,,,,,,v,,,,,
5,offer received,p_1,408,,o_7,,,,,,,0,,,
6,offer viewed,p_1,408,,o_7,,,,,,,v,,,
7,transaction,p_1,414,0.74,,,,,,v,,v,,,
8,transaction,p_1,444,1.89,,,,,,v,,v,,,
9,offer received,p_1,504,,o_6,,,,,,0,,,,


In [21]:
ttt = False
print(ttt > 0)

False
