In [2]:
# load libraries

import numpy as np
import pandas as pd
from tqdm import tqdm

# my own custom functions
import EDA_functions as EDA
import cleaning_functions as cleaning

# visualization
import matplotlib.pyplot as plt
import seaborn as sns #, sns.set_style('whitegrid')
color = 'rebeccapurple'
%matplotlib inline

# display settings
from IPython.display import display
pd.options.display.max_columns = None

from pathlib import Path  # to make file path references relative to notebook directory

In [161]:
# import data

transcript_extended_hardcopy = Path.cwd() / "data" / "interim" / "transcript_extended.csv"
transcript_flagged_hardcopy = Path.cwd() / "data" / "interim" / "transcript_flagged.csv"
portfolio_file = Path.cwd() / "data" / "processed" / "portfolio_clean.csv"

# load 
transcript = pd.read_csv(transcript_extended_hardcopy)
transcript_flagged = pd.read_csv(transcript_flagged_hardcopy)
portfolio = pd.read_csv(portfolio_file, index_col = 0)

---

In [77]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3', 'p_200', 'p_10126', 'p_10', 'p_101', 'p_4'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)
display(transcript_sample.head())

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0.0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,0.0,,
2,offer received,p_1,336,,o_5,,,,,0.0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,0.0,,,,,
4,transaction,p_1,360,0.35,,,,,,0.0,,,,,


### Measure time for existing flagging formula

In [78]:
# """create sample dataframe with events of three random customers"""

# transcript_sample = transcript.loc[transcript['person_id'].isin(['p_10126'])]
# transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)



"""define function for activation flagging"""

def flag_for_activation(df):
    """Highlight / flag all transactions that are potentially activating customers. 
    This are the transactions between viewing and completion or if no completion
    between viewing and end of duration.
    
    INPUT:
        - df: dataframe, output from previous flagging function
      
    RETURNS:
        - df: dataframe with updated flags
    """
    
    for col in tqdm(df.columns[-10:]):
        index_start_list = []   
        
        for index in df.index:
            if df.loc[index, 'event'] == 'offer viewed' and df.loc[index, col] == 0:
                index_start = index
                index_start_list.append(index_start)

                for index_start in index_start_list: 
                    for idx in range(index_start, len(df), 1):
                        if df.loc[idx, 'event'] == 'offer received' and df.loc[idx, col] == 0:
                            index_stop = idx 
                            break
                        elif df.loc[idx, 'event'] == 'offer completed' and df.loc[idx, col] == 0:
                            index_stop = idx 
                            break
                        else:
                            index_stop = len(df) 
 
                    if index_stop < index_start:
                        index_stop = len(df)

                print(index, index_start, index_stop)

                for i in range(index_start, index_stop, 1):
                    if df.loc[i, col] == 0:
                        df.loc[i, col] = 1
    
    return df

In [79]:
# call function, measure runtime

import time
start = time.time()

test_flagged_1 = flag_for_activation(transcript_sample)

print('Duration: {} seconds'.format(time.time() - start))

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

103 103 106
26 26 110
37 37 66
84 84 92
93 93 95
33 33 47
49 49 51
77 77 78
79 79 104
109 109 110
3 3 80
10 10 15


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:00<00:00, 58.41it/s]

6 6 14
17 17 71
72 72 110
1 1 56
59 59 70


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:00<00:00, 35.53it/s]

24 24 54
98 98 110


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 36.33it/s]


Duration: 0.2812492847442627 seconds


In [137]:
transcript_flagged

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_4,0,,o_3,,,0.0,,,,,,,
1,offer viewed,p_4,6,,o_3,,,1.0,,,,,,,
2,transaction,p_4,132,19.89,,,,1.0,,,,,,,
3,offer completed,p_4,132,,o_3,,,0.0,,,,,,,
4,transaction,p_4,144,17.78,,,,0.0,,,,,,,
5,offer received,p_4,168,,o_10,,,,,,,,,,0.0
6,offer viewed,p_4,216,,o_10,,,,,,,,,,1.0
7,transaction,p_4,222,19.67,,,,,,,,,,,1.0
8,transaction,p_4,240,29.72,,,,,,,,,,,
9,transaction,p_4,378,23.93,,,,,,,,,,,


In [80]:
test_flagged_1.loc[test_flagged_1['person_id'] == 'p_101']

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
35,transaction,p_101,138,7.8,,,,,,,,,,,
36,offer received,p_101,168,,o_3,,,0.0,,,,,,,
37,offer viewed,p_101,168,,o_3,,,1.0,,,,,,,
38,transaction,p_101,222,4.85,,,,1.0,,,,,,,
39,transaction,p_101,240,4.33,,,,1.0,,,,,,,
40,transaction,p_101,318,2.08,,,,1.0,,,,,,,
41,transaction,p_101,330,1.78,,,,1.0,,,,,,,
42,transaction,p_101,342,5.49,,,,,,,,,,,
43,transaction,p_101,360,3.45,,,,,,,,,,,
44,offer received,p_101,408,,o_9,,,,,,,,,0.0,


### Bugfix - include completed

In [176]:

def flag_for_completed(df):
    """
    INPUT:
        - df: dataframe, output from previous flagging function
      
    RETURNS:
        - df: dataframe with updated flags
    """
    
    for col in tqdm(df.columns[-10:]):
        index_start_list = []   
        
        for index in df.index:
            if df.loc[index, 'event'] == 'offer viewed' and df.loc[index, col] == 1:
                index_start = index
                index_start_list.append(index_start)

                for index_start in index_start_list: 
                    for idx in range(index_start, df.index[-1], 1):
                        if df.loc[idx, 'event'] == 'offer received' and df.loc[idx, col] == 0:
                            break
                        elif df.loc[idx, 'event'] == 'offer completed' and df.loc[idx, col] == 0:
                                df.loc[idx, col] = 1
                                break
            
    return df

In [222]:

transcript_flagged.loc[285041 : , :]

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
285041,offer viewed,p_5229,612,,o_6,,,,,,1.0,,,,
285042,offer received,p_5260,168,,o_10,,,,,,,,,,0.0
285043,offer viewed,p_5260,168,,o_10,,,,,,,,,,1.0
285044,transaction,p_5260,198,0.36,,,,,,,,,,,1.0
285045,transaction,p_5260,324,3.24,,,,,,,,,,,
285046,offer received,p_5260,408,,o_6,,,,,,0.0,,,,
285047,offer viewed,p_5260,408,,o_6,,,,,,1.0,,,,
285048,transaction,p_5260,414,1.47,,,,,,,1.0,,,,
285049,transaction,p_5260,474,10.73,,,,,,,1.0,,,,
285050,offer completed,p_5260,474,,o_6,,,,,,0.0,,,,


In [223]:
to_be_flagged = transcript_flagged.loc[285042 : , :]

flag_15 = flag_for_completed(to_be_flagged)












A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s











 10%|████████▎                                                                          | 1/10 [01:14<11:12, 74.78s/it]










 20%|████████████████▌                                                                  | 2/10 [02:47<10:41, 80.13s/it]










 30%|████████████████████████▉                                                          | 3/10 [03:22<07:45, 66.48s/it]










 40%|█████████████████████████████████▏                                                 | 4/10 [04:46<07:10, 71.78s/it]










 50%|█████████████████████████████████████████▌                                         | 5/10 [05:03<04:37, 55.47s/it]










 60%|█████████████████████████████████████████████████▊      

In [220]:
print(len(flag_13))
print(flag_13.head(15))

24997
                  event person_id  time  amount offer_id  o_1  o_2  o_3  o_4  \
235047   offer received   p_16794     0     NaN     o_10  NaN  NaN  NaN  NaN   
235048      transaction   p_16794     0   10.65      NaN  NaN  NaN  NaN  NaN   
235049     offer viewed   p_16794    24     NaN     o_10  NaN  NaN  NaN  NaN   
235050   offer received   p_16794   168     NaN      o_8  NaN  NaN  NaN  NaN   
235051     offer viewed   p_16794   180     NaN      o_8  NaN  NaN  NaN  NaN   
235052      transaction   p_16794   246   22.50      NaN  NaN  NaN  NaN  NaN   
235053  offer completed   p_16794   246     NaN      o_8  NaN  NaN  NaN  NaN   
235054   offer received   p_16794   336     NaN      o_3  NaN  NaN  0.0  NaN   
235055     offer viewed   p_16794   360     NaN      o_3  NaN  NaN  1.0  NaN   
235056   offer received   p_16794   408     NaN      o_7  NaN  NaN  NaN  NaN   
235057     offer viewed   p_16794   438     NaN      o_7  NaN  NaN  NaN  NaN   
235058      transaction   p_16794 

In [224]:
flagged_new = pd.concat([flag_1, flag_2, flag_3, flag_4, flag_5, flag_6, flag_7, flag_8, flag_9, flag_10, flag_11, flag_12, flag_13, flag_14, flag_15])

In [227]:
assert list(flagged_new['person_id']) == list(transcript_flagged['person_id'])

In [229]:
transcript_flagged_new = Path.cwd() / "data" / "interim" / "transcript_flagged_v2.csv"
flagged_new.to_csv(transcript_flagged_new, index=False)

In [175]:
to_be_flagged.index[-1]

30006

In [160]:
# call function, measure runtime

import time
start = time.time()

flag_2 = flag_for_completed(to_be_flagged)

print('Duration: {} seconds'.format(time.time() - start))










  0%|                                                                                           | 0/10 [00:00<?, ?it/s]








 10%|████████▎                                                                          | 1/10 [00:00<00:02,  4.43it/s]








 20%|████████████████▌                                                                  | 2/10 [00:00<00:01,  4.58it/s]








 30%|████████████████████████▉                                                          | 3/10 [00:00<00:01,  4.99it/s]








 40%|█████████████████████████████████▏                                                 | 4/10 [00:00<00:01,  5.06it/s]








 50%|█████████████████████████████████████████▌                                         | 5/10 [00:00<00:00,  5.54it/s]








 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:01<00:00,  5.58it/s]








 70%|██████████████████████████████████████████████████████████                         | 7/10 

Duration: 1.9182631969451904 seconds


In [90]:
test_flagged_2.loc[test_flagged_2['person_id'] == 'p_4']

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
92,offer received,p_4,0,,o_3,,,0.0,,,,,,,
93,offer viewed,p_4,6,,o_3,,,1.0,,,,,,,
94,transaction,p_4,132,19.89,,,,1.0,,,,,,,
95,offer completed,p_4,132,,o_3,,,1.0,,,,,,,
96,transaction,p_4,144,17.78,,,,0.0,,,,,,,
97,offer received,p_4,168,,o_10,,,,,,,,,,0.0
98,offer viewed,p_4,216,,o_10,,,,,,,,,,1.0
99,transaction,p_4,222,19.67,,,,,,,,,,,1.0
100,transaction,p_4,240,29.72,,,,,,,,,,,
101,transaction,p_4,378,23.93,,,,,,,,,,,


In [None]:
"""define function for timespan flagging"""

# create a look-up dict containing the duration for the different offers
duration_dict = dict(zip(portfolio.index, portfolio['duration_hours']))

def flag_completed(df):
    """Create dummy columns for all promotions and set flag of value 0 if an event 
    belongs to a promotion or a transaction falls into the duration timespan.
    
    ARGUMENTS:
        - df: dataframe, containing the pre-cleaned transactions
        - columns_list: list, containing the columns for the extended df
        - duration_dict: dict, containing offer ids as key, durations as values
      
    RETURNS:
        - df_extended: dataframe with flagged dummy columns added
    """
    
    # initialize empty df with dummy columns
    df_extended = pd.DataFrame(columns=columns_list)

    for person in tqdm(df['person_id'].unique()):
        df_slice = df.loc[df['person_id'] == person]
    
        for index in df_slice.index:
            if df_slice.loc[index, 'event'] == 'offer received':
                o_id = df_slice.loc[index, 'offer_id']
                o_start = df_slice.loc[index, 'time']
                o_duration = duration_dict[o_id]
                o_end = o_start + o_duration - 1
#                 print(person, o_id, o_start, o_duration, o_end)

                df_slice[o_id] = np.where((df_slice['time'] >= o_start) & (df_slice['time'] <= o_end) \
                                         & ((df_slice['amount'] >= 0) | (df_slice['offer_id'] == o_id)), \
                                                     0, df_slice[o_id])
#         display(df_slice)
        df_extended = pd.concat([df_extended, df_slice], join='outer')
    
    return df_extended

In [7]:
BREAK here

SyntaxError: invalid syntax (<ipython-input-7-dc25f03f46ca>, line 1)

# Backup: old experimentation / function creation

In [None]:
# extend transcript with an empty column for each promotion
columns_list = list(transcript.columns)+ list(portfolio['offer_id'].values)
transcript = transcript.reindex(columns=columns_list)  # reindex creates df with all NaNs for new cols

In [None]:
# check result
display(transcript.head(2))

In [None]:
# make sample dataframe
transcript_sample = transcript.loc[transcript['person_id'].isin(['p_200', 'p_10126', 'p_1'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time'])
display(transcript_sample)

In [None]:
# flag for duration

duration_dict = dict(zip(portfolio['offer_id'], portfolio['duration_hours']))

transcript_extended = pd.DataFrame(columns=transcript.columns)

for person in transcript_sample['person_id'].unique():
    t_slice = transcript_sample.loc[transcript_sample['person_id'] == person]
    
    for index, row in t_slice.iterrows():
        if t_slice.loc[index, 'event'] == 'offer received':  
            o_id = t_slice.loc[index, 'offer_id']
            o_start = t_slice.loc[index, 'time']
            o_duration = duration_dict[o_id]
            o_end = o_start + o_duration - 1
            print(person, o_id, o_start, o_duration, o_end)
            
            t_slice[o_id] = np.where((t_slice['time'] >= o_start) & (t_slice['time'] <= o_end) \
                                     & ((t_slice['amount'] >= 0) | (t_slice['offer_id'] == o_id)), \
                                                 1, t_slice[o_id])
    display(t_slice)
    
    transcript_extended = pd.concat([transcript_extended, t_slice], join='outer')

In [None]:
duration_dict

In [None]:
transcript_extended

In [None]:
transcript_extended.reset_index(inplace=True, drop=True)

In [None]:
transcript_extended

In [None]:
transcript_extended[['event', 'person_id', 'offer_id', 'o_7']]

In [113]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3'])]  # ['p_200', 'p_10126', 'p_1']
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)
display(transcript_sample)

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0.0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,0.0,,
2,offer received,p_1,336,,o_5,,,,,0.0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,0.0,,,,,
4,transaction,p_1,360,0.35,,,,,,0.0,,,,,
5,offer received,p_1,408,,o_7,,,,,,,0.0,,,
6,offer viewed,p_1,408,,o_7,,,,,,,0.0,,,
7,transaction,p_1,414,0.74,,,,,,0.0,,0.0,,,
8,transaction,p_1,444,1.89,,,,,,0.0,,0.0,,,
9,offer received,p_1,504,,o_6,,,,,,0.0,,,,


In [189]:
"""create sample dataframe with events of three random customers"""

transcript_sample = transcript.loc[transcript['person_id'].isin(['p_1', 'p_2', 'p_3', 'p_200', 'p_10126', 'p_10', 'p_101', 'p_4'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time']).reset_index(drop=True)

# flag for activation

def function_v(transcript_extended):
    
    for col in transcript_extended.columns[-10:]:
        index_start_list = []   
        for index, row in transcript_extended.iterrows():

            if transcript_extended.loc[index, 'event'] == 'offer viewed' and transcript_extended.loc[index, col] == 0:
                index_start = index
                index_start_list.append(index_start)

                for index_start in index_start_list:    
                    for idx, roww in transcript_extended.loc[index_start: , :].iterrows():
                        if transcript_extended.loc[idx, 'event'] == 'offer received' and transcript_extended.loc[idx, col] == 0:
                            index_stop = idx-1
                            break
                        elif transcript_extended.loc[idx, 'event'] == 'offer completed' and transcript_extended.loc[idx, col] == 0:
                            index_stop = idx
                            break
                        else:
                            index_stop = len(transcript_extended) - 1 
                            
                    if index_stop < index_start:
                        index_stop = len(transcript_extended) -1

                print(index, index_start, index_stop)

#                 transcript_extended.loc[index_start : index_stop, col].apply(lambda x: 'v' if x == 1 else np.nan)
#                 transcript_extended.loc[index_start : index_stop, col].replace(1, 'v', inplace=True)
    
                for i, rowwww in transcript_extended.loc[index_start: index_stop].iterrows():
                    if transcript_extended.loc[i, col] == 0:
                        transcript_extended.loc[i, col] = 1

    return transcript_extended

In [190]:
import time
start = time.time()

transcript_test = function_v(transcript_extended = transcript_sample)

print('Duration: {} seconds'.format(time.time() - start))



103 103 106
26 26 109
37 37 65
84 84 91
93 93 95
33 33 46
49 49 51
77 77 77
79 79 103
109 109 109
3 3 79
10 10 15
6 6 14
17 17 70
72 72 109
1 1 55
59 59 69
24 24 53
98 98 109
Duration: 0.6101329326629639 seconds


In [96]:
transcript_test

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,v,,
2,offer received,p_1,336,,o_5,,,,,0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,v,,,,,
4,transaction,p_1,360,0.35,,,,,,v,,,,,
5,offer received,p_1,408,,o_7,,,,,,,0,,,
6,offer viewed,p_1,408,,o_7,,,,,,,v,,,
7,transaction,p_1,414,0.74,,,,,,v,,v,,,
8,transaction,p_1,444,1.89,,,,,,v,,v,,,
9,offer received,p_1,504,,o_6,,,,,,0,,,,


In [21]:
ttt = False
print(ttt > 0)

False
