# Assess & Clean Data

Load raw data, assess, clean and save in proper sets.


### Data Sources

Cleaned datafiles, as prepared in notebook 1:
- portfolio_clean.csv - containing offer ids and meta data about each offer (duration, type, etc.)
- profile_clean.csv - demographic data for each customer
- transcript_clean.csv - records for transactions, offers received, offers viewed, and offers completed

### Changes

- 2019-01-02: Started notebook



In [53]:
# load libraries

import numpy as np
import pandas as pd
from tqdm import tqdm

# my own custom functions
import EDA_functions as EDA
import cleaning_functions as cleaning

# visualization
import matplotlib.pyplot as plt
import seaborn as sns #, sns.set_style('whitegrid')
color = 'rebeccapurple'
%matplotlib inline

# display settings
from IPython.display import display
pd.options.display.max_columns = None

from pathlib import Path  # to make file path references relative to notebook directory

In [54]:
# import data

portfolio_file = Path.cwd() / "data" / "processed" / "portfolio_clean.csv"
profile_file = Path.cwd() / "data" / "processed" / "profile_clean.csv"
transcript_file = Path.cwd() / "data" / "processed" / "transcript_clean.csv"

portfolio = pd.read_csv(portfolio_file)
profile = pd.read_csv(profile_file)
transcript = pd.read_csv(transcript_file)

## Explore transcript data

In [55]:
display(transcript.sample(10))
display(transcript.info())

Unnamed: 0,event,person_id,time,amount,offer_id
288803,transaction,p_8735,642,3.14,
174799,transaction,p_2697,426,3.52,
271662,offer completed,p_16257,594,,o_8
54880,offer received,p_2294,168,,o_8
270993,transaction,p_12218,594,28.59,
188915,transaction,p_14708,456,19.45,
267132,offer completed,p_8312,588,,o_4
217828,transaction,p_14578,504,1.18,
12132,offer received,p_16354,0,,o_8
218486,offer completed,p_476,510,,o_5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
event        306534 non-null object
person_id    306534 non-null object
time         306534 non-null int64
amount       138953 non-null float64
offer_id     167581 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ MB


None

In [56]:
""" change dtypes"""

transcript = cleaning.change_dtypes(transcript, cols_to_category=['event', 'offer_id'])

# safety-check
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
event        306534 non-null category
person_id    306534 non-null object
time         306534 non-null int64
amount       138953 non-null float64
offer_id     167581 non-null category
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 7.6+ MB


In [57]:
# list possible events
transcript['event'].unique()

[offer received, offer viewed, transaction, offer completed]
Categories (4, object): [offer received, offer viewed, transaction, offer completed]

In [58]:
# extend transcript with an empty column for each promotion
columns_list = list(transcript.columns)+ list(portfolio['offer_id'].values)
transcript = transcript.reindex(columns=columns_list)

In [59]:
# check result
display(transcript.head(2))

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_4,0,,o_3,,,,,,,,,,
1,offer received,p_5,0,,o_5,,,,,,,,,,


### Look at events for three customers

In [60]:
transcript_sample = transcript.loc[transcript['person_id'].isin(['p_200', 'p_10126', 'p_1'])]
transcript_sample = transcript_sample.sort_values(['person_id', 'time'])
display(transcript_sample)

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
53174,offer received,p_1,168,,o_8,,,,,,,,,,
85290,offer viewed,p_1,216,,o_8,,,,,,,,,,
110828,offer received,p_1,336,,o_5,,,,,,,,,,
130147,offer viewed,p_1,348,,o_5,,,,,,,,,,
135224,transaction,p_1,360,0.35,,,,,,,,,,,
150596,offer received,p_1,408,,o_7,,,,,,,,,,
163374,offer viewed,p_1,408,,o_7,,,,,,,,,,
167626,transaction,p_1,414,0.74,,,,,,,,,,,
182544,transaction,p_1,444,1.89,,,,,,,,,,,
201570,offer received,p_1,504,,o_6,,,,,,,,,,


In [79]:
duration_dict = dict(zip(portfolio['offer_id'], portfolio['duration_hours']))

transcript_extended = pd.DataFrame(columns=transcript.columns)

for person in transcript_sample['person_id'].unique():
    t_slice = transcript_sample.loc[transcript_sample['person_id'] == person]
    
    for index, row in t_slice.iterrows():
        if t_slice.loc[index, 'event'] == 'offer received':  ### MAYBE TO CHANGE TO OFFERS VIEWED, BUT STARTTIME HAS TO STAY HERE
            o_id = t_slice.loc[index, 'offer_id']
            o_start = t_slice.loc[index, 'time']
            o_duration = duration_dict[o_id]
            o_end = o_start + o_duration - 1
            print(person, o_id, o_start, o_duration, o_end)
            
            t_slice[o_id] = np.where((t_slice['time'] >= o_start) & (t_slice['time'] <= o_end) \
                                     & ((t_slice['amount'] >= 0) | (t_slice['offer_id'] == o_id)), \
                                                 1, t_slice[o_id])
    display(t_slice)
    
    transcript_extended = pd.concat([transcript_extended, t_slice], join='outer')

p_1 o_8 168 168 335


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


p_1 o_5 336 240 575
p_1 o_7 408 240 647
p_1 o_6 504 168 671
p_1 o_7 576 240 815


Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
53174,offer received,p_1,168,,o_8,,,,,,,,1.0,,
85290,offer viewed,p_1,216,,o_8,,,,,,,,1.0,,
110828,offer received,p_1,336,,o_5,,,,,1.0,,,,,
130147,offer viewed,p_1,348,,o_5,,,,,1.0,,,,,
135224,transaction,p_1,360,0.35,,,,,,1.0,,,,,
150596,offer received,p_1,408,,o_7,,,,,,,1.0,,,
163374,offer viewed,p_1,408,,o_7,,,,,,,1.0,,,
167626,transaction,p_1,414,0.74,,,,,,1.0,,1.0,,,
182544,transaction,p_1,444,1.89,,,,,,1.0,,1.0,,,
201570,offer received,p_1,504,,o_6,,,,,,1.0,,,,


p_10126 o_10 168 72 239
p_10126 o_8 408 168 575


Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
19408,transaction,p_10126,12,22.32,,,,,,,,,,,
60628,offer received,p_10126,168,,o_10,,,,,,,,,,1.0
98432,offer viewed,p_10126,264,,o_10,,,,,,,,,,
158080,offer received,p_10126,408,,o_8,,,,,,,,1.0,,
186196,transaction,p_10126,450,31.33,,,,,,,,,1.0,,
186197,offer completed,p_10126,450,,o_8,,,,,,,,1.0,,
188370,offer viewed,p_10126,456,,o_8,,,,,,,,1.0,,
190424,transaction,p_10126,462,26.13,,,,,,,,,1.0,,
194264,transaction,p_10126,474,28.11,,,,,,,,,1.0,,
216733,transaction,p_10126,504,35.4,,,,,,,,,1.0,,


p_200 o_8 0 168 167
p_200 o_7 168 240 407
p_200 o_4 408 120 527
p_200 o_4 504 120 623
p_200 o_5 576 240 815


Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
138,offer received,p_200,0,,o_8,,,,,,,,1.0,,
53314,offer received,p_200,168,,o_7,,,,,,,1.0,,,
77230,offer viewed,p_200,192,,o_7,,,,,,,1.0,,,
81416,transaction,p_200,204,2.32,,,,,,,,1.0,,,
108931,transaction,p_200,324,3.55,,,,,,,,1.0,,,
141586,transaction,p_200,378,3.37,,,,,,,,1.0,,,
150749,offer received,p_200,408,,o_4,,,,1.0,,,,,,
174391,offer viewed,p_200,426,,o_4,,,,1.0,,,,,,
201712,offer received,p_200,504,,o_4,,,,1.0,,,,,,
218437,offer viewed,p_200,510,,o_4,,,,1.0,,,,,,


In [62]:
duration_dict

{'o_1': 168,
 'o_10': 72,
 'o_2': 120,
 'o_3': 168,
 'o_4': 120,
 'o_5': 240,
 'o_6': 168,
 'o_7': 240,
 'o_8': 168,
 'o_9': 96}

In [63]:
transcript_extended

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
53174,offer received,p_1,168,,o_8,,,,,,,,1.0,,
85290,offer viewed,p_1,216,,o_8,,,,,,,,1.0,,
110828,offer received,p_1,336,,o_5,,,,,1.0,,,,,
130147,offer viewed,p_1,348,,o_5,,,,,1.0,,,,,
135224,transaction,p_1,360,0.35,,,,,,1.0,,,,,
150596,offer received,p_1,408,,o_7,,,,,,,1.0,,,
163374,offer viewed,p_1,408,,o_7,,,,,,,1.0,,,
167626,transaction,p_1,414,0.74,,,,,,1.0,,1.0,,,
182544,transaction,p_1,444,1.89,,,,,,1.0,,1.0,,,
201570,offer received,p_1,504,,o_6,,,,,,1.0,,,,


In [64]:
transcript_extended.reset_index(inplace=True, drop=True)

In [65]:
transcript_extended

Unnamed: 0,event,person_id,time,amount,offer_id,o_1,o_2,o_3,o_4,o_5,o_6,o_7,o_8,o_9,o_10
0,offer received,p_1,168,,o_8,,,,,,,,1.0,,
1,offer viewed,p_1,216,,o_8,,,,,,,,1.0,,
2,offer received,p_1,336,,o_5,,,,,1.0,,,,,
3,offer viewed,p_1,348,,o_5,,,,,1.0,,,,,
4,transaction,p_1,360,0.35,,,,,,1.0,,,,,
5,offer received,p_1,408,,o_7,,,,,,,1.0,,,
6,offer viewed,p_1,408,,o_7,,,,,,,1.0,,,
7,transaction,p_1,414,0.74,,,,,,1.0,,1.0,,,
8,transaction,p_1,444,1.89,,,,,,1.0,,1.0,,,
9,offer received,p_1,504,,o_6,,,,,,1.0,,,,


In [66]:
transcript_extended[['event', 'person_id', 'offer_id', 'o_7']]

Unnamed: 0,event,person_id,offer_id,o_7
0,offer received,p_1,o_8,
1,offer viewed,p_1,o_8,
2,offer received,p_1,o_5,
3,offer viewed,p_1,o_5,
4,transaction,p_1,,
5,offer received,p_1,o_7,1.0
6,offer viewed,p_1,o_7,1.0
7,transaction,p_1,,1.0
8,transaction,p_1,,1.0
9,offer received,p_1,o_6,


In [77]:
for col in transcript_extended.columns[-10:]:
    index_start_list = []   
    for index, row in transcript_extended.iterrows():
        
        
        if transcript_extended.loc[index, 'event'] == 'offer viewed' and transcript_extended.loc[index, col] == 1:
            index_start = index
            index_start_list.append(index_start)

            for index_start in index_start_list:    
                for idx, roww in transcript_extended.loc[index_start: , :].iterrows():
                    if transcript_extended.loc[idx, 'event'] == 'offer received' and transcript_extended.loc[idx, col] == 1:
                        index_stop = idx-1
                        break
                    
                if index_stop < index_start:
                    index_stop = len(transcript_extended) -1
                    
            print(index, index_start, index_stop)

39 39 39
41 41 43
3 3 41
10 10 41
6 6 15
17 17 32
34 34 43
1 1 23
27 27 31


In [None]:
len(transcript_extended)

In [32]:
index_start_list

[]