# Assess & Clean Data

Load raw data, assess, clean and save in proper sets.


### Data Sources

Cleaned datafiles, as prepared in notebook 1:
- portfolio_clean.csv - containing offer ids and meta data about each offer (duration, type, etc.)
- profile_clean.csv - demographic data for each customer
- transcript_clean.csv - records for transactions, offers received, offers viewed, and offers completed

### Changes

- 2019-01-02: Started notebook



In [1]:
# load libraries

import numpy as np
import pandas as pd
from tqdm import tqdm

# my own custom functions
import EDA_functions as EDA
import cleaning_functions as cleaning

# visualization
import matplotlib.pyplot as plt
import seaborn as sns #, sns.set_style('whitegrid')
color = 'rebeccapurple'
%matplotlib inline

# display settings
from IPython.display import display
pd.options.display.max_columns = None

from pathlib import Path  # to make file path references relative to notebook directory

In [2]:
# import data

portfolio_file = Path.cwd() / "data" / "processed" / "portfolio_clean.csv"
profile_file = Path.cwd() / "data" / "processed" / "profile_clean.csv"
transcript_file = Path.cwd() / "data" / "processed" / "transcript_clean.csv"

portfolio = pd.read_csv(portfolio_file)
profile = pd.read_csv(profile_file)
transcript = pd.read_csv(transcript_file)

## Explore transcript data

In [3]:
display(transcript.sample(10))
display(transcript.info())

Unnamed: 0,event,person_id,time,amount,offer_id
244822,transaction,p_14211,570,22.49,
207591,offer received,p_8238,504,,o_1
56515,offer received,p_4487,168,,o_4
161678,offer received,p_14859,408,,o_8
180099,transaction,p_1138,438,4.61,
87146,transaction,p_115,222,20.8,
285430,offer completed,p_12530,630,,o_1
229064,transaction,p_8174,528,20.7,
255900,offer received,p_14700,576,,o_6
291204,transaction,p_14173,648,8.03,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
event        306534 non-null object
person_id    306534 non-null object
time         306534 non-null int64
amount       138953 non-null float64
offer_id     167581 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ MB


None

In [4]:
""" change dtypes"""

transcript = cleaning.change_dtypes(transcript, cols_to_category=['event', 'offer_id'])

# safety-check
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
event        306534 non-null category
person_id    306534 non-null object
time         306534 non-null int64
amount       138953 non-null float64
offer_id     167581 non-null category
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 7.6+ MB


In [5]:
# list possible events
transcript['event'].unique()

[offer received, offer viewed, transaction, offer completed]
Categories (4, object): [offer received, offer viewed, transaction, offer completed]

### Look at events for three customers

In [24]:
transcript_sample = transcript.loc[transcript['person_id'].isin(['p_200', 'p_10126', 'p_1'])]
transcript_sample.sort_values(['person_id', 'time'])

Unnamed: 0,event,person_id,time,amount,offer_id
53174,offer received,p_1,168,,o_8
85290,offer viewed,p_1,216,,o_8
110828,offer received,p_1,336,,o_5
130147,offer viewed,p_1,348,,o_5
135224,transaction,p_1,360,0.35,
150596,offer received,p_1,408,,o_7
163374,offer viewed,p_1,408,,o_7
167626,transaction,p_1,414,0.74,
182544,transaction,p_1,444,1.89,
201570,offer received,p_1,504,,o_6


In [34]:
transcript_extended = pd.DataFrame(columns=transcript.columns)
duration_dict = dict(zip(portfolio['offer_id'], portfolio['duration_hours']))

for person in transcript_sample['person_id'].unique():
    t_slice = transcript_sample.loc[transcript_sample['person_id'] == person]
    
    for index, row in t_slice.iterrows():
        if t_slice.loc[index, 'event'] == 'offer received':  ### MAYBE TO CHANGE TO OFFERS VIEWED, BUT STARTTIME HAS TO STAY HERE
            o_id = t_slice.loc[index, 'offer_id']
            o_start = t_slice.loc[index, 'time']
            o_duration = duration_dict[o_id]
            o_end = o_start + o_duration - 1
            print(person, o_id, o_start, o_duration, o_end)
            
### FALSCH        t_slice[o_id] = np.where((t_slice['time'] >= o_start) & (t_slice['time'] <= o_end), 1, np.NaN)
    display(t_slice)
    
    transcript_extended = pd.concat([transcript_extended, t_slice], join='outer')

p_200 o_8 0 168 167


KeyError: 'o_8'

In [32]:
transcript_extended

Unnamed: 0,amount,event,o_10,o_4,o_5,o_6,o_7,o_8,offer_id,person_id,time
138,,offer received,,,,,,1.0,o_8,p_200,0
53314,,offer received,,,,,1.0,,o_7,p_200,168
77230,,offer viewed,,,,,1.0,,o_7,p_200,192
81416,2.32,transaction,,,,,1.0,,,p_200,204
108931,3.55,transaction,,,,,1.0,,,p_200,324
141586,3.37,transaction,,,,,1.0,,,p_200,378
150749,,offer received,,,,,,,o_4,p_200,408
174391,,offer viewed,,,,,,,o_4,p_200,426
201712,,offer received,,1.0,,,,,o_4,p_200,504
218437,,offer viewed,,1.0,,,,,o_4,p_200,510


In [16]:
duration_dict

{'o_1': 168,
 'o_10': 72,
 'o_2': 120,
 'o_3': 168,
 'o_4': 120,
 'o_5': 240,
 'o_6': 168,
 'o_7': 240,
 'o_8': 168,
 'o_9': 96}

In [27]:
72/24

3.0

In [None]:
duration_dict = dict(zip(portfolio['offer_id'], portfolio['duration_hours']))

for person in transcript_sample['person_id'].unique():
    for index, row in transcript_sample.loc[transcript_sample['person_id'] == person].iterrows():
        if transcript_sample.loc[index, 'event'] == 'offer received':  ### MAYBE TO CHANGE TO OFFERS VIEWED, BUT STARTTIME HAS TO STAY HERE
            o_id = transcript_sample.loc[index, 'offer_id']
            o_start = transcript_sample.loc[index, 'time']
            o_duration = duration_dict[o_id]
            o_end = o_start + o_duration - 1
            print(person, o_id, o_start, o_duration, o_end)
            
        transcript_sample[o_id] = np.where((transcript_sample['time'] >= o_start) & (transcript_sample['time'] <= o_end), 1, np.NaN)
        

In [23]:
transcript_sample.loc[transcript_sample['event'] == 'offer received']

Unnamed: 0,event,person_id,time,amount,offer_id
138,offer received,p_200,0,,o_8
53174,offer received,p_1,168,,o_8
53314,offer received,p_200,168,,o_7
60628,offer received,p_10126,168,,o_10
110828,offer received,p_1,336,,o_5
150596,offer received,p_1,408,,o_7
150749,offer received,p_200,408,,o_4
158080,offer received,p_10126,408,,o_8
201570,offer received,p_1,504,,o_6
201712,offer received,p_200,504,,o_4


In [30]:
df_empty = pd.DataFrame(columns=transcript.columns)
df_empty

Unnamed: 0,event,person_id,time,amount,offer_id


entity person:
- offers received vs. offers viewed