# Starbucks Capstone Challenge 
## Prepare data 

In [1]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
    
sns.set_theme(style='darkgrid')
# read in the json files; all three share the same reader options
json_lines = dict(orient='records', lines=True)
portfolio = pd.read_json('../data/raw/portfolio.json', **json_lines)
profile = pd.read_json('../data/raw/profile.json', **json_lines)
transcript = pd.read_json('../data/raw/transcript.json', **json_lines)

### 1. Data processing

In [2]:
# Extract the offer id of every non-transaction event into a new 'offer_ids' column.
# Transaction rows get no entry here, so they end up as NaN after the concat below.
is_offer_event = transcript['event'] != 'transaction'

# The offer id is taken to be the first value stored in each event's 'value' dict.
# NOTE(review): this relies on the id being the first entry of the dict - confirm
# against the raw transcript schema.
# Mapping over the filtered column is label-based and avoids building one Series
# per row, which the previous .iloc loop did.
offer_id_df = (
    transcript.loc[is_offer_event, 'value']
    .map(lambda value: next(iter(value.values())))
    .to_frame(name='offer_ids')
)

# Attach the offer ids to the transcript; concat aligns the rows on their index labels
transcript_mod = pd.concat([transcript, offer_id_df], axis=1, ignore_index=False)

In [3]:
# give the portfolio key the same name as the 'offer_ids' column added to the transcript
portfolio = portfolio.rename({'id': 'offer_ids'}, axis='columns')

In [4]:
# look up the offer type of every transcript row through its offer id
# (left join: rows without a matching offer id are kept, with a missing offer_type)
offer_types = portfolio[['offer_ids', 'offer_type']]
transcript_portfolio = transcript_mod.merge(offer_types, how='left', on='offer_ids')

# keep only the transcript rows that belong to BOGO offers
is_bogo = transcript_portfolio['offer_type'] == 'bogo'
bogo = transcript_portfolio[is_bogo]

In [5]:
# count how often each event type occurs for every person,
# then pivot the event types into columns (one row per person)
events_per_person = bogo.groupby('person')['event'].value_counts()
bogos_per_person = events_per_person.unstack()

In [6]:
# give the profile key the same name as the 'person' column of the transcript
profile = profile.rename({'id': 'person'}, axis='columns')

In [7]:
# Remove people with ages > 99
# (rows with a missing age fail the comparison and are removed as well)
is_valid_age = profile['age'].le(99)
profile = profile[is_valid_age]

In [8]:
# Keep only people whose income is below 1000000
# (rows with a missing income fail the comparison and are removed as well)
has_valid_income = profile['income'].lt(1000000.0)
profile = profile[has_valid_income]

### 2. Feature engineering

In [9]:
# Bucket age into the two string labels 'old' / 'young'
def age_group(age, threshold=35):
    """Return the age bucket label for a single age value.

    Parameters
    ----------
    age : int or float
        Age to classify.
    threshold : int or float, default 35
        Cut-off; ages strictly greater than this are labelled 'old'.

    Returns
    -------
    str
        'old' if ``age > threshold``, otherwise 'young'. Any value for which
        the comparison is false (including NaN) falls into 'young'.
    """
    return 'old' if age > threshold else 'young'
# assign() returns a new frame, so the column is not written onto the
# filtered slice of profile (which triggers SettingWithCopyWarning)
profile = profile.assign(age_group=profile['age'].apply(age_group))

In [10]:
# Bucket income into the two string labels 'high income' / 'low income'
def income_group(income, threshold=60000):
    """Return the income bucket label for a single income value.

    Parameters
    ----------
    income : int or float
        Income to classify.
    threshold : int or float, default 60000
        Cut-off; incomes strictly greater than this are labelled 'high income'.

    Returns
    -------
    str
        'high income' if ``income > threshold``, otherwise 'low income'. Any
        value for which the comparison is false (including NaN) falls into
        'low income'.
    """
    return 'high income' if income > threshold else 'low income'
# assign() returns a new frame, so the column is not written onto the
# filtered slice of profile (which triggers SettingWithCopyWarning)
profile = profile.assign(income_group=profile['income'].apply(income_group))

In [11]:
# keep the person id plus the three categorical features needed for training the model
model_cols = ['person', 'income_group', 'age_group', 'gender']
profile_subset = profile[model_cols]

In [12]:
# reset to a fresh 0..n-1 index to ease concatenation with the one-hot encoded data;
# drop=True discards the old index instead of adding it as an 'index' column
profile_subset = profile_subset.reset_index(drop=True)

In [13]:
# One hot encode income_group, age_group, gender

# columns to encode, listed once so fit, transform and drop always agree
categorical_cols = ['income_group', 'age_group', 'gender']
categorical_features = profile_subset[categorical_cols]

# instantiate onehotencoder
enc = OneHotEncoder(handle_unknown='ignore')

# fit onehotencoder on data
enc.fit(categorical_features)

# one hot encode features (dense array, one column per category)
transformed = enc.transform(categorical_features).toarray()

# Create a DataFrame of the encoded columns. It reuses profile_subset's index so
# the concat below lines the rows up correctly whatever index profile_subset has.
ohe_df = pd.DataFrame(
    transformed,
    columns=enc.get_feature_names_out(),
    index=profile_subset.index,
)

# concat with original data, then drop the raw categorical columns
profile_subset_ohe = pd.concat([profile_subset, ohe_df], axis=1).drop(columns=categorical_cols)

In [14]:
# attach each person's one-hot encoded demographics to their BOGO event counts;
# the left join keeps every person with BOGO events, matched or not
bogos_with_profile = bogos_per_person.merge(
    profile_subset_ohe,
    how='left',
    left_index=True,
    right_on='person',
)
bogos_per_person_demo = bogos_with_profile.set_index('person')

In [15]:
# flag whether each person has at least one 'offer received' / 'offer viewed' event
# (a missing count compares as False)
has_received = bogos_per_person_demo['offer received'].gt(0)
has_viewed = bogos_per_person_demo['offer viewed'].gt(0)
bogos_per_person_demo['received'] = has_received
bogos_per_person_demo['viewed'] = has_viewed

In [17]:
# drop people with missing demographic data
# (note: the one-hot columns of a person all come from the same merged profile row,
# so they are either all filled or all missing and checking one column is enough)
bogos_per_person_demo = bogos_per_person_demo.dropna(subset=['age_group_old'])

In [18]:
# keep only the columns needed for the classifier:
# the raw event counts and the 'received' flag are dropped
unused_cols = ['offer received', 'offer viewed', 'offer completed', 'received']
bogo_data = bogos_per_person_demo.drop(columns=unused_cols)
bogo_data.head()

Unnamed: 0_level_0,income_group_high income,income_group_low income,age_group_old,age_group_young,gender_F,gender_M,gender_O,viewed
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0009655768c64bdeb2e877511632db8f,1.0,0.0,0.0,1.0,0.0,1.0,0.0,True
0011e0d4e6b944f998e987f904e8c1e5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,True
0020c2b971eb4e9188eac86d93036a77,1.0,0.0,1.0,0.0,1.0,0.0,0.0,True
0020ccbbb6d84e358d3414a3ff76cffd,0.0,1.0,0.0,1.0,1.0,0.0,0.0,True
004b041fbfe44859945daa2c7f79ee64,1.0,0.0,1.0,0.0,1.0,0.0,0.0,True


In [19]:
# save the prepared dataframe as a pickle file
processed_path = "../data/processed/bogo_data.pkl"
bogo_data.to_pickle(processed_path)