# Starbucks Capstone Challenge 
## Prepare data 

In [1]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import minimize
import seaborn as sns

sns.set_theme(style='darkgrid')


def _load_records(path):
    """Read a line-delimited JSON file of records into a DataFrame."""
    return pd.read_json(path, orient='records', lines=True)


# Load the three raw datasets used throughout the notebook
portfolio = _load_records('data/portfolio.json')
profile = _load_records('data/profile.json')
transcript = _load_records('data/transcript.json')

### 1. Data processing

In [2]:
# Rename the portfolio key to 'offer_ids' so it matches the column derived from the transcript
portfolio_mod = portfolio.rename(columns=dict(id='offer_ids'))

In [3]:
# Extract the offer id carried by every non-transaction event and store it in a
# new 'offer_ids' column. The ids are kept exactly as found (no type conversion).
def _first_value(payload):
    """Return the first value stored in an event's 'value' dict."""
    # NOTE(review): assumes the offer id is always the first entry of the
    # payload for non-transaction events -- confirm against the raw data.
    return list(payload.values())[0]


# Boolean mask of the rows that describe an offer event (anything but a transaction)
is_offer_event = transcript['event'] != 'transaction'
indx = list(transcript.index[is_offer_event])

# Label-based, vectorised lookup: avoids feeding index labels to .iloc and
# avoids building a Series per row inside a Python loop.
offer_id_series = transcript.loc[is_offer_event, 'value'].map(_first_value)
offer_ids = offer_id_series.to_dict()

# Same one-column frame as before, indexed by the original transcript labels
offer_id_df = offer_id_series.to_frame(name='offer_ids')

# Column assignment aligns on index labels, so transaction rows get NaN
transcript_mod = transcript.assign(offer_ids=offer_id_series)

In [4]:
# Remove people with ages > 99 (rows whose age is missing fail the test and are removed too)
age_ok = profile['age'].le(99)
profile = profile[age_ok]

In [5]:
# Keep only people with an income below 1,000,000. Rows with a missing income
# fail the comparison (NaN < x is False) and are therefore dropped as well.
# The explicit .copy() makes `profile` an independent frame, so the feature
# columns assigned to it in later cells do not raise SettingWithCopyWarning.
profile = profile[profile['income'] < 1000000.0].copy()

In [6]:
# Attach each offer's type to the transcript rows; the left join keeps every event
offer_types = portfolio_mod[['offer_ids', 'offer_type']]
transcript_portfolio = transcript_mod.merge(offer_types, how='left', on='offer_ids')

# Keep only the events that belong to BOGO offers
is_bogo = transcript_portfolio['offer_type'] == 'bogo'
bogo = transcript_portfolio[is_bogo]

In [7]:
# Tally the BOGO events of each person: one row per person, one column per event type
bogos_per_person = (
    bogo
    .groupby('person')['event']
    .value_counts()
    .unstack()
)

### 2. Feature engineering

In [8]:
def age_group(age):
    """Binarise an age: return 1 (old) when it is above 35, otherwise 0 (young)."""
    return 1 if age > 35 else 0
# Encode every profile's age as its binary age group
profile['age_group'] = profile['age'].apply(age_group)

In [9]:
def income_group(income):
    """Binarise an income: return 1 (rich) when it is above 60000, otherwise 0 (poor)."""
    return 1 if income > 60000 else 0
# Encode every profile's income as its binary income group
profile['income_group'] = profile['income'].apply(income_group)

In [10]:
# Label gender based on criteria other=2, female=1, male=0
def gender_cat(gender):
    """Encode a gender label as a numeric category.

    Parameters
    ----------
    gender : str or missing value
        'O' for other, 'F' for female; any other non-missing value is
        treated as male.

    Returns
    -------
    int or float
        2 for 'O', 1 for 'F', 0 for any other non-missing value, and NaN
        when the gender is missing (None / NaN).
    """
    # A missing gender must stay missing: encoding it as 0 would silently
    # count it as male and make a later dropna on this column a no-op.
    if pd.isna(gender):
        return float('nan')
    if gender == 'O':
        return 2
    if gender == 'F':
        return 1
    return 0
# Encode every profile's gender as its numeric category
profile['gender_cat'] = profile['gender'].apply(gender_cat)

In [11]:
# Join the encoded demographic features onto the per-person BOGO event counts
demographics = profile[['id', 'gender_cat', 'age_group', 'income_group']]
bogos_per_person = (
    bogos_per_person
    .merge(demographics, how='left', left_index=True, right_on='id')
    .set_index('id')
)

In [12]:
# Flag whether each person has at least one 'offer received' / 'offer viewed' event
bogos_per_person['received'] = bogos_per_person['offer received'].gt(0)
bogos_per_person['viewed'] = bogos_per_person['offer viewed'].gt(0)

In [13]:
# Discard every person whose age, gender or income feature is missing
bogos_per_person = bogos_per_person.dropna(
    subset=['age_group', 'gender_cat', 'income_group']
)

In [14]:
# Keep the three demographic features plus the 'viewed' flag for the classifier
classifier_columns = ['gender_cat', 'age_group', 'income_group', 'viewed']
bogo_data = bogos_per_person[classifier_columns]
bogo_data.head()

Unnamed: 0_level_0,gender_cat,age_group,income_group,viewed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,0.0,0.0,1.0,True
0011e0d4e6b944f998e987f904e8c1e5,2.0,1.0,0.0,True
0020c2b971eb4e9188eac86d93036a77,1.0,1.0,1.0,True
0020ccbbb6d84e358d3414a3ff76cffd,1.0,0.0,0.0,True
004b041fbfe44859945daa2c7f79ee64,1.0,1.0,1.0,True


In [15]:
# Persist the classifier-ready table to a pickle file in the working directory
bogo_data.to_pickle(path="bogo_data.pkl")