# Starbucks Capstone Challenge


In [452]:
# importing libraries
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import os
cd = os.getcwd()
import re
import statsmodels.api as sm

# magic word for producing visualizations in notebook
%matplotlib inline

import plotly.plotly as py #for creating interactive data visualizations
import plotly.graph_objs as go
from plotly import tools
import plotly.tools as tls
# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. The key that used to be hardcoded on this line should be
# treated as leaked and revoked.
# Sign-in is skipped when the variables are unset; the notebook is switched to
# offline plotting a few lines below, so it does not depend on it.
_plotly_user = os.environ.get('PLOTLY_USERNAME')
_plotly_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_user and _plotly_key:
    py.sign_in(_plotly_user, _plotly_key)
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot #to work with data visualization offline
init_notebook_mode(connected=True)
import cufflinks as cf #connects Plotly with pandas to produce the interactive data visualizations
cf.go_offline()

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import metrics

from IPython.display import Image

In [453]:
# read in the json files
portfolio = pd.read_json('portfolio.json', orient='records', lines=True)
profile = pd.read_json('profile.json', orient='records', lines=True)
transcript = pd.read_json('transcript.json', orient='records', lines=True)

---
# 1. Data Wrangling

## 1.1 Data Sets
The data is contained in three files:

* portfolio.json - containing offer ids and meta data about each offer (duration, type, etc.)
* profile.json - demographic data for each customer
* transcript.json - records for transactions, offers received, offers viewed, and offers completed

### 1.1.1 Portfolio

In [454]:
print(portfolio.shape)
portfolio

(10, 6)


Unnamed: 0,channels,difficulty,duration,id,offer_type,reward
0,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10
1,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10
2,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,informational,0
3,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5
4,"[web, email]",20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,5
5,"[web, email, mobile, social]",7,7,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,3
6,"[web, email, mobile, social]",10,10,fafdcd668e3743c1bb461111dcafc2a4,discount,2
7,"[email, mobile, social]",0,3,5a8bc65990b245e5a138643cd4eb9837,informational,0
8,"[web, email, mobile, social]",5,5,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5
9,"[web, email, mobile]",10,7,2906b810c7d4411798c6938adc9daaa5,discount,2


Here is the schema and explanation of each variable in the files:

**portfolio.json**
* id (string) - offer id
* offer_type (string) - type of offer, i.e. BOGO, discount, or informational
* difficulty (int) - minimum required spend to complete an offer
* reward (int) - reward given for completing an offer
* duration (int) - time for offer to be open, in days
* channels (list of strings) - channels through which the offer is sent (web, email, mobile, social)

In [455]:
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
channels      10 non-null object
difficulty    10 non-null int64
duration      10 non-null int64
id            10 non-null object
offer_type    10 non-null object
reward        10 non-null int64
dtypes: int64(3), object(3)
memory usage: 560.0+ bytes


In [456]:
portfolio.describe()

Unnamed: 0,difficulty,duration,reward
count,10.0,10.0,10.0
mean,7.7,6.5,4.2
std,5.831905,2.321398,3.583915
min,0.0,3.0,0.0
25%,5.0,5.0,2.0
50%,8.5,7.0,4.0
75%,10.0,7.0,5.0
max,20.0,10.0,10.0


---
### 1.1.2 Profile

In [458]:
print(profile.shape)
profile.sample(10)

(17000, 5)


Unnamed: 0,age,became_member_on,gender,id,income
706,118,20170329,,c86980b5d65a429795dd59535136a347,
9875,54,20170808,F,849233fe4a5946f39969dbb008b4b8d0,64000.0
13925,60,20171019,M,0a9749f5782748e2933622664a8309af,95000.0
14447,58,20170524,F,8c6b2a81cb294c28b740e9c459ab7767,97000.0
15307,53,20180405,M,49a0367c0c93404e808c77f423759eff,41000.0
7691,52,20170731,M,56be387667484ed98219c34cad38d8c7,91000.0
416,64,20180418,F,cffd46a4ad444357a62f3929cd07a072,112000.0
10329,21,20180713,M,c657123dfe3d4abba20f171cf8bd73a7,64000.0
9510,38,20180128,F,e0fdbcd8ffe04287949d04efeab61be5,49000.0
6221,47,20130929,F,91cec8d70c5a4c5aa04a0ee01653e5ad,99000.0


Here is the schema and explanation of each variable in the files:

**profile.json**
* age (int) - age of the customer 
* became_member_on (int) - date when customer created an app account
* gender (str) - gender of the customer (note some entries contain 'O' for other rather than M or F)
* id (str) - customer id
* income (float) - customer's income

In [459]:
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
age                 17000 non-null int64
became_member_on    17000 non-null int64
gender              14825 non-null object
id                  17000 non-null object
income              14825 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.1+ KB


In [460]:
profile.describe()

Unnamed: 0,age,became_member_on,income
count,17000.0,17000.0,14825.0
mean,62.531412,20167030.0,65404.991568
std,26.73858,11677.5,21598.29941
min,18.0,20130730.0,30000.0
25%,45.0,20160530.0,49000.0
50%,58.0,20170800.0,64000.0
75%,73.0,20171230.0,80000.0
max,118.0,20180730.0,120000.0


In [461]:
profile.isnull().sum()

age                    0
became_member_on       0
gender              2175
id                     0
income              2175
dtype: int64

In [462]:
print(profile.age[profile.gender.isnull()].nunique())
profile.age[profile.gender.isnull()].value_counts()

1


118    2175
Name: age, dtype: int64

In [463]:
print(profile.age[profile.income.isnull()].nunique())
profile.age[profile.income.isnull()].value_counts()

1


118    2175
Name: age, dtype: int64

In [464]:
profile.age.value_counts().head(3)

118    2175
58      408
53      372
Name: age, dtype: int64

---
### 1.1.3 Transcript

In [465]:
print(transcript.shape)
transcript.sample(5)

(306534, 4)


Unnamed: 0,event,person,time,value
97150,transaction,7fdff5bc0f6c4df99a22b893372df102,258,{'amount': 0.72}
235599,transaction,a18461ce01d340eb8704bf7fb692c6ba,546,{'amount': 22.95}
122989,offer received,fc73a0a1bd924d2998c7ee08c6cc0789,336,{'offer id': '3f207df678b143eea3cee63160fa8bed'}
5414,offer received,e3cd0a0e1af5463d9cc4d18e4b7b8b63,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}
117552,offer received,0555eeabf1284cfd85c306caf9360d05,336,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'}


Here is the schema and explanation of each variable in the files:

**transcript.json**
* event (str) - record description (i.e. transaction, offer received, offer viewed, or offer completed)
* person (str) - customer id
* time (int) - time in hours since start of test. The data begins at time t=0
* value (dict) - either an offer id or a transaction amount, depending on the record

In [466]:
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
event     306534 non-null object
person    306534 non-null object
time      306534 non-null int64
value     306534 non-null object
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


In [467]:
transcript.event.value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [468]:
transcript.time.describe()

count    306534.000000
mean        366.382940
std         200.326314
min           0.000000
25%         186.000000
50%         408.000000
75%         528.000000
max         714.000000
Name: time, dtype: float64

---
## 1.2 Data Cleaning

### 1.2.1 Portfolio

In [469]:
portfolio_clean = portfolio.copy()

In [470]:
portfolio_clean.sample(3)

Unnamed: 0,channels,difficulty,duration,id,offer_type,reward
7,"[email, mobile, social]",0,3,5a8bc65990b245e5a138643cd4eb9837,informational,0
5,"[web, email, mobile, social]",7,7,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,3
3,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5


In [471]:
portfolio_clean.channels.sample(5)

5    [web, email, mobile, social]
7         [email, mobile, social]
4                    [web, email]
1    [web, email, mobile, social]
9            [web, email, mobile]
Name: channels, dtype: object

In [472]:
#splitting the channels

def col_split(df, column):
    """One-hot encode a column of collections, modifying ``df`` in place.

    Every distinct element found inside the values of ``df[column]`` becomes
    a new 0/1 indicator column on ``df``; the source column is then dropped.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column whose values are iterables (e.g. lists).

    Returns:
        list: the distinct elements, in order of first appearance.
    """
    labels = []
    for members in df[column]:
        for label in members:
            if label in labels:
                continue
            labels.append(label)

    # The source column has to stay until every indicator has been built from it.
    for label in labels:
        df[label] = df[column].apply(lambda members: int(label in members))

    df.drop(columns=[column], inplace=True)
    return labels

col_split(portfolio_clean ,'channels')

['email', 'mobile', 'social', 'web']

In [473]:
portfolio_clean.sample(3)

Unnamed: 0,difficulty,duration,id,offer_type,reward,email,mobile,social,web
5,7,7,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,3,1,1,1,1
4,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,5,1,0,0,1
0,10,7,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,1,1,1,0


In [474]:
portfolio_clean.dtypes

difficulty     int64
duration       int64
id            object
offer_type    object
reward         int64
email          int64
mobile         int64
social         int64
web            int64
dtype: object

In [475]:
# The portfolio schema gives duration in days; the transcript measures time in
# hours, so express duration in hours as well.
portfolio_clean['duration'] = portfolio_clean['duration'].mul(24)

In [476]:
portfolio_clean.rename(columns={'difficulty': 'difficulty($)', 'duration': 'duration(hours)', 'id': 'offer_id'}, inplace=True)

In [477]:
portfolio_clean.head(3)

Unnamed: 0,difficulty($),duration(hours),offer_id,offer_type,reward,email,mobile,social,web
0,10,168,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,1,1,1,0
1,10,120,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,1,1,1,1
2,0,96,3f207df678b143eea3cee63160fa8bed,informational,0,1,1,0,1


In [478]:
offers_index_1 = portfolio_clean.set_index('offer_id')
offers_index_1.sample(3)

Unnamed: 0_level_0,difficulty($),duration(hours),offer_type,reward,email,mobile,social,web
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5a8bc65990b245e5a138643cd4eb9837,0,72,informational,0,1,1,1,0
3f207df678b143eea3cee63160fa8bed,0,96,informational,0,1,1,0,1
fafdcd668e3743c1bb461111dcafc2a4,10,240,discount,2,1,1,1,1


---
### 1.2.2 Profile

In [479]:
profile_clean = profile.copy()

In [480]:
profile_clean.head(3)

Unnamed: 0,age,became_member_on,gender,id,income
0,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,
1,55,20170715,F,0610b486422d4921ae7d2bf64640c50b,112000.0
2,118,20180712,,38fe809add3b4fcf9315a9694bb96ff5,


In [481]:
profile_clean.dtypes

age                   int64
became_member_on      int64
gender               object
id                   object
income              float64
dtype: object

In [482]:
# became_member_on holds YYYYMMDD integers. Parse the whole column in one
# vectorized call instead of invoking pd.to_datetime once per row.
profile_clean['membership_start'] = pd.to_datetime(
    profile_clean['became_member_on'].astype(str), format='%Y%m%d'
)
# Month of sign-up as a pandas Period.
profile_clean['membership_period'] = profile_clean['membership_start'].dt.to_period('M')
# The raw integer column is now redundant.
profile_clean = profile_clean.drop(columns=['became_member_on'])

In [483]:
profile_clean.head(3)

Unnamed: 0,age,gender,id,income,membership_start,membership_period
0,118,,68be06ca386d4c31939f3a4f0e3dd783,,2017-02-12,2017-02
1,55,F,0610b486422d4921ae7d2bf64640c50b,112000.0,2017-07-15,2017-07
2,118,,38fe809add3b4fcf9315a9694bb96ff5,,2018-07-12,2018-07


In [484]:
profile_clean.rename(columns={'id': 'customer_id'}, inplace=True)

In [485]:
# Age 118 is the placeholder value: the exploration above shows that the 2175
# profiles with that age are exactly the ones missing gender and income.
profile_clean = profile_clean.loc[profile_clean['age'].ne(118)]
profile_clean.sample(5)

Unnamed: 0,age,gender,customer_id,income,membership_start,membership_period
4249,68,F,4eeb7adcea7c4be7802014985754090c,106000.0,2017-08-19,2017-08
5347,71,M,3889324c08c44aaaa40cbd2fe19964db,38000.0,2015-05-20,2015-05
10767,74,M,34962645e76943bab69e9aa8c335e574,36000.0,2017-12-12,2017-12
3407,76,M,7a9d39b1142e4f1898fdf2358958197a,58000.0,2018-07-25,2018-07
1024,69,F,ac1cc98adbb3419db722ecbdb1583426,84000.0,2016-10-26,2016-10


In [486]:
profile_clean.isnull().sum()

age                  0
gender               0
customer_id          0
income               0
membership_start     0
membership_period    0
dtype: int64

---
### 1.2.3 Transcript

In [487]:
transcript_clean = transcript.copy()

In [488]:
transcript_clean.sample(5)

Unnamed: 0,event,person,time,value
129483,offer viewed,95d2b968664943d389bc24a6186d7495,342,{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'}
254488,offer received,8c7a45fe318b45b4a9307af34d1bd022,576,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
50055,offer completed,ecffdade856446168b87a147bc8d2342,144,{'offer_id': '0b1e1539f2cc45b7b9fa7c272da2e1d7...
250471,offer received,930305f046fa4f218d2539fb99baefb9,576,{'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'}
296010,transaction,5495db54251e492d989822a1444ac7d7,666,{'amount': 2.76}


In [489]:
transcript_clean.dtypes

event     object
person    object
time       int64
value     object
dtype: object

In [490]:
transcript_clean.value.sample(10)

192189                      {'amount': 3.5300000000000002}
120054    {'offer id': '2906b810c7d4411798c6938adc9daaa5'}
305876                                    {'amount': 4.86}
35236     {'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}
168699    {'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'}
140150                                    {'amount': 4.64}
76846     {'offer id': '2906b810c7d4411798c6938adc9daaa5'}
14804     {'offer id': '2906b810c7d4411798c6938adc9daaa5'}
4000      {'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'}
85762                                     {'amount': 4.43}
Name: value, dtype: object

In [491]:
def df_values(df=transcript_clean):
    """Flatten the dict-valued ``value`` column of ``df`` in place.

    Adds ``record`` (the first key of each dict) and ``record_value`` (the
    first value of each dict), then drops ``value``. Only the first entry of
    each dict is kept; any further entries are discarded.

    Args:
        df: DataFrame with a ``value`` column of dicts. The default is the
            ``transcript_clean`` object that exists when this function is
            defined.

    Returns:
        None; ``df`` is modified in place.
    """
    def first_of(items):
        # Shared lookup for keys and values: materialise, then take element 0.
        return list(items)[0]

    payloads = df['value']
    # NOTE(review): the sampled rows show both 'offer id' and 'offer_id' as
    # keys, so `record` will carry both spellings - confirm downstream handling.
    df['record'] = payloads.apply(lambda entry: first_of(entry.keys()))
    df['record_value'] = payloads.apply(lambda entry: first_of(entry.values()))

    df.drop(columns=['value'], inplace=True)

df_values()

In [506]:
transcript_clean.rename(columns={'person': 'customer_id', 'time': 'time(hours)'}, inplace=True)
transcript_clean.sample(5)

Unnamed: 0,event,customer_id,time(hours),record,record_value
142810,transaction,b8c5cacabf4d42ea8fd403bb3a7b7706,378,amount,2.86
291105,transaction,2f386a964e3249448cc16cf9dfcd6f65,648,amount,0.51
201737,offer received,18174519037241e9b1ddcb627ca4dc41,504,offer id,2906b810c7d4411798c6938adc9daaa5
64008,offer received,e4cb5295474f42b98b079e9997103ee4,168,offer id,9b98b8c7a33c4b65b9aebfe6a799e6d9
28380,transaction,df72761941ab427b9c8878508ad5814e,42,amount,1.97


In [509]:
# Split the transcript into offer events and purchases.
# .copy() makes each result an independent frame, so later column assignments
# on them do not write into a slice of transcript_clean (which would raise
# pandas' SettingWithCopyWarning).
offers = transcript_clean[
    transcript_clean['event'].isin(['offer received', 'offer viewed', 'offer completed'])
].copy()
transactions = transcript_clean[transcript_clean['event'] == 'transaction'].copy()

In [510]:
offers.sample(5)

Unnamed: 0,event,customer_id,time(hours),record,record_value
272029,offer viewed,21f58f38b6ac4f67acd2959072adaa7c,600,offer id,5a8bc65990b245e5a138643cd4eb9837
275734,offer viewed,2f811436e81b461e882079479c68e72d,606,offer id,5a8bc65990b245e5a138643cd4eb9837
116925,offer received,647ef228030343db8598e2198e5a451f,336,offer id,2298d6c36e964ae4a3e7e9706d1fb8c2
294519,offer viewed,79b6efb03873466e93714c376b1f1d1a,660,offer id,f19421c1d4aa40978ebb69ca19b0e20d
256371,offer received,e3d0e74404154aebbce117755841ba91,576,offer id,5a8bc65990b245e5a138643cd4eb9837


In [512]:
offers_index_2 = offers.set_index('record_value')
offers_index_2.sample(3)

Unnamed: 0_level_0,event,customer_id,time(hours),record
record_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ae264e3637204a6fb9bb56bc8210ddfd,offer received,68f47ee760824c4295daa45e0f5a29cb,576,offer id
f19421c1d4aa40978ebb69ca19b0e20d,offer received,5120299298b74644b086860f48787d6d,0,offer id
5a8bc65990b245e5a138643cd4eb9837,offer received,2329b14ec2ea4f979f82e66b057b16d6,576,offer id


In [513]:
# Transaction amounts were extracted from dicts into an object column; convert
# them to numbers. assign() builds a new frame instead of writing into the
# filtered slice, which avoids pandas' SettingWithCopyWarning.
transactions = transactions.assign(
    record_value=pd.to_numeric(transactions['record_value'])
)
transactions.dtypes

event            object
customer_id      object
time(hours)       int64
record           object
record_value    float64
dtype: object

In [514]:
transactions.sample(5)

Unnamed: 0,event,customer_id,time(hours),record,record_value
31727,transaction,5a8632617ff644e995d3044c07f749b6,54,amount,7.1
107584,transaction,26fd17f8610343a9863c5fe6c287f4e5,312,amount,3.51
133425,transaction,0a03ef333feb47db93bf30d6ef41b239,354,amount,3.5
41631,transaction,896a5bab4952449ba3c972fd71aa9f30,96,amount,0.81
168872,transaction,17fc3f18cb5e4b1cbebe921a69c95c05,414,amount,10.78


---
# 2. Data Exploration