In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# Notebook-wide pandas settings: turn off chained-assignment warnings and
# lift the cap on how many columns are shown when a frame is displayed.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

Import data.

In [None]:
# The three CSV files sit in the same input folder; the first column of each
# file is used as the row index.
DATA_DIR = '/kaggle/input/starbucks-customer-data'

profile = pd.read_csv(f'{DATA_DIR}/profile.csv', index_col=0)
portfolio = pd.read_csv(f'{DATA_DIR}/portfolio.csv', index_col=0)
transcript = pd.read_csv(f'{DATA_DIR}/transcript.csv', index_col=0)


## 0. Data Filtering

In [None]:
# Preview the first five customer records.
profile.head(n=5)

It looks like there are some null values in 'profile'.

In [None]:
# Number of missing values in each column.
profile.isnull().sum()

There are also some people with suspicious ages in 'profile'.

In [None]:
# Histogram of the 'age' column.
ax = sns.histplot(x='age', data=profile)
plt.show()

In [None]:
# Summary statistics (count, mean, quartiles, min/max) of the ages.
profile.loc[:, 'age'].describe()

The suspicious age is 118.  
The people with this suspicious age also have null values for 'gender' and 'income'.  
It is natural to remove them from the original 'profile' before any further work.  
The cleaned 'profile' looks like this.  

In [None]:
# Keep only the rows in which every column is populated.
profile = profile.dropna(axis='index', how='any')
profile.head(n=5)

With the cleaned profile, let's work through the given tasks.

## 1.1. What is gender distribution?

We can find out the gender distribution from the column 'gender' in the 'profile'.

In [None]:
# Share of each gender among the cleaned profiles, computed once and reused
# for both the bar positions and the bar heights.
gender_share = profile['gender'].value_counts(normalize=True)

ax = sns.barplot(x=gender_share.index, y=gender_share)
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Proportion')
plt.show()

According to the variable description, some entries contain 'O' (other) rather than 'M' or 'F'.  
The gender distribution is shown above: the customers consist of about 57% male, 40% female, and 3% other.

## 1.2. What is the income distribution?

We can plot the histogram by using 'income' column in the 'profile'.  
Before investigating the income distribution, create a 'log_income' column for comparison.

In [None]:
# Base-10 logarithm of income, stored next to the raw value for comparison.
profile['log_income'] = np.log10(profile['income'])

In [None]:
# Side-by-side histograms (with KDE overlay) of raw and log-transformed income.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

panels = [
    ('income', 'Income Distribution'),
    ('log_income', 'Log-Income Distribution'),
]
for ax, (col, title) in zip(axes, panels):
    sns.histplot(data=profile, x=col, kde=True, ax=ax)
    ax.set_title(title)

plt.show()

In [None]:
# Report the central tendency of income: mean (rounded to 3 decimals) and median.
mean_income = round(profile['income'].mean(), 3)
median_income = profile['income'].median()

print(f'Average income is: {mean_income}')
print(f'Median income is: {median_income}')

## 1.3. When do people typically become a member?

We can answer this question by using 'became_member_on' column in the 'profile'.

In [None]:
# Inspect the dtype of every column before converting 'became_member_on'.
profile.dtypes

Make the type of 'became_member_on' column as datetime.

In [None]:
# Parse the YYYYMMDD values into real timestamps.
# Plain column assignment replaces the column (and its dtype) outright;
# `profile.loc[:, col] = ...` tries to write into the existing column in place
# on pandas >= 2.0, which does not reliably yield a datetime64 column.
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')
profile.head()

Create new columns 'year' and 'month' from the 'became_member_on'.

In [None]:
# Build the datetime index once and read both calendar parts from it.
member_dates = pd.DatetimeIndex(profile['became_member_on'])
profile['year'] = member_dates.year
profile['month'] = member_dates.month

In [None]:
# Line plots of how many members joined in each year and in each calendar month.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))

for ax, col in zip(axes, ['year', 'month']):
    # Take x and y from the SAME counted series. value_counts() is ordered by
    # frequency while unique() is ordered by first appearance, so pairing the
    # two would plot each count against the wrong year/month.
    new_members = profile[col].value_counts().sort_index()
    sns.lineplot(ax=ax, x=new_members.index, y=new_members.to_numpy(), label='New members')
    ax.set_title(f'The number of new members by {col}')
    ax.set_xlabel(f'{col}')
    ax.set_ylabel('Number of new members')
plt.show()

The number of new members increased sharply after 2016.

## 1.4. What is the average purchase distribution?

To answer the question we have to merge the 'profile' and the 'transcript'.

In [None]:
# Attach every transcript event to its customer; the left join keeps customers
# that have no matching event.
profile_transcript = pd.merge(
    left=profile,
    right=transcript,
    how='left',
    left_on='id',
    right_on='person',
)
profile_transcript.head(n=5)

The purchase information is in the 'transcript -> value whose event == transaction -> amount'

In [None]:
# Split the merged events: rows whose event is 'transaction' vs. everything else.
is_transaction = profile_transcript['event'] == 'transaction'

transaction = profile_transcript.loc[is_transaction]
offer = profile_transcript.loc[~is_transaction]

Extract the amount paid using regular expression module.

In [None]:
import re

# Decimal number such as "12.34". The raw string avoids the invalid-escape
# warning for "\d", and the escaped dot matches a literal decimal point
# instead of any character.
pattern = re.compile(r'\d+\.\d+')

# Build a new frame rather than writing into the filtered slice: take the
# first number found in each 'value' string, expose it as 'amount', and store
# it as a float. Rows without a match end up as NaN.
transaction = (
    transaction
    .assign(value=transaction['value'].apply(pattern.findall).str.get(0))
    .rename(columns={'value': 'amount'})
    .astype({'amount': 'float'})
)
transaction.head()

Create a pivot table that contains mean purchase amount by each member.

In [None]:
# One row per customer id holding that customer's mean transaction amount.
mean_amount_by_id = (
    transaction
    .pivot_table(values='amount', index='id', aggfunc='mean')
    .reset_index()
)
mean_amount_by_id

To add each member's demographic information, merge the pivot table with 'profile'.

In [None]:
# Bring the profile columns in next to each customer's mean amount.
mean_amount_by_id = pd.merge(left=mean_amount_by_id, right=profile, how='left', on='id')
mean_amount_by_id.head(n=5)

Create a 'generation' column for easier interpretation.

In [None]:
def to_generation(age):
    """Map an age to a generation label.

    Brackets are half-open on the upper bound: below 20 is 'teenager',
    20-39 'youth', 40-59 'middle', 60-79 'older'. Any age that is not below
    80 (including a NaN, for which every comparison is False) is 'senior'.
    """
    brackets = (
        (20, 'teenager'),
        (40, 'youth'),
        (60, 'middle'),
        (80, 'older'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'senior'

# Label every customer with the generation that matches their age.
mean_amount_by_id['generation'] = mean_amount_by_id['age'].map(to_generation)
mean_amount_by_id.head(n=5)

In [None]:
# Mean of the per-customer mean amounts for each gender.
# Select 'amount' before aggregating: averaging the whole frame also tries to
# average the text columns, which raises a TypeError on pandas >= 2.0.
gby_gender = mean_amount_by_id.groupby(by='gender')['amount'].mean()

sns.barplot(x=gby_gender.index, y=gby_gender)
plt.title('Amount spent by gender')
plt.xlabel('Gender')
plt.ylabel('Amount spent')
plt.show()

On average, female customers spend more than $5 more than male customers.

In [None]:
# Mean of the per-customer mean amounts for each generation.
# Select 'amount' before aggregating: averaging the whole frame also tries to
# average the text columns, which raises a TypeError on pandas >= 2.0.
gby_generation = mean_amount_by_id.groupby(by='generation')['amount'].mean()

# `order` lays the bars out from youngest to oldest instead of alphabetically.
sns.barplot(x=gby_generation.index, y=gby_generation, order=['teenager', 'youth', 'middle', 'older', 'senior'])
plt.title('Amount spent by generation')
plt.xlabel('Generation')
plt.ylabel('Amount spent')
plt.show()

Customers who are middle-aged or older spend more money than teenage and young customers.

## 2.1. Do people react to different promotions differently?

To investigate the effect of promotions we have to merge the 'profile_transcript' with the 'portfolio' by using 'value' on the 'profile_transcript' and 'id' on the 'portfolio'.  
Since 'value' contains 'offer id', we have to extract that information.

In [None]:
ID_LENGTH = 32
# Character position at which the offer id starts inside the 'value' string.
# NOTE(review): assumes every 'offer completed' value has the id at this fixed
# offset; confirm against the raw data.
ID_START = 14

completed_mask = profile_transcript['event'] == 'offer completed'
offer = (
    profile_transcript
    .loc[completed_mask]
    .assign(offer_id=lambda df: df['value'].str.slice(start=ID_START, stop=ID_START + ID_LENGTH))
    .drop(columns='value')
)
offer.head(n=5)

In [None]:
# Stack transactions and completed offers, then restore the original row order.
combined = pd.concat([transaction, offer])
cleaned_transcript = combined.sort_index()
cleaned_transcript.head(n=5)

'cleaned_transcript' does not have a populated 'generation' column yet.  
Apply 'to_generation' to 'age' to create it.

In [None]:
# Derive the generation label for every event row from the customer's age.
cleaned_transcript['generation'] = cleaned_transcript['age'].map(to_generation)

In [None]:
# Attach the offer details from 'portfolio' to each row via its offer id;
# rows without an offer id keep NaN in the portfolio columns.
history = pd.merge(
    left=cleaned_transcript,
    right=portfolio,
    how='left',
    left_on='offer_id',
    right_on='id',
)

In [None]:
# Both merged frames had an 'id' column, so the merge produced 'id_x' and
# 'id_y'. Drop 'id_y' and give 'id_x' its original name back.
history = history.drop(columns='id_y').rename(columns={'id_x': 'id'})
history.head(n=5)

In [None]:
# Carry each amount forward onto the following rows that have no amount.
# Filling per customer keeps one customer's amount from leaking into the next
# customer's rows. ffill() replaces fillna(method='ffill'), which is
# deprecated since pandas 2.1.
history['amount'] = history.groupby('id')['amount'].ffill()

# When several rows share the same customer and time, keep only the last one.
history = history.drop_duplicates(subset=['id', 'time'], keep='last')
history.head()

'offer completed' in 'event' indicates that the customer has engaged with the promotion.

In [None]:
# Count completed offers per promotion type.
completed_rows = history['event'] == 'offer completed'
offer_completed = history.loc[completed_rows]

ax = sns.countplot(x='offer_type', data=offer_completed)
ax.set_title('Promotion engagement')
ax.set_xlabel('Promotion type')
plt.show()

Interestingly, customers engaged more with the discount promotion than with the bogo promotion.  
The 'difficulty' (a.k.a. the minimum required spend) might affect promotion engagement.

In [None]:
# Mean purchase amount of completed offers for each promotion type.
# Select 'amount' before aggregating: averaging the whole frame also tries to
# average the text columns, which raises a TypeError on pandas >= 2.0.
gby_offer_type = offer_completed.groupby(by='offer_type')['amount'].mean()

sns.barplot(x=gby_offer_type.index, y=gby_offer_type)
plt.title('Mean purchase amount: bogo vs. discount')
plt.ylabel('Purchase amount')
plt.show()

Despite slightly lower engagement, bogo shows a slightly higher mean purchase amount than discount.  
This might be an effect of the 'difficulty'.

## 2.2. Does the reward of the promotion make people react differently?

In [None]:
# Completed offers split by promotion type, then counted per reward level.
bogo = offer_completed.loc[offer_completed['offer_type'] == 'bogo']
discount = offer_completed.loc[offer_completed['offer_type'] == 'discount']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for ax, (panel_title, subset) in zip(axes, [('bogo', bogo), ('discount', discount)]):
    sns.countplot(ax=ax, data=subset, x='reward')
    ax.set_title(panel_title)
fig.suptitle('Promotion engagement by promotion type and reward')
plt.show()

The lower the reward, the more engagement, regardless of the promotion type.  
In the same manner, the 'difficulty' might affect the result.

## 2.3. Does it make sense to offer certain rewards?

In [None]:
# Compare the mean amount of rows recorded as plain transactions with the
# mean amount of rows recorded as completed offers.
history_non_offer = history.loc[history['event'] == 'transaction']
history_offer = history.loc[history['event'] == 'offer completed']

mean_amounts = {
    'non_offer': history_non_offer['amount'].mean(),
    'offer': history_offer['amount'].mean(),
}
ax = sns.barplot(x=list(mean_amounts.keys()), y=list(mean_amounts.values()))
ax.set_title('Mean purchase amount: non_offer vs. offer')
ax.set_ylabel('Purchase amount')
plt.show()

There is a difference of about $7.5 in mean purchase amount between 'non_offer' and 'offer'.  
Although engaging with a promotion requires a minimum purchase amount (a.k.a. 'difficulty' in our dataset), the difference is pretty significant.

## 3.1. How many clusters should Starbucks use? : To be updated.