In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](http://24.media.tumblr.com/0b9f1228307c9b306a18b66f8a912150/tumblr_mhdi8uTpYY1rjv15co1_500.gif)

# Starbucks 
*Starbucks Corporation is an American multinational chain of coffeehouses and roastery reserves headquartered in Seattle, Washington. As the world's largest coffeehouse chain, Starbucks is seen to be the main representation of the United States' second wave of coffee culture.*
Source: Wikipedia

# Starbucks Data 
    1. Profile CSV contains 17000 records and 6 columns, where each record describes one person: their age, income, id, the date they became a member, their gender, and an unnamed index column.

    2. Portfolio data has 10 rows and 7 columns and contains the promotional offers that are possible to receive, along with basic information about each one, including the promotional type, duration of the promotion, reward, and how the promotion was distributed to customers.
    
    3. Transcript data has 306534 rows and 5 columns and contains the different steps of the promotional offers that a customer received, the channels through which offers are possible, and the minimum spend required to complete an offer.
    4. For each of the csv file, these steps are followed
        *  Reading the csv files, removing unnecessary columns, Data wrangling and Binning
        * Feature Engineering
        * Univariate Analysis
        * Bivariate Analysis
    5. ML Models (work in progress)
    6. Conclusion

In [None]:
# Load the three Starbucks CSV files from the Kaggle dataset folder.
DATA_DIR = '/kaggle/input/starbucks-customer-data'

transcript = pd.read_csv(f'{DATA_DIR}/transcript.csv')
profile = pd.read_csv(f'{DATA_DIR}/profile.csv')
pf = pd.read_csv(f'{DATA_DIR}/portfolio.csv')

In [None]:
# Report the row and column counts of each loaded frame.
for label, frame in (('Transcript', transcript), ('Profile', profile), ('Portfolio', pf)):
    n_rows, n_cols = frame.shape
    print(f'{label} data has {n_rows} rows and  {n_cols} columns')

# Transcript data sample

In [None]:
# Preview the first rows of the transcript data.
transcript.head()

In [None]:
# Column dtypes and non-null counts for the transcript data.
transcript.info()

# Profile data

In [None]:
# Preview the first rows of the profile data.
profile.head()

In [None]:
# Column dtypes and non-null counts for the profile data.
profile.info()

In [None]:
# Preview the first rows of the portfolio (offers) data.
pf.head()

In [None]:
# Column dtypes and non-null counts for the portfolio data.
pf.info()

In [None]:
# List the transcript column names (used to spot the unnamed index column).
transcript.columns

In [None]:
# List the profile column names.
profile.columns

In [None]:
# List the portfolio column names.
pf.columns

# Drop the unnamed column from all dataframes

In [None]:
# Drop the leftover 'Unnamed: 0' column from the transcript and portfolio frames.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second run no longer raises KeyError once the column is gone.
transcript = transcript.drop(columns='Unnamed: 0', errors='ignore')

pf = pf.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview the portfolio data again to check the result of the column drop.
pf.head()

# Data Wrangling, Binning and Feature Engineering on the profile data

In [None]:
# Re-read the profile CSV, replacing the `profile` frame loaded earlier;
# the wrangling cells below operate on this freshly loaded frame.
profile = pd.read_csv('/kaggle/input/starbucks-customer-data/profile.csv')

In [None]:
# Preview the first rows of the reloaded profile data.
profile.head()

In [None]:
# Column dtypes and non-null counts for the reloaded profile data.
profile.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot, plot

import warnings 
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings emitted by pandas, seaborn and matplotlib.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# There are missing values in gender and income columns

In [None]:
# Box plot of customer age, used to spot outlying values.
sns.boxplot(data=profile, x='age')

In [None]:
 # Show the rows carrying the age value 118 that the notebook treats as invalid.
 # Matching the value exactly (rather than the range 101-118) keeps any genuine
 # ages above 100 out of this preview.
 profile[profile['age'] == 118]

# **There are 2180 rows with age 118, and for these observations both gender and income are NaN. Let us drop these rows, since most of their columns hold null values.**

In [None]:
# Remove the rows whose age is 118, the value the notebook identifies as coming
# with missing gender and income. The previous range filter (100 < age < 119)
# would also have discarded any genuine customers aged 101-117.
# .copy() gives an independent frame so later column assignments are unambiguous.
profile = profile[profile['age'] != 118].copy()

In [None]:
# Row and column counts after removing the invalid-age rows.
profile.shape

In [None]:
# Count the missing values left in each profile column.
profile.isna().sum()

In [None]:
# Remove the leftover 'Unnamed: 0' column from the profile data.
# Reassigning with errors='ignore' keeps the cell re-runnable: the original
# in-place drop raised KeyError on a second run because the column was gone.
profile = profile.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Count of customers per gender value.
sns.countplot(data=profile,x='gender')

In [None]:
# Box plot of customer income, used to check its spread and outliers.
sns.boxplot(data=profile, x='income')

In [None]:
# Parse the YYYYMMDD membership dates into proper datetime values.
joined_as_text = profile['became_member_on'].astype(str)
profile['became_member_on'] = pd.to_datetime(joined_as_text, format='%Y%m%d')
profile['became_member_on'].dtype

In [None]:
# Check the first converted membership dates.
profile['became_member_on'].head()

# Create new features

In [None]:
# Derive calendar features from the membership date.
joined = profile['became_member_on']
profile['year'] = joined.dt.year
profile['month'] = joined.dt.month
profile['day'] = joined.dt.day_name()
# Whole days elapsed between the join date and today, as integers.
# .dt.days replaces .astype('timedelta64[D]').astype(int): pandas >= 2.0 rejects
# casts to day resolution (only s/ms/us/ns are supported), while .dt.days works
# on every pandas version. The value depends on the date the notebook is run.
profile['member_since_how_manydays'] = (pd.to_datetime('today') - joined).dt.days

In [None]:
# Give the identifier column an explicit name: id -> customer_id.
profile = profile.rename(columns={'id': 'customer_id'})

In [None]:
# Row and column counts after adding the derived features.
profile.shape

In [None]:
# Preview the first rows; reset_index() exposes the original row labels as an 'index' column.
profile.head().reset_index()

In [None]:
# Youngest age in the data; informs the lower edge of the age bins below.
profile.age.min()

# Binning

In [None]:
# Bin ages into labelled groups.
# right=False makes every bin include its lower edge and exclude its upper edge,
# so the bins agree with the labels: age 20 -> 'young(20-39)',
# age 40 -> 'Middle-age(40-59)', and so on. With the default right-closed bins,
# ages 20/40/60/80 landed in the group below the one their label describes.
# The final edge is 102 so that age 101, which the previous closed upper edge
# of 101 covered, still falls in the last group.
profile['age_group'] = pd.cut(
    x=profile['age'],
    bins=[18, 20, 40, 60, 80, 102],
    labels=['Teenage(0-19)', 'young(20-39)', 'Middle-age(40-59)',
            'Old(60-79)', 'Very-Old(80-100)'],
    right=False,
)
profile['age_group']

# Univariate Analysis - Distribution plot of gender

In [None]:
# Bar chart of how many customers fall under each gender value.
gender_counts = profile['gender'].value_counts()
val, cnt = gender_counts.index, gender_counts.values

gender_bar = go.Bar(x=val, y=cnt, marker_color='lightcoral')
fig = go.Figure(data=[gender_bar])
fig.update_layout(title_text='Distribution of Gender', title_x=0.7)
fig.show()

In [None]:
# Number of customers in each age group.
fig = plt.figure(figsize = (12,7))
sns.countplot(x='age_group',data=profile)
plt.xticks(rotation=30)
# The profile data describes customers (see the customer_id column), not staff.
plt.title('Age-Group of customers at Starbucks')
plt.show()

# Distribution plot of income

In [None]:
# Bar chart of how many customers report each distinct income value.
income_counts = profile['income'].value_counts()
val, cnt = income_counts.index, income_counts.values

income_bar = go.Bar(x=val, y=cnt, marker_color='darkturquoise')
fig = go.Figure(data=[income_bar])
fig.update_layout(title_text='Distribution of Income', title_x=0.7)
fig.show()

In [None]:
# Smoothed density estimate of customer income.
sns.set(style="darkgrid")
# fill=True replaces the deprecated shade=True; the 2-D KDE cell in this
# notebook already uses fill=True, so this keeps the two calls consistent.
sns.kdeplot(data=profile['income'], fill=True)
plt.title('Density distribution of Income')
plt.show()

In [None]:
# Joint density of age and income, drawn separately for each gender.
sns.kdeplot(data=profile, x="age", y="income", hue="gender", fill=True)

In [None]:
# Members joined per year, most frequent year first (at most 15 bars).
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
years_by_frequency = profile['year'].value_counts().index[:15]
ax = sns.countplot(data=profile, y="year", order=years_by_frequency, palette="Set2")
plt.title('Members joined frequency in years')

In [None]:
# Members joined per calendar month, most frequent month first.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
months_by_frequency = profile['month'].value_counts().index[:15]
ax = sns.countplot(data=profile, x="month", order=months_by_frequency, palette="Set2")
plt.title('Members joined frequency by months')

In [None]:
# Members joined per day of the week, most frequent day first.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
days_by_frequency = profile['day'].value_counts().index[:15]
ax = sns.countplot(data=profile, x="day", order=days_by_frequency, palette="pastel")
plt.title('Members joined frequency by week-of-day')

In [None]:
# Share of customers per gender value (fractions that sum to 1).
profile['gender'].value_counts(normalize=True)

In [None]:
# Share of customers per age group (fractions that sum to 1).
profile['age_group'].value_counts(normalize=True)

# for profile data
1. People joined as member increased from 2015 and highest in 2017
2. Highest number of people joined in August month
3. Of course, Sunday and Saturday are the leading days of the week on which people joined.
4. Income distribution is not skewed much, and is in the range of 50k-73k
5. 57% of the customers are Male, 41.3% Female and 1.4% Others
6. The highest proportion of customers, around 40%, are in the age-group 40-59.

# Bivariate analysis on Portfolio data

In [None]:
# Preview the portfolio data before the bivariate analysis.
pf.head()

In [None]:
# Average the numeric offer attributes for each (id, channels) pair.
# numeric_only=True is required on pandas >= 2.0, where averaging the text
# column offer_type raises TypeError; older versions dropped such columns
# silently, so the result is the same there.
channels_df = pf.groupby(['id', 'channels']).mean(numeric_only=True).reset_index()
channels_df

In [None]:
# Number of offers per duration, split by the channel combination used.
plt.figure(figsize=(15, 10))
sns.set(style="darkgrid")
ax = sns.countplot(data=channels_df, x="duration", hue='channels', palette="Set2")
ax.set(
    ylabel='Number of Offers',
    xlabel='Duration of Offers in days',
    title='offers based on duration and channels through which they are offered',
)
plt.show()

# Number of offers based on difficulty and channels through which they are offered

In [None]:
# Number of offers per difficulty level, split by the channel combination used.
plt.figure(figsize=(15,10))
sns.set(style="darkgrid")
ax = sns.countplot(x="difficulty", data=channels_df, palette="Set3", hue='channels')
plt.ylabel('Number of Offers')
# Difficulty is the minimum spend needed to complete an offer, not a number of
# days; the old label was carried over from the duration plot.
plt.xlabel('Difficulty of Offers (minimum spend required)')
plt.title('Offers based on difficulty and channels through which they are offered')
plt.show()

1. One offer is the most difficult, with a level of 20, and it is offered only through web and email

# Types of Discounts on Offers

In [None]:
# Number of offers per offer type, indexed by the offer type.
# The counts are taken straight from value_counts(): on pandas >= 2.0,
# value_counts().reset_index() names the count column 'count', so the old
# `.reset_index().offer_type` returned the type names instead of the counts.
pie_df = pf['offer_type'].value_counts()

In [None]:
# Pie chart of the share of each offer type.
plt.figure(figsize=(8,8))
# Take counts and labels from the same value_counts() result so each wedge is
# always labelled with its own offer type; the previous hard-coded label order
# was not tied to the order of the counts.
offer_type_counts = pf['offer_type'].value_counts()
labels = [str(kind).upper() for kind in offer_type_counts.index]
colors = ['lightblue','lightsteelblue','silver']
# Pull the second wedge out slightly, as before, whatever the number of types.
explode = [0.1 if position == 1 else 0 for position in range(len(offer_type_counts))]
plt.pie(offer_type_counts, labels=labels, autopct='%1.1f%%', startangle=15, shadow = True, colors=colors, explode=explode)
plt.title('Types of Offers')
plt.axis('equal')
plt.show()

# Univariate analysis on Portfolio data

In [None]:

# Histogram of the reward attached to each offer.
sns.histplot(data=pf, x="reward")
plt.title('Distribution of Rewards for Offers')
plt.show()

In [None]:
# Box plot of offer difficulty.
sns.boxplot(data=pf,x='difficulty')

In [None]:
# Kernel density estimate of offer difficulty.
sns.displot(pf, x="difficulty", kind="kde")
plt.title('Density plot of difficulty')
plt.show()

In [None]:

# Kernel density estimate of offer duration, drawn with the darkgrid theme.
sns.set_theme(style="darkgrid")
sns.displot(pf, x="duration", kind="kde")
plt.title('plot of duration of the offers')
plt.show()

In [None]:
# Preview the portfolio data before encoding offer_type.
pf.head()

# One hot encoding on portfolio data

In [None]:
# One-hot encode offer_type: the encoded column is removed and its indicator
# columns (prefixed 'offer', first category dropped) are appended at the end.
pf = pd.get_dummies(pf, columns=['offer_type'], prefix='offer', drop_first=True)
pf

# Transcript Data

In [None]:
# Work on a copy so the transcript frame itself is left unmodified.
df = transcript.copy()
# Note: this previews `transcript`; `df` holds the same content at this point.
transcript.head()

# No missing values in Transcript data

In [None]:
# Count the missing values in each transcript column.
df.isnull().sum()

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Tally the rows per event type and turn the result into a two-column frame.
event_counts = df['event'].value_counts()
offer_status = event_counts.reset_index()

In [None]:
# Give the two columns readable names (event type, number of rows with that event).
offer_status.columns=['Event', 'Number of offers in that event']
offer_status

In [None]:
# Share of transcript rows per event type (fractions that sum to 1).
df.event.value_counts(normalize=True)

In [None]:
# Bar chart of the number of rows recorded for each event type.
val, cnt = offer_status['Event'], offer_status['Number of offers in that event']

status_bar = go.Bar(x=val, y=cnt, marker_color='darkturquoise')
fig = go.Figure(data=[status_bar])
fig.update_layout(title_text='Number of offers versus their status', title_x=0.4)
fig.show()

# Takeaway - 45% of the recorded events are transactions, while only 11% are offer completions

# Binning time intervals

In [None]:
# Distinct values of the time column; informs the bin edges chosen below.
df.time.unique()

In [None]:
# Bin the time column into eight 100-unit intervals. The bins are closed on the
# right, and include_lowest=True additionally places time 0 in the first bin.
# NOTE(review): the labels call the unit 'days'; confirm the unit of `time`
# against the dataset description before relying on these labels.
df['time_group'] = pd.cut(x=df['time'], bins=[0, 100, 200, 300, 400, 500,600,700,800], include_lowest=True,
                    labels=['0-100 days', '100-200 days', '200-300 days', '300-400 days', '400-500 days', '500-600 days', '600-700 days', '700-800 days'])

In [None]:
# Number of transcript rows falling in each time bin.
fig = plt.figure(figsize = (12,7))
sns.countplot(x='time_group',data=df)
# Tilt the tick labels so the long bin names do not overlap.
plt.xticks(rotation=30)
plt.title('No. of offers in the Timespan')

In [None]:
# Number of transcript rows recorded for each person.
df.person.value_counts() 

In [None]:
# Keep only the rows that record an offer being received.
is_received = df['event'] == 'offer received'
offers_received_df = df.loc[is_received]
offers_received_df

# All persons who have received offer may not accept

In [None]:
# Keep only the rows that record an offer being viewed.
is_viewed = df['event'] == 'offer viewed'
offers_viewed_df = df.loc[is_viewed]
offers_viewed_df

In [None]:
# Keep only the rows that record an offer being completed.
# This previously filtered on 'offer viewed' (copied from the cell above), so
# the "completed" frame was just a second copy of the viewed offers.
offers_completed_df = df[df.event == 'offer completed']
offers_completed_df

In [None]:
# Flag the received-offer rows whose person also appears in offers_completed_df,
# then show just those rows.
completer_ids = offers_completed_df['person'].tolist()
offer_completed_people = offers_received_df.isin(completer_ids)
person_is_completer = offer_completed_people['person']
offers_received_df[person_is_completer]

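# Out of 76277 offers received, 75750 went to people in the comparison set, i.e. about 99%. Note: this only measures completion if the comparison set is built from 'offer completed' events, so these figures should be re-checked.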

# One hot encoding

In [None]:
# One-hot encode event: the encoded column is removed and its indicator
# columns (prefixed 'event', first category dropped) are appended at the end.
df = pd.get_dummies(df, columns=['event'], prefix='event', drop_first=True)
df

# Conclusion


1. People joined as member increased from 2015 and highest in 2017
2. Highest number of people joined in August month
3. Of course, Sunday and Saturday are the leading days of the week on which people joined.
4. Income distribution is not skewed much, and is in the range of 50k-73k
5. 57% of the customers are Male, 41.3% Female and 1.4% Others
6. The highest proportion of customers, around 40%, are in the age-group 40-59.
7. 45% of the recorded events are transactions, while only 11% are offer completions.

# Work in Progress, your feedback is appreciated to improve this notebook..