In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Problem Statement
Understand the characteristics of customers who will continue buying Starbucks products.

#### Problem Solution

- Analyse and understand the behavioural aspects of Starbucks customers
- Perform customer segmentation based on the study
- Find the key drivers of customer loyalty using the Chi-Square test of independence

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import scipy.stats as st
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the survey responses from the Kaggle input dataset and preview them.
survey_csv = "/kaggle/input/starbucks-customer-retention-malaysia-survey/Starbucks satisfactory survey.csv"
df = pd.read_csv(survey_csv)
df.head()

#### Data Cleaning

In [None]:
# Replace the CSV's column labels with short identifiers.
# Labels are assigned positionally, so the order must match the file's column order.
short_names = [
    'Timestamp',
    'Gender',
    'Age',
    'Occupation',
    'Annual_Income',
    'Visit_Frequency',
    'Service_preferred',
    'Time_Spent_Frequency',
    'Nearest_Store_Distance',
    'Membership',
    'Frequent_Product',
    'Avg_Money_Spent',
    'Quality_Rating_vs_Other_Brands',
    'Price_Rating',
    'Sales_Promotion_Importance',
    'Ambiance_Rating',
    'WiFi_Rating',
    'Service_Rating',
    'Meetings_hangouts_preference',
    'Promotion_Source',
    'Loyalty',
]
df.columns = short_names
df.head()

In [None]:
# Number of survey responses before any cleaning.
len(df)

`Check the different Columns datatypes and null values`

In [None]:
# Per-column dtype and non-null count; used to spot missing values.
df.info()

In [None]:
# Show the rows where Service_preferred is missing.
df.loc[df['Service_preferred'].isna()]

`Only one row has a null value. Delete that row.`

In [None]:
# Keep only the rows where Service_preferred is present.
# `.notna()` replaces unary minus on a boolean mask, which is a non-idiomatic
# negation (NumPy rejects `-` on boolean arrays). `.copy()` makes the result an
# independent frame so later column assignments do not target a slice.
df = df[df.Service_preferred.notna()].copy()
len(df)

In [None]:
# Summary statistics of the numeric columns.
df.describe()

#### Univariate Analysis

In [None]:
# Numeric columns to analyse. Selecting every numeric dtype (instead of only
# 'int64') keeps the selection independent of the integer width or of a column
# having been read as float.
num_cols = df.select_dtypes(include='number').columns
num_cols

In [None]:
def plot_numeric(df, x):
    """Show a box plot, a distribution plot and selected quantiles for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    x : str
        Name of the column of ``df`` to summarise.

    Returns
    -------
    None
        The two figures are shown and the quantiles are printed.
    """
    # Read the column named by the argument. The previous body indexed with the
    # caller's loop variable, so the function only worked inside that loop.
    values = df[x]

    plt.figure(figsize=(10,8))
    plt.subplot(2,1,1)
    plt.boxplot(values, vert=True)
    plt.show()

    plt.figure(figsize=(10,8))
    plt.subplot(2,2,2)
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept so
    # the rendered output stays the same.
    sns.distplot(values, bins=20)
    plt.show()

    print('Quantiles')
    print(values.quantile([0.01,0.03,0.1,0.2,0.5,0.75,0.9,1.0]))


for col in num_cols:
    print("Plots for Column: "+ col)
    plot_numeric(df, col)
    print("\n")

- The largest share of customers (about 50%) rated 4 for quality vs other brands
- Price rating is roughly uniformly distributed: approximately equal numbers of customers find the prices affordable and unaffordable
- Sales and promotions have a strong impact on the customer purchase decision (90% of customers rated above 3)
- The majority of customers (90%) rated ambiance >= 3
- The majority of customers (around 55%) rated the WiFi service 3
- The service ratings given by customers are mostly between 3 and 5 (around 90% of customers)
- Around 80% of customers prefer Starbucks for meetings/hangouts

`The plots above show outliers in the data. Let's inspect the affected rows.`

In [None]:
# Inspect the rows whose quality rating is below 2 before deciding how to treat them.
df[df.Quality_Rating_vs_Other_Brands <2]

In [None]:
# Keep only respondents whose quality rating is above 1 (lower ratings are
# treated as outliers). `.copy()` yields an independent frame, so the columns
# assigned to `df` later on do not write into a slice of the filtered data.
df = df[df.Quality_Rating_vs_Other_Brands > 1].copy()
len(df)

In [None]:
# Columns stored with object dtype (presumably the categorical/text survey answers).
cat_cols = df.select_dtypes(include='object').columns
cat_cols

In [None]:
# One count plot per categorical column. The first entry of cat_cols is skipped
# by position (presumably Timestamp - TODO confirm).
for col in cat_cols[1:]:
    print("CountPlot for the column: "+ col)
    # Pass the Series explicitly as `x`: a bare positional argument is taken as
    # `data` by current seaborn releases and no longer plots the category counts.
    sns.countplot(x=df[col])
    plt.xticks(rotation=90)
    plt.show()

- Female and male customers are comparable in count
- Customers aged 20 to 29 are the largest group, followed by those aged 30 to 40. Customers aged 40 and above are less interested in the products
- Employees and students are more interested in the products than the self-employed. Housewives are ignored because the category has too few responses to draw insights from
- Customers with an annual income below RM50,000 are the potential customers
- Most customers visit rarely. Monthly visitors are also significant in number. Daily visitors are very few
- Take-away customers are the largest group, followed by Dine In and then Drive-thru
- The majority of customers spend less than an hour. Very few people spend around one to two hours
- The majority of customers live more than 3 km from the nearest store
- Customers with membership are almost equal in number to customers without membership
- Coffee, cold drinks and pastries are the most frequently bought products in store
- The majority of people spend less than RM20. The number of customers decreases as the amount spent increases
- Social media is the main source of promotion for the products
- Interestingly, even though customers with and without membership are equal in number, the majority of customers are loyal to the brand

####  Bi-Variate Analysis

In [None]:
# For every numeric column, count customers per Loyalty value, split by that column's value.
for rating_col in num_cols:
    plt.figure(figsize=(25,5))
    ax = plt.subplot(1,4,1)
    sns.countplot(x=df.Loyalty, hue=df[rating_col], ax=ax)
    ax.set_title(rating_col+" vs Loyalty")
    plt.xticks(rotation=90)
    # Anchor the legend to the right of the axes.
    ax.legend(bbox_to_anchor=(1.3,1), borderaxespad=0)
    plt.show()

##### Among the loyal customer
- 4 is the most common quality rating
- 3 is the most common price rating, and customers who gave a price rating of 5 are loyal
- 4 is the most common ambiance rating<br>
##### Among the Non_Loyal customers
- 3 is the most common quality rating
- 2 is the most common price rating
- 3 is the most common ambiance rating

- Loyalty doesn't seem to depend much on WiFi rating or sales promotion
- Customers who gave a service rating of 5 are more likely to be loyal. The likelihood seems to decrease as the rating decreases
- Customers with a meetings/hangouts preference of 4 are more likely to be loyal

Lets analyse the categorical columns

In [None]:
# For every categorical column except the first, count customers per Loyalty
# value, split by that column's category.
for category_col in cat_cols[1:]:
    plt.figure(figsize=(25,5))
    ax = plt.subplot(1,4,1)
    sns.countplot(x=df.Loyalty, hue=df[category_col], ax=ax)
    ax.set_title(category_col+" vs Loyalty")
    plt.xticks(rotation=90)
    # Anchor the legend to the right of the axes.
    ax.legend(bbox_to_anchor=(1.3,1), borderaxespad=0)
    plt.show()

- Gender doesn't seem to have an impact on loyalty. Female customers are more numerous
- Customers aged 20-29 are the largest segment, followed by 30-39, among both loyal and non-loyal customers
- Employed customers are the largest category among loyal customers, whereas students are the largest category among non-loyal customers
- Customers with an annual income of 50k-100k are more numerous among loyal customers
- Customers with visit frequency rarely 
- Take away is the most common service among loyal customers, whereas Dine in is the most common among non-loyal customers
- Time spent and nearest store distance don't seem to have much influence on loyalty. To be analysed further
- Customers with membership are more loyal
- Coffee, cold drinks and pastries are the major categories of frequently bought products
- Customers who spend more than RM20 are more loyal
- Social media is the major promotion source

#### Multi-Variate Analysis

In [None]:
def plot_fun(df, col1, col2, col3):
    """Draw a grid of bar charts counting ``col3`` values per (``col1``, ``col2``) pair.

    Grid rows are the observed values of ``col1`` and grid columns the observed
    values of ``col2``. Each panel holds one bar per value of ``col3``. The
    count table behind the figure is printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns.
    col1 : str
        Column whose values define the grid rows.
    col2 : str
        Column whose values define the grid columns.
    col3 : str
        Column whose values are counted in every panel.

    Returns
    -------
    None
    """
    # .assign builds a new frame, so the helper column is never written into
    # (a slice of) the caller's frame.
    plot_df = df[[col1, col2, col3]].assign(dummy=1)

    # Rows: (col1, col2) pairs. Columns: ('dummy', <col3 value>) with the counts.
    grouped_plot = plot_df.groupby([col1, col2, col3]).count().unstack(level=2)

    col3_values = grouped_plot.columns.levels[1]
    colors = [plt.get_cmap('viridis')(shade) for shade in np.linspace(0,1,len(col3_values))]

    sns.set(context='talk')
    row_values = grouped_plot.index.levels[0]
    col_values = grouped_plot.index.levels[1]
    # squeeze=False keeps `axes` two-dimensional even when col1 or col2 has a
    # single observed value, so the axes[a, b] indexing below always works.
    fig, axes = plt.subplots(nrows=len(row_values), ncols=len(col_values),
                             sharex=True, sharey=True, figsize=(12,10),
                             squeeze=False)
    fig.suptitle(col1 +' vs '+col2+' vs '+col3)

    for a, i in enumerate(row_values):
        for b, j in enumerate(col_values):
            try:
                counts = grouped_plot.loc[(i, j), :]
            except KeyError:
                # This (col1, col2) combination does not occur in the data:
                # leave the panel empty. Any other error is a real problem
                # and is allowed to surface.
                continue
            axes[a,b].bar(col3_values, counts, color=colors)
            print(i,j)  # trace of the combinations that were drawn
            axes[a,b].xaxis.set_ticks([])

    # Invisible full-figure axes that only carries the shared axis titles.
    axeslabel = fig.add_subplot(111, frameon=False)
    plt.tick_params(labelcolor='none')
    plt.grid(False)
    axeslabel.set_ylabel(col1,rotation='horizontal',y=1,weight="bold")
    axeslabel.set_xlabel(col2,y=1,weight="bold")
    for b, j in enumerate(col_values):
        axes[-1,b].set_xlabel(j, rotation=90)
    for a, i in enumerate(row_values):
        axes[a,0].set_ylabel(i, rotation=90)

    # Leave room on the right for the figure-level legend.
    fig.subplots_adjust(right=0.82)

    # The legend is titled after the counted column instead of a fixed string.
    fig.legend([Patch(facecolor = shade) for shade in colors],
               col3_values,
               title=col3,
               loc="center right")
    print(grouped_plot)
    


##### age_vs_AnnualIncome_vs_Loyalty

In [None]:
plot_fun(df, col1='Age', col2='Annual_Income', col3='Loyalty')

- Age group 20-29 is the largest group. Their loyalty percentage increases as income increases
- Customers below the age of 20 have an income of less than RM25,000
- Customer interest in the products decreases with increasing age from 20 onwards

##### age_vs_Occupation_vs_Loyalty

In [None]:
plot_fun(df, col1='Age', col2='Occupation', col3='Loyalty')

- Among customers aged 20-29, the employed are the largest group, followed by students and then the self-employed
- Even though the self-employed are few in number, they have the highest loyalty percentage, followed by employees. Students have a high number of negative responses
- Customers below the age of 20 have a significantly high positive response
- Customers aged 30-39 have a good positive response

##### Occupation_vs_AnnualIncome_vs_Loyalty

In [None]:
plot_fun(df, col1='Occupation', col2='Annual_Income', col3='Loyalty')

- Employed customers with an income of less than RM50,000 have a high positive response
- Students have an income of less than RM25,000; they show a high positive response as well as a significant negative response

We have seen that females and males are comparable in count. Let's find out whether there are any interesting factors.

##### Gender_vs_AnnualIncome_vs_Loyalty

In [None]:
plot_fun(df, col1='Gender', col2='Annual_Income', col3='Loyalty')

- Female customers are more loyal than male customers
- Above RM100,000 there seem to be more male customers than female customers

Let's understand the difference between male and female loyalty based on their age, occupation and annual income.

Let's convert the Loyalty column values: Yes to 1 and No to 0. <br>
Also create a Loyalty_invert column with Yes as 0 and No as 1.

In [None]:
# Encode Loyalty as 1 for 'Yes' and 0 for anything else, then add its complement.
# Accepting an already-encoded 1 as well as 'Yes' keeps this cell idempotent:
# re-running it no longer turns every value into 0.
df['Loyalty'] = df['Loyalty'].isin(['Yes', 1]).astype('int64')
df['Loyalty_invert'] = 1 - df['Loyalty']
df.Loyalty_invert.value_counts()

##### Gender_vs_Age_vs_Occupation_vs_Annual_Income_vs_Loyalty

In [None]:
# Two side-by-side heat maps over the same grouping: the summed Loyalty flag on
# the left and the summed Loyalty_invert flag (reversed colormap) on the right.
row_keys = ['Gender','Age']
col_keys = ['Occupation','Annual_Income']
panels = [('Loyalty', 'RdYlGn'), ('Loyalty_invert', 'RdYlGn_r')]

plt.figure(figsize=(28,8))
for position, (target, palette) in enumerate(panels, start=1):
    pivot_df = pd.pivot_table(data=df, index=row_keys, columns=col_keys, values=target, aggfunc='sum')
    plt.subplot(1,2,position)
    sns.heatmap(data=pivot_df, cmap=palette, annot=True)
plt.show()

- Employed females aged 20-29 with an income below RM25,000 gave 10 positive and 3 negative responses
- Employed females aged 20-29 with an income between RM25,000 and RM50,000 gave 11 positive and 2 negative responses
- Employed males aged 20-29 with an income below RM25,000 gave no negative responses, whereas males in the same categories but with an income between RM25,000 and RM50,000 gave almost equal positive and negative responses (3 positive vs 2 negative)
- Male students aged 30-39 with an income below RM25,000 gave almost equal positive and negative responses (5 positive vs 4 negative)
- Female students aged 20-29 with an income below RM25,000 gave high numbers of both positive and negative responses
- There is only one 0 in the left heat map, and the corresponding values in the right heat map are either zero or small. This suggests that all the different categories have a high share of loyal customers



Let's check the different factors for the category: female students aged 20-29 with an income of less than RM25,000.

In [None]:
# Segment: female students aged 20-29 with an annual income below RM25,000.
segment_mask = (
    (df.Gender == 'Female')
    & (df.Occupation == 'Student')
    & (df.Age == 'From 20 to 29')
    & (df.Annual_Income == 'Less than RM25,000')
)
df3 = df.loc[segment_mask, :]
df3

##### AvgMoneySpent_vs_PriceRating_vs_AmbianceRating_vs_ServiceRating_vs_Loyalty of the above category

In [None]:
# Same pair of heat maps (summed Loyalty left, summed Loyalty_invert right),
# restricted to the df3 segment and grouped by spend/price vs ambiance/service.
row_keys = ['Avg_Money_Spent','Price_Rating']
col_keys = ['Ambiance_Rating','Service_Rating']
panels = [('Loyalty', 'RdYlGn'), ('Loyalty_invert', 'RdYlGn_r')]

plt.figure(figsize=(28,8))
for position, (target, palette) in enumerate(panels, start=1):
    pivot_df = pd.pivot_table(data=df3, index=row_keys, columns=col_keys, values=target, aggfunc='sum')
    plt.subplot(1,2,position)
    sns.heatmap(data=pivot_df, cmap=palette, annot=True)
plt.show()

- Customers who gave ambiance and service ratings between 3 and 5 are more loyal
- Customers who gave a low price rating are less loyal

##### VisitFrequency_vs_TimeSpentFrequency_vs_NearestStoreDistance_vs_Membership_vs_Loyalty

In [None]:
# Pair of heat maps on the full data: summed Loyalty (left) and summed
# Loyalty_invert (right), grouped by visit/time-spent vs distance/membership.
row_keys = ['Visit_Frequency','Time_Spent_Frequency']
col_keys = ['Nearest_Store_Distance','Membership']
panels = [('Loyalty', 'RdYlGn'), ('Loyalty_invert', 'RdYlGn_r')]

plt.figure(figsize=(28,8))
for position, (target, palette) in enumerate(panels, start=1):
    pivot_df = pd.pivot_table(data=df, index=row_keys, columns=col_keys, values=target, aggfunc='sum')
    plt.subplot(1,2,position)
    sns.heatmap(data=pivot_df, cmap=palette, annot=True)
plt.show()

- It is evident that customers with membership are more loyal than customers without membership
- Customers who visit rarely, spend less than 30 minutes, live more than 3 km from the nearest store and have no membership give almost equal positive and negative responses, whereas those with the nearest store 1-3 km away and with membership respond positively
- Weekly visitors are very few, but those with a store within 1 km respond positively
- The green colour in the 2nd graph shows that there are not many negative responses, whereas the red colour in the 1st graph shows that there are not many positive responses either. Only a few groups of customers show clearly high positive responses
- Customers who visit daily and spend about 30 minutes, and customers who visit monthly, are all loyal except for one who spends more than 3 hours

Let's analyse the data for the category with the most non-loyal customers (9) in the above graph.

In [None]:
# Segment: rare visitors who stay under 30 minutes, live more than 3 km from a
# store and have no membership.
rare_far_nonmember = (
    (df.Visit_Frequency == 'Rarely')
    & (df.Time_Spent_Frequency == 'Below 30 minutes')
    & (df.Nearest_Store_Distance == 'more than 3km')
    & (df.Membership == 'No')
)
df2 = df.loc[rare_far_nonmember, :]

In [None]:
# Raise the column display limit so that every column of the segment is shown.
pd.set_option('display.max_columns', 50)
df2

- The majority of these customers are female
- Their visit frequency is rare
- Their time spent is below 30 minutes
- Their nearest store is more than 3 km away
- They don't have membership

##### AvgMoneySpent_vs_PriceRating_vs_AmbianceRating_vs_ServiceRating_vs_Loyalty for above category

In [None]:
# Pair of heat maps restricted to the df2 segment: summed Loyalty (left) and
# summed Loyalty_invert (right), grouped by spend/price vs ambiance/service.
row_keys = ['Avg_Money_Spent','Price_Rating']
col_keys = ['Ambiance_Rating','Service_Rating']
panels = [('Loyalty', 'RdYlGn'), ('Loyalty_invert', 'RdYlGn_r')]

plt.figure(figsize=(28,8))
for position, (target, palette) in enumerate(panels, start=1):
    pivot_df = pd.pivot_table(data=df2, index=row_keys, columns=col_keys, values=target, aggfunc='sum')
    plt.subplot(1,2,position)
    sns.heatmap(data=pivot_df, cmap=palette, annot=True)
plt.show()

- Customers with a Price_Rating of 1 or 2 are not loyal even when the ambiance and service ratings are >= 3
- Customers with a Price_Rating > 3 are more loyal, and high ambiance and service ratings have a positive impact

Let's understand the impact of these features on the complete data.

In [None]:
# Customers who visit daily or monthly.
# NOTE(review): this rebinds df3 (previously the female-student segment).
df3 = df.loc[df.Visit_Frequency.isin(['Daily', 'Monthly']), :]
df3

##### AvgMoneySpent_vs_PriceRating_vs_AmbianceRating_vs_ServiceRating_vs_Loyalty

In [None]:
# Pair of heat maps on the full data: summed Loyalty (left) and summed
# Loyalty_invert (right), grouped by spend/price vs ambiance/service.
row_keys = ['Avg_Money_Spent','Price_Rating']
col_keys = ['Ambiance_Rating','Service_Rating']
panels = [('Loyalty', 'RdYlGn'), ('Loyalty_invert', 'RdYlGn_r')]

plt.figure(figsize=(28,8))
for position, (target, palette) in enumerate(panels, start=1):
    pivot_df = pd.pivot_table(data=df, index=row_keys, columns=col_keys, values=target, aggfunc='sum')
    plt.subplot(1,2,position)
    sns.heatmap(data=pivot_df, cmap=palette, annot=True)
plt.show()

- Loyalty increases with increasing price rating, ambiance rating and service rating
- Customers who give high ambiance and service ratings are more loyal if their average spend is greater than RM20. Price rating doesn't have much impact for them. These are customers who are more interested in service and ambiance

Create a temporary feature used to count the rows in each category for the chi-square test.

In [None]:
# Constant column of ones: counting it per group gives the number of rows in each category.
df['temp'] = 1
df.temp

In [None]:
def chiSquareTest(df, col1, col2, col_list, prob=0.90):
    """Chi-square test of independence between two categorical columns.

    Builds the contingency table of ``col1`` against ``col2`` and compares the
    chi-square statistic with the critical value at confidence level ``prob``.
    When the statistic reaches the critical value, the result is printed and
    ``col1`` is appended to ``col_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1 : str
        Feature column under test; forms the rows of the contingency table.
    col2 : str
        Target column; forms the columns of the contingency table.
    col_list : list
        Mutated in place: ``col1`` is appended when the columns are found to
        be dependent.
    prob : float, optional
        Confidence level used for the critical value (default 0.90).

    Returns
    -------
    bool
        True when ``col1`` was found dependent on ``col2`` and appended.
    """
    # crosstab counts the rows per (col1, col2) pair directly and fills
    # unobserved pairs with 0, so df needs no helper count column.
    cont_table = pd.crosstab(df[col1], df[col2])
    stat, _p, dof, _expected = st.chi2_contingency(cont_table)

    critical = st.chi2.ppf(prob, dof)
    dependent = bool(abs(stat) >= critical)
    if dependent:
        print('Dependent (reject H0) and the features are: '+ col1)
        print(stat, critical)
        col_list.append(col1)
    return dependent

In [None]:
# Column labels of df at this point, including any derived columns added by earlier cells.
cols = df.columns

In [None]:
# Test every survey feature against Loyalty and collect the dependent ones.
# Columns are excluded by name, not by position, so the selection does not
# depend on how many helper columns have been appended to df.
# NOTE(review): 'Promotion_Source' is excluded to keep the tested set identical
# to the earlier positional selection - confirm whether that is intentional.
excluded = {'Timestamp', 'Promotion_Source', 'Loyalty', 'Loyalty_invert', 'temp'}
col_list = []
for feature in df.columns:
    if feature not in excluded:
        chiSquareTest(df, feature, 'Loyalty', col_list)

The columns that show a statistically significant association with Loyalty (at the 90% confidence level):

In [None]:
# Features whose chi-square statistic reached the critical value against Loyalty.
print(col_list)

#### Conclusion: distinct categories were found among loyal and non-loyal customers. Because very few customers fall into each of those categories, more data is needed to confirm that their behaviour is stable.