In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Data Cleaning
from sklearn.feature_extraction.text import CountVectorizer

# PreProcessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Splitting Data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from xgboost.sklearn import XGBClassifier

# Resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# Raw Datasets

- The source comes with 2 datasets: the survey questions (raw answers) and a version of the data that is already encoded. I only process the survey data.

In [None]:
# Read the raw survey export into a DataFrame, then preview its first rows.
survey = pd.read_csv(
    '../input/starbucks-customer-retention-malaysia-survey/Starbucks satisfactory survey.csv'
)
survey.head()

In [None]:
# Print the column names, non-null counts and dtypes of the raw survey frame.
survey.info()

# Data Cleaning

In [None]:
# Clean a copy so the raw `survey` frame stays available unchanged.
sb = survey.copy()

***Drop Columns***

In [None]:
# Remove the submission timestamp column; `sb` is rebound to the reduced frame.
sb = sb.drop(columns='Timestamp')

- Drop the Timestamp column from the working dataset because it only contains the date and time at which the customer took this survey, which is irrelevant.

***Rename Columns***

In [None]:
# Map each numbered survey question to a short, code-friendly column name.
sb = sb.rename(columns={
    '1. Your Gender': 'Gender',
    '2. Your Age': 'Age',
    '3. Are you currently....?': 'Working_Status',
    '4. What is your annual income?': 'Annual_Income',
    '5. How often do you visit Starbucks?': 'Visit_Duration',
    '6. How do you usually enjoy Starbucks?': 'Visit_Plan',
    '7. How much time do you normally  spend during your visit?': 'Spending_Time',
    "8. The nearest Starbucks's outlet to you is...?": 'Outlet_Location',
    '9. Do you have Starbucks membership card?': 'Member_Card',
    '10. What do you most frequently purchase at Starbucks?': 'Frequent_Purchase',
    '11. On average, how much would you spend at Starbucks per visit?': 'Average_Spending',
    '12. How would you rate the quality of Starbucks compared to other brands (Coffee Bean, Old Town White Coffee..) to be:': 'Product_Rating',
    '13. How would you rate the price range at Starbucks?': 'Price_Rating',
    '14. How important are sales and promotions in your purchase decision?': 'Promotion_Rating',
    '15. How would you rate the ambiance at Starbucks? (lighting, music, etc...)': 'Ambiance_Rating',
    '16. You rate the WiFi quality at Starbucks as..': 'Wifi_Rating',
    '17. How would you rate the service at Starbucks? (Promptness, friendliness, etc..)': 'Service_Rating',
    '18. How likely you will choose Starbucks for doing business meetings or hangout with friends?': 'Hangout_Place_Rating',
    '19. How do you come to hear of promotions at Starbucks? Check all that apply.': 'Promotion_Tools',
    '20. Will you continue buying at Starbucks?': 'Loyal_Customer',
})

* Rename all feature names in the working dataset so the columns are easier to read.

***Missing Value***

In [None]:
# Percentage of missing values per column (mean of the boolean NA mask, times 100).
sb.isna().mean() * 100

***Feature's Value Checking***

* In this section, I tidy up the values of columns whose raw answers are not neatly or consistently arranged.

*Visit_Plan*

In [None]:
# List the distinct raw answers to spot spelling variants that need merging.
sb['Visit_Plan'].unique()

In [None]:
# Collapse every free-text "never" variant into the single category 'Never buy'.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the column selection: with pandas Copy-on-Write
# that form only modifies a temporary object and leaves `sb` unchanged, and the
# corresponding warning is hidden by warnings.filterwarnings("ignore").
sb['Visit_Plan'] = sb['Visit_Plan'].replace(
    ['never', 'I dont like coffee', 'Never', 'Never '],
    'Never buy',
)
sb['Visit_Plan'].value_counts()

*Frequent_Purchase*

In [None]:
# List the distinct raw answers; the tokenizer in the next cell splits them on ';'.
sb['Frequent_Purchase'].unique()

In [None]:
# Each answer is split on ';' so every listed item becomes its own token, and the
# vectorizer builds one count column per distinct (lower-cased) token.
cvr = CountVectorizer(tokenizer=lambda x: x.split(';'))
purchase = cvr.fit_transform(sb['Frequent_Purchase'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0). tolist() prints plain Python strings.
print(cvr.get_feature_names_out().tolist())

In [None]:
# One indicator/count column per purchase token, row-aligned with `sb`.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
purchase_value = pd.DataFrame(
    purchase.toarray(),
    columns=cvr.get_feature_names_out(),
    index=sb.index,  # explicit alignment for the axis=1 concat below
)

# Merge the three "never buys anything" answers into one column. They are selected
# by name: the previous positional slice iloc[:, -6:-3] picked 'juices', 'never'
# and 'never buy any' (given the ten alphabetically sorted tokens handled in this
# cell), so juice purchases were counted as Never_Buy and 'nothing ' was missed.
never_tokens = ['never', 'never buy any', 'nothing ']
purchase_value['Never_Buy'] = purchase_value[never_tokens].sum(axis=1)
purchase_value = purchase_value.drop(columns=never_tokens)

# Keys keep the trailing spaces present in the raw tokens ('cake ', 'jaws chip ').
purchase_value = purchase_value.rename(columns={
    'cake ': 'Buy_Cake',
    'coffee': 'Buy_Coffee',
    'cold drinks': 'Buy_ColdDrinks',
    'jaws chip ': 'Buy_JawsChip',
    'juices': 'Buy_Juices',
    'pastries': 'Buy_Pastries',
    'sandwiches': 'Buy_Sandwiches',
})

sb = pd.concat([sb, purchase_value], axis=1)
sb

*Promotion_Tools*

In [None]:
# List the distinct raw answers (including NaN, which is filled in the next cell).
sb['Promotion_Tools'].unique()

In [None]:
# Fill missing answers with 'Social Media' so the ';' tokenizer never receives NaN.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the column selection, which under pandas
# Copy-on-Write modifies only a temporary object and leaves `sb` unchanged.
sb['Promotion_Tools'] = sb['Promotion_Tools'].fillna('Social Media')

* I have to fill the missing value first with mode so the tokenizer can work.

In [None]:
# Each answer is split on ';' so every listed promotion channel becomes its own
# token, and the vectorizer builds one count column per distinct (lower-cased) token.
# Note: this rebinds `cvr`, replacing the vectorizer fitted on Frequent_Purchase.
cvr = CountVectorizer(tokenizer=lambda x: x.split(';'))
promo = cvr.fit_transform(sb['Promotion_Tools'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0). tolist() prints plain Python strings.
print(cvr.get_feature_names_out().tolist())

In [None]:
# One indicator/count column per promotion-channel token, row-aligned with `sb`.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
promo_value = pd.DataFrame(
    promo.toarray(),
    columns=cvr.get_feature_names_out(),
    index=sb.index,  # explicit alignment for the axis=1 concat below
)

promo_value = promo_value.rename(columns={
    'application offer': 'Promo_AppsOffer',
    'billboards': 'Promo_Billboards',
    'deal sites (fave, iprice, etc...)': 'Promo_Sites',
    'emails': 'Promo_Emails',
    'in store displays': 'Promo_StoreDisplay',
    'never hear': 'Never_Heard',
    'social media': 'Promo_SocMed',
    'starbucks website/apps': 'Promo_SBucksApps',
    'through friends and word of mouth': 'Promo_WoM',
})

sb = pd.concat([sb, promo_value], axis=1)
sb

*Drop Columns*

In [None]:
# The two raw ';'-separated columns are now expanded into indicator columns,
# so the originals are removed; `sb` is rebound to the reduced frame.
sb = sb.drop(columns=['Frequent_Purchase', 'Promotion_Tools'])

# Datasets

In [None]:
# Snapshot the cleaned frame as the modelling dataset and preview its first rows.
sbucks = sb.copy()
sbucks.head()

* The dataset has 35 columns (34 features plus the target `Loyal_Customer`) and 122 rows.

# PreProcessing

***Preprocessing Scheme***

- OneHotEncoding: Gender, Age, Working_Status, Annual_Income, Visit_Duration, Spending_Time, Outlet_Location, Member_Card, Average_Spending
    * Simple Imputer Most Frequent: Visit_Plan
- PassThrough: Product_Rating, Price_Rating, Promotion_Rating, Ambiance_Rating, Wifi_Rating, Service_Rating, Hangout_Place_Rating, Buy_Cake, Buy_Coffee, Buy_ColdDrinks, Buy_JawsChip, Buy_Juices, Buy_Pastries, Buy_Sandwiches, Never_Buy, Promo_AppsOffer, Promo_Billboards, Promo_Sites, Promo_Emails, Promo_StoreDisplay, Never_Heard, Promo_SocMed, Promo_SBucksApps, Promo_WoM
- Target: Loyal_Customer

In [None]:
# Visit_Plan: impute missing values with the most frequent answer, then one-hot encode.
# NOTE: `Pipeline` resolves to imblearn.pipeline.Pipeline here, because that import
# comes after (and therefore shadows) the sklearn.pipeline import in the first cell.
mode_onehot_pipe = Pipeline(steps=[
    ('encoder', SimpleImputer(strategy='most_frequent')),
    ('one hot encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Categorical columns that are one-hot encoded directly (no imputation step).
onehot_columns = [
    'Gender',
    'Age',
    'Working_Status',
    'Annual_Income',
    'Visit_Duration',
    'Spending_Time',
    'Outlet_Location',
    'Member_Card',
    'Average_Spending',
]

# Every column not named below is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('one hot', OneHotEncoder(handle_unknown='ignore'), onehot_columns),
        ('mode_onehot_pipe', mode_onehot_pipe, ['Visit_Plan']),
    ],
    remainder='passthrough',
)

***Define Target Data***

In [None]:
# Class balance of the target, as a percentage of all rows.
sbucks['Loyal_Customer'].value_counts().div(len(sbucks)).mul(100)

In [None]:
# Binary target: 1 for the answer 'Yes', 0 for anything else.
# Re-running this cell after the column is already numeric would map every row to 0.
sbucks['Loyal_Customer'] = (sbucks['Loyal_Customer'] == 'Yes').astype(int)

***Splitting Data***

In [None]:
# Separate the target from the feature matrix.
y = sbucks['Loyal_Customer']
X = sbucks.drop(columns='Loyal_Customer')

X.shape

- After the whole data cleaning process, I have 34 feature columns left.

In [None]:
# 70/30 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=3434,
)

* I use 0.3 for test_size and a fixed random_state (3434) so the split is reproducible, and I stratify on y so both classes are divided proportionally between the train and test sets.

# Modeling

***Define Model***

In [None]:
# Candidate classifiers, all with default hyperparameters.
# Every estimator that accepts a random_state gets the same seed;
# KNeighborsClassifier is constructed without one.
seed = 3434

logreg = LogisticRegression(random_state=seed)
tree = DecisionTreeClassifier(random_state=seed)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=seed)
svc = LinearSVC(random_state=seed)
ada = AdaBoostClassifier(random_state=seed)
grad = GradientBoostingClassifier(random_state=seed)
xgb = XGBClassifier(verbosity=0, random_state=seed)

In [None]:
# Baseline: preprocessing followed directly by each classifier (no resampling).
# All pipelines share the same `transformer` object and the estimator objects
# defined in the model-definition cell.
preprocess_step = [('transformer', transformer)]

logreg_pipe = Pipeline(preprocess_step + [('logreg', logreg)])
tree_pipe = Pipeline(preprocess_step + [('tree', tree)])
knn_pipe = Pipeline(preprocess_step + [('knn', knn)])
rf_pipe = Pipeline(preprocess_step + [('rf', rf)])
svc_pipe = Pipeline(preprocess_step + [('svc', svc)])
ada_pipe = Pipeline(preprocess_step + [('ada', ada)])
grad_pipe = Pipeline(preprocess_step + [('grad', grad)])
xgb_pipe = Pipeline(preprocess_step + [('xgb', xgb)])

# Display label -> pipeline, in the order the results table is reported.
baseline_pipes = {
    'Logistic Regression': logreg_pipe,
    'Decision Tree Classifier': tree_pipe,
    'KNN Classifier': knn_pipe,
    'Random Forest Classifier': rf_pipe,
    'LinearSVC': svc_pipe,
    'AdaBoost Classifier': ada_pipe,
    'Gradient Boosting Classifier': grad_pipe,
    'XGB Classifier': xgb_pipe,
}

# Fit everything first, then score each pipeline on the hold-out test set.
for model in baseline_pipes.values():
    model.fit(X_train, y_train)

score_acc = [
    accuracy_score(y_test, fitted.predict(X_test))
    for fitted in baseline_pipes.values()
]
method_name = list(baseline_pipes)

acc_summary = pd.DataFrame({'method': method_name, 'accuracy score': score_acc})
acc_summary

* From the hold-out test set evaluation (no cross-validation is run here), AdaBoost Classifier has the highest accuracy score. Let's continue to handle imbalanced data.

# Handling Imbalance

### UnderSampling

*RandomUnderSampler*

In [None]:
# Random under-sampling is applied after preprocessing and before the classifier.
rus = RandomUnderSampler(random_state=3434)
rus_steps = [('transformer', transformer), ('rus', rus)]

logreg_pipe_rus = Pipeline(rus_steps + [('logreg', logreg)])
tree_pipe_rus = Pipeline(rus_steps + [('tree', tree)])
knn_pipe_rus = Pipeline(rus_steps + [('knn', knn)])
rf_pipe_rus = Pipeline(rus_steps + [('rf', rf)])
svc_pipe_rus = Pipeline(rus_steps + [('svc', svc)])
ada_pipe_rus = Pipeline(rus_steps + [('ada', ada)])
grad_pipe_rus = Pipeline(rus_steps + [('grad', grad)])
xgb_pipe_rus = Pipeline(rus_steps + [('xgb', xgb)])

# Display label -> pipeline, in the order the results table is reported.
rus_pipes = {
    'Logistic Regression UnderSampling': logreg_pipe_rus,
    'Decision Tree Classifier UnderSampling': tree_pipe_rus,
    'KNN Classifier UnderSampling': knn_pipe_rus,
    'Random Forest Classifier UnderSampling': rf_pipe_rus,
    'LinearSVC UnderSampling': svc_pipe_rus,
    'AdaBoost Classifier UnderSampling': ada_pipe_rus,
    'Gradient Boosting Classifier UnderSampling': grad_pipe_rus,
    'XGB Classifier UnderSampling': xgb_pipe_rus,
}

# Fit everything first, then score each pipeline on the hold-out test set.
for model in rus_pipes.values():
    model.fit(X_train, y_train)

score_acc = [
    accuracy_score(y_test, fitted.predict(X_test))
    for fitted in rus_pipes.values()
]
method_name = list(rus_pipes)

acc_rus_summary = pd.DataFrame({'method': method_name, 'accuracy score': score_acc})
acc_rus_summary

* From the RandomUnderSampler method, the model with the highest accuracy score is the Random Forest Classifier. Let's use another method. 

### OverSampling

*RandomOverSampler*

In [None]:
# Random over-sampling is applied after preprocessing and before the classifier.
ros = RandomOverSampler(random_state=3434)
ros_steps = [('transformer', transformer), ('ros', ros)]

logreg_pipe_ros = Pipeline(ros_steps + [('logreg', logreg)])
tree_pipe_ros = Pipeline(ros_steps + [('tree', tree)])
knn_pipe_ros = Pipeline(ros_steps + [('knn', knn)])
rf_pipe_ros = Pipeline(ros_steps + [('rf', rf)])
svc_pipe_ros = Pipeline(ros_steps + [('svc', svc)])
ada_pipe_ros = Pipeline(ros_steps + [('ada', ada)])
grad_pipe_ros = Pipeline(ros_steps + [('grad', grad)])
xgb_pipe_ros = Pipeline(ros_steps + [('xgb', xgb)])

# Display label -> pipeline, in the order the results table is reported.
ros_pipes = {
    'Logistic Regression OverSampling': logreg_pipe_ros,
    'Decision Tree Classifier OverSampling': tree_pipe_ros,
    'KNN Classifier OverSampling': knn_pipe_ros,
    'Random Forest Classifier OverSampling': rf_pipe_ros,
    'LinearSVC OverSampling': svc_pipe_ros,
    'AdaBoost Classifier OverSampling': ada_pipe_ros,
    'Gradient Boosting Classifier OverSampling': grad_pipe_ros,
    'XGB Classifier OverSampling': xgb_pipe_ros,
}

# Fit everything first, then score each pipeline on the hold-out test set.
for model in ros_pipes.values():
    model.fit(X_train, y_train)

score_acc = [
    accuracy_score(y_test, fitted.predict(X_test))
    for fitted in ros_pipes.values()
]
method_name = list(ros_pipes)

acc_ros_summary = pd.DataFrame({'method': method_name, 'accuracy score': score_acc})
acc_ros_summary

* From the RandomOverSampler method, the models that have the highest accuracy score are Logistic Regression, Decision Tree and AdaBoost Classifier. Let's use another method to compare.

### Combine Over and Under

*SMOTETomek*

In [None]:
# SMOTETomek (combined over- and under-sampling) is applied after preprocessing
# and before the classifier.
smotetom = SMOTETomek(random_state = 3434)

logreg_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('logreg', logreg)])
tree_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('tree', tree)])
knn_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('knn', knn)])
rf_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('rf', rf)])
svc_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('svc', svc)])
ada_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('ada', ada)])
grad_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('grad', grad)])
xgb_pipe_smotetom = Pipeline([('transformer', transformer), ('smotetomek', smotetom), ('xgb', xgb)])

# Fit the SMOTETomek pipelines themselves. The list previously contained
# knn_pipe_ros instead of knn_pipe_smotetom, so the KNN SMOTETomek pipeline was
# never fitted; because the `knn` and `transformer` objects are shared between
# pipelines, its predict() silently used the KNN trained on randomly
# over-sampled data and the reported score was not a SMOTETomek result.
for model in [logreg_pipe_smotetom, tree_pipe_smotetom, knn_pipe_smotetom, rf_pipe_smotetom, svc_pipe_smotetom, ada_pipe_smotetom, grad_pipe_smotetom, xgb_pipe_smotetom]:
    model.fit(X_train, y_train)

# Accuracy of each fitted pipeline on the hold-out test set, in the same order
# as the labels in method_name.
score_acc = [accuracy_score(y_test, logreg_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, tree_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, knn_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, rf_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, svc_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, ada_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, grad_pipe_smotetom.predict(X_test)),
             accuracy_score(y_test, xgb_pipe_smotetom.predict(X_test))]
method_name = ['Logistic Regression SMOTETomek', 'Decision Tree Classifier SMOTETomek', 'KNN Classifier SMOTETomek', 'Random Forest Classifier SMOTETomek', 'LinearSVC SMOTETomek', 'AdaBoost Classifier SMOTETomek', 'Gradient Boosting Classifier SMOTETomek', 'XGB Classifier SMOTETomek']

acc_smotetom_summary = pd.DataFrame({'method': method_name, 'accuracy score': score_acc})
acc_smotetom_summary

* From the SMOTETomek method, 3 models share the highest accuracy score: Random Forest, LinearSVC, and AdaBoost Classifier.

## Summary

* All tree-based models seem to perform pretty well with imbalanced datasets. Since they work by coming up with conditions/rules at each stage of splitting, they end up taking both classes into consideration.
* **From all methods, the highest accuracy score is AdaBoost Classifier with 0.891892. It means the model correctly predicts whether a customer will keep buying at Starbucks for about 8 to 9 out of 10 customers in the test set.**