# Business problem :

**To recommend a particular offer to customer A after finding the offer completed by the customer B most similar to customer A**

We will use the **memory-based approach** to **collaborative filtering** to solve this problem; specifically, we will apply **user-based filtering**.

In [None]:
import numpy as np
import numpy.ma as ma
import pandas as pd
from scipy import spatial
# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read 'starbucks.csv' (path relative to the working directory) into the main
# customer/offer DataFrame that the rest of the notebook derives its features from.
starbucks_data = pd.read_csv('starbucks.csv')

In [None]:
# Read the offer portfolio (reward, channels, difficulty, duration, offer_type, id)
# from 'portfolio.xlsx' in the working directory.
starbucks_offers = pd.read_excel('portfolio.xlsx')

In [None]:
starbucks_offers

Unnamed: 0,reward,channels__001,channels__002,channels__003,channels__004,difficulty,duration,offer_type,id
0,10,email,mobile,social,,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,web,email,mobile,social,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,web,email,mobile,,0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,web,email,mobile,,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,web,email,,,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,web,email,mobile,social,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,web,email,mobile,social,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,email,mobile,social,,0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,web,email,mobile,social,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,web,email,mobile,,10,7,discount,2906b810c7d4411798c6938adc9daaa5


In [None]:
# Replace the numeric 'duration' / 'difficulty' columns with readable, unit-suffixed
# text columns. The new columns are appended at the end of the frame and the raw
# numeric columns are removed.
starbucks_offers = starbucks_offers.assign(
    offer_validity=starbucks_offers['duration'].astype(str) + ' days',
    minimum_amount_req_for_offer_completion=starbucks_offers['difficulty'].astype(str) + ' USD',
).drop(columns=['difficulty', 'duration'])

In [None]:
starbucks_offers

Unnamed: 0,reward,channels__001,channels__002,channels__003,channels__004,offer_type,id,offer_validity,minimum_amount_req_for_offer_completion
0,10,email,mobile,social,,bogo,ae264e3637204a6fb9bb56bc8210ddfd,7 days,10 USD
1,10,web,email,mobile,social,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,5 days,10 USD
2,0,web,email,mobile,,informational,3f207df678b143eea3cee63160fa8bed,4 days,0 USD
3,5,web,email,mobile,,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,7 days,5 USD
4,5,web,email,,,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,10 days,20 USD
5,3,web,email,mobile,social,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,7 days,7 USD
6,2,web,email,mobile,social,discount,fafdcd668e3743c1bb461111dcafc2a4,10 days,10 USD
7,0,email,mobile,social,,informational,5a8bc65990b245e5a138643cd4eb9837,3 days,0 USD
8,5,web,email,mobile,social,bogo,f19421c1d4aa40978ebb69ca19b0e20d,5 days,5 USD
9,2,web,email,mobile,,discount,2906b810c7d4411798c6938adc9daaa5,7 days,10 USD


In [None]:
starbucks_data.head()

Unnamed: 0.1,Unnamed: 0,customer_id,offer_id,offer_received,received_time,offer_viewed,viewed_time,time_viewed_received,offer_completed,completed_time,...,reward_each_time,difficulty,duration,email,mobile,social,web,bogo,discount,informational
0,0,ffff82501cea40309d5fdd7edcca4a07,fafdcd668e3743c1bb461111dcafc2a4,1,0.0,1.0,6.0,6.0,1.0,60.0,...,2.0,10.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1,1,fff8957ea8b240a6b5e634b6ee8eafcf,fafdcd668e3743c1bb461111dcafc2a4,1,408.0,1.0,432.0,24.0,0.0,0.0,...,2.0,10.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2,2,fff7576017104bcc8677a8d63322b5e1,fafdcd668e3743c1bb461111dcafc2a4,1,252.0,1.0,273.0,21.0,1.0,393.0,...,2.0,10.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,3,fff3ba4757bd42088c044ca26d73817a,fafdcd668e3743c1bb461111dcafc2a4,1,0.0,1.0,6.0,6.0,1.0,6.0,...,2.0,10.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
4,4,fff29fb549084123bd046dbc5ceb4faa,fafdcd668e3743c1bb461111dcafc2a4,1,168.0,1.0,168.0,0.0,1.0,168.0,...,2.0,10.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [None]:
starbucks_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147508 entries, 0 to 147507
Data columns (total 30 columns):
Unnamed: 0                 147508 non-null int64
customer_id                147508 non-null object
offer_id                   147508 non-null object
offer_received             147508 non-null int64
received_time              147508 non-null float64
offer_viewed               147508 non-null float64
viewed_time                147508 non-null float64
time_viewed_received       147508 non-null float64
offer_completed            147508 non-null float64
completed_time             147508 non-null float64
time_completed_received    147508 non-null float64
time_completed_viewed      147508 non-null float64
transaction                147508 non-null float64
transaction_time           147508 non-null float64
amount                     147508 non-null float64
total_reward               147508 non-null float64
age                        147508 non-null int64
income                     147

In [None]:
# Keep only the customer attributes and offer-outcome columns that the
# recommender features are built from.
user_offer_columns = [
    'customer_id', 'age', 'income', 'male', 'offer_id',
    'offer_completed', 'offer_viewed', 'membership_days', 'informational',
]
starbucks_user_offer = starbucks_data[user_offer_columns]

In [None]:
# Drop transactions not associated with an offer.
# 'offer_id' is an object (string) column, so a "no offer" marker read from the CSV
# may be the string '0' rather than the integer 0; comparing only against the
# integer would then keep every row. Both spellings are filtered out here.
offer_ids = starbucks_user_offer['offer_id']
has_offer = (offer_ids != 0) & (offer_ids.astype(str) != '0')
# .copy() yields an independent frame so the later .loc assignments write to this
# frame unambiguously instead of to a slice of starbucks_data.
starbucks_user_offer = starbucks_user_offer[has_offer].copy()

In [None]:
# Assumption: an informational offer has no completion event of its own, so a
# viewed informational offer is treated as a completed one.
informational = starbucks_user_offer.loc[starbucks_user_offer['informational'] == 1]

# Row labels of the informational offers that were actually viewed
was_viewed = (informational['offer_viewed'] == 1).values
index = informational.index[was_viewed].to_list()
starbucks_user_offer.loc[index, 'offer_completed'] = 1

In [None]:
# Cast the completion flag and the income to integers in a single pass
starbucks_user_offer = starbucks_user_offer.astype({'offer_completed': int, 'income': int})

In [None]:
# 'informational' and 'offer_viewed' were only needed to derive 'offer_completed'
starbucks_user_offer = starbucks_user_offer.drop(columns=['informational', 'offer_viewed'])

In [None]:
starbucks_user_offer.head()

Unnamed: 0,customer_id,age,income,male,offer_id,offer_completed,membership_days
0,ffff82501cea40309d5fdd7edcca4a07,45,62000,0,fafdcd668e3743c1bb461111dcafc2a4,1,608
1,fff8957ea8b240a6b5e634b6ee8eafcf,71,56000,1,fafdcd668e3743c1bb461111dcafc2a4,0,158
2,fff7576017104bcc8677a8d63322b5e1,71,73000,1,fafdcd668e3743c1bb461111dcafc2a4,1,268
3,fff3ba4757bd42088c044ca26d73817a,69,83000,0,fafdcd668e3743c1bb461111dcafc2a4,1,1040
4,fff29fb549084123bd046dbc5ceb4faa,59,93000,0,fafdcd668e3743c1bb461111dcafc2a4,1,329


In [None]:
def _range_flag(values, low, high=None):
    """Return an int64 0/1 Series: 1 where low <= value (and value < high when high is given)."""
    inside = values >= low
    if high is not None:
        inside = inside & (values < high)
    return inside.astype('int64')


# creating age groups
# The bands are mutually exclusive, so no age sets more than one flag. Each band owns
# its upper decade boundary (20 -> age_15to20, 30 -> age_20to30, ..., 70 -> age_60to70),
# which matches the if/elif order used when a new customer's age is entered by hand.
# Previously the boundary ages (20, 30, 40, 50, 60, 70) were flagged in two bands at once.
age = starbucks_user_offer['age']
starbucks_user_offer['age_15to20'] = _range_flag(age, 15, 21)
starbucks_user_offer['age_20to30'] = _range_flag(age, 21, 31)
starbucks_user_offer['age_30to40'] = _range_flag(age, 31, 41)
starbucks_user_offer['age_40to50'] = _range_flag(age, 41, 51)
starbucks_user_offer['age_50to60'] = _range_flag(age, 51, 61)
starbucks_user_offer['age_60to70'] = _range_flag(age, 61, 71)
starbucks_user_offer['age_morethan70'] = _range_flag(age, 71)
#dropping age
starbucks_user_offer = starbucks_user_offer.drop(columns='age')

#creating income groups
income = starbucks_user_offer['income']
starbucks_user_offer['low_income'] = _range_flag(income, 30000, 60000)
starbucks_user_offer['mid_income'] = _range_flag(income, 60000, 90000)
# the top band includes its upper limit (120000)
starbucks_user_offer['high_income'] = ((income >= 90000) & (income <= 120000)).astype('int64')
#dropping income
starbucks_user_offer = starbucks_user_offer.drop(columns='income')

#creating membership days groups
membership_days = starbucks_user_offer['membership_days']
starbucks_user_offer['mbr_lessthan_1_year'] = _range_flag(membership_days, 0, 365)
starbucks_user_offer['mbr_1to2_years'] = _range_flag(membership_days, 365, 730)
starbucks_user_offer['mbr_2to3_years'] = _range_flag(membership_days, 730, 1095)
starbucks_user_offer['mbr_3to4_years'] = _range_flag(membership_days, 1095, 1460)
# "more than 4 years" is open-ended: the former 1825-day cap left members of five or
# more years without any membership flag.
starbucks_user_offer['mbr_morethan_4_years'] = _range_flag(membership_days, 1460)

#dropping membership_days
starbucks_user_offer = starbucks_user_offer.drop(columns='membership_days')

## Manually create additional (synthetic) customer features to make the recommender system more practical in a real-life scenario

In [None]:
#Marital status (single/married)
# The value is drawn once per customer, not once per offer row, so a customer keeps a
# single marital status across all of their rows. Per-row draws made almost every
# customer "married" once the rows were aggregated per customer.
# A fixed seed keeps the synthetic data identical across Restart & Run All.
customer_ids = starbucks_user_offer['customer_id'].unique()
married_by_customer = pd.Series(
    np.random.RandomState(0).choice(2, len(customer_ids)), index=customer_ids
)
starbucks_user_offer['is_Married'] = starbucks_user_offer['customer_id'].map(married_by_customer)

In [None]:
#Education(High school degree or less/Bachelor’s degree/Master degree or above)
# One seeded draw per customer (not per offer row), so each customer ends up with
# exactly one education level after the per-customer aggregation.
customer_ids = starbucks_user_offer['customer_id'].unique()
education_by_customer = pd.Series(
    np.random.RandomState(1).choice(3, len(customer_ids)), index=customer_ids
)
starbucks_user_offer['education_level'] = starbucks_user_offer['customer_id'].map(education_by_customer)
# One-hot encode (is_0..is_2), then give the dummy columns readable names
starbucks_user_offer = pd.get_dummies(starbucks_user_offer,columns = ['education_level'],prefix = ['is'])
starbucks_user_offer = starbucks_user_offer.rename(columns={"is_0": "High_school_or_less", "is_1": "is_Bachelor","is_2": "is_Master"})

In [None]:
#Employment (Full-time/Part-time/Unemployed/Student/Retired)
# One seeded draw per customer (not per offer row), so each customer ends up with
# exactly one employment status after the per-customer aggregation.
customer_ids = starbucks_user_offer['customer_id'].unique()
employment_by_customer = pd.Series(
    np.random.RandomState(2).choice(5, len(customer_ids)), index=customer_ids
)
starbucks_user_offer['employment'] = starbucks_user_offer['customer_id'].map(employment_by_customer)
# One-hot encode (is_0..is_4), then give the dummy columns readable names
starbucks_user_offer = pd.get_dummies(starbucks_user_offer,columns = ['employment'],prefix = ['is'])
starbucks_user_offer = starbucks_user_offer.rename(columns={"is_0": "is_Full-time", "is_1": "is_Part-time","is_2": "is_Unemployed","is_3": "is_Student","is_4": "is_Retired"})


In [None]:
#Race(American Indian or Alaska Native/Asian/Black or African_American/White/Native Hawaiian or other Pacific Island)
# One seeded draw per customer (not per offer row), so each customer ends up with
# exactly one category after the per-customer aggregation. The seed differs from the
# other synthetic features so the columns are not copies of each other.
customer_ids = starbucks_user_offer['customer_id'].unique()
race_by_customer = pd.Series(
    np.random.RandomState(3).choice(5, len(customer_ids)), index=customer_ids
)
starbucks_user_offer['race'] = starbucks_user_offer['customer_id'].map(race_by_customer)
# One-hot encode (is_0..is_4), then give the dummy columns readable names
starbucks_user_offer = pd.get_dummies(starbucks_user_offer,columns = ['race'],prefix = ['is'])
starbucks_user_offer = starbucks_user_offer.rename(columns={"is_0": "is_American_indian", "is_1": "is_Asian","is_2": "is_Black","is_3": "is_White","is_4": "is_native_hawaiian"})


In [None]:
starbucks_user_offer.head()

Unnamed: 0,customer_id,male,offer_id,offer_completed,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
0,ffff82501cea40309d5fdd7edcca4a07,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
1,fff8957ea8b240a6b5e634b6ee8eafcf,1,fafdcd668e3743c1bb461111dcafc2a4,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,fff7576017104bcc8677a8d63322b5e1,1,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,fff3ba4757bd42088c044ca26d73817a,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,fff29fb549084123bd046dbc5ceb4faa,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
starbucks_user_offer.columns

Index(['customer_id', 'male', 'offer_id', 'offer_completed', 'age_15to20',
       'age_20to30', 'age_30to40', 'age_40to50', 'age_50to60', 'age_60to70',
       'age_morethan70', 'low_income', 'mid_income', 'high_income',
       'mbr_lessthan_1_year', 'mbr_1to2_years', 'mbr_2to3_years',
       'mbr_3to4_years', 'mbr_morethan_4_years', 'is_Married',
       'High_school_or_less', 'is_Bachelor', 'is_Master', 'is_Full-time',
       'is_Part-time', 'is_Unemployed', 'is_Student', 'is_Retired',
       'is_American_indian', 'is_Asian', 'is_Black', 'is_White',
       'is_native_hawaiian'],
      dtype='object')

In [None]:
# Rebuild a clean 0..n-1 row index (row filtering can leave gaps in the labels)
starbucks_user_offer = starbucks_user_offer.reset_index(drop=True)
starbucks_user_offer.head()

Unnamed: 0,customer_id,male,offer_id,offer_completed,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
0,ffff82501cea40309d5fdd7edcca4a07,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
1,fff8957ea8b240a6b5e634b6ee8eafcf,1,fafdcd668e3743c1bb461111dcafc2a4,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,fff7576017104bcc8677a8d63322b5e1,1,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,fff3ba4757bd42088c044ca26d73817a,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,fff29fb549084123bd046dbc5ceb4faa,0,fafdcd668e3743c1bb461111dcafc2a4,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
# Customer-feature frame: every column except the offer id
starbucks_user = starbucks_user_offer.drop(columns=['offer_id'], errors='ignore')
# Customer-to-offer lookup: just the two id columns (must run after the line above,
# because it rebinds starbucks_user_offer)
starbucks_user_offer = starbucks_user_offer.loc[:, ['customer_id', 'offer_id']]

In [None]:
starbucks_user.head()

Unnamed: 0,customer_id,male,offer_completed,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
0,ffff82501cea40309d5fdd7edcca4a07,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,fff8957ea8b240a6b5e634b6ee8eafcf,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,fff7576017104bcc8677a8d63322b5e1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
3,fff3ba4757bd42088c044ca26d73817a,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
4,fff29fb549084123bd046dbc5ceb4faa,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
starbucks_user_offer.head()

Unnamed: 0,customer_id,offer_id
0,ffff82501cea40309d5fdd7edcca4a07,fafdcd668e3743c1bb461111dcafc2a4
1,fff8957ea8b240a6b5e634b6ee8eafcf,fafdcd668e3743c1bb461111dcafc2a4
2,fff7576017104bcc8677a8d63322b5e1,fafdcd668e3743c1bb461111dcafc2a4
3,fff3ba4757bd42088c044ca26d73817a,fafdcd668e3743c1bb461111dcafc2a4
4,fff29fb549084123bd046dbc5ceb4faa,fafdcd668e3743c1bb461111dcafc2a4


In [None]:
# Collapse the per-offer rows into one row per customer by summing each column;
# 'customer_id' becomes the index of the result.
customer_groups = starbucks_user.groupby('customer_id', as_index=True)
starbucks_user_divide = customer_groups.sum()

In [None]:
starbucks_user_divide.head()

Unnamed: 0_level_0,male,offer_completed,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,10,5,0,0,10,0,0,0,0,0,...,0,3,2,2,3,1,1,1,4,3
0020c2b971eb4e9188eac86d93036a77,0,3,0,0,0,0,10,0,0,0,...,0,1,4,4,1,3,1,3,1,2
0020ccbbb6d84e358d3414a3ff76cffd,0,4,0,13,0,0,0,0,0,0,...,2,3,2,3,3,2,1,1,6,3
003d66b6608740288d6cc97a6903f4f0,0,4,0,19,0,0,0,0,0,0,...,2,3,5,4,5,4,4,3,4,4
00426fe3ffde4c6b9cb9ad6d077a13ea,0,2,20,0,0,0,0,0,0,0,...,2,7,5,3,3,0,3,4,9,4


## Divide customers into Existing customers and New customers
### Definition:
* Existing customers: users who have already completed at least one offer
* New customers: users who are either already members of the Starbucks customer program but have not completed any offer, or entirely new customers entering the membership program


### Motivation for dividing customers:
* Unlike new customers, existing customers possess unique features, such as membership_year and review_toward_Starbucks after completing offers.

In [None]:
# restore dummy variables
# After the per-customer sum every indicator column holds a count; collapse each
# non-zero count back to 1. 'offer_completed' keeps its count (it is used later to
# separate new from existing customers) and 'customer_id' is the key.
if 'customer_id' not in starbucks_user_divide.columns:
    # Guard keeps the cell re-runnable: a second reset_index would otherwise insert
    # a stray 'index' column.
    starbucks_user_divide = starbucks_user_divide.reset_index()

indicator_cols = [
    col for col in starbucks_user_divide.columns
    if col not in ('offer_completed', 'customer_id')
]
# One vectorised comparison replaces the cell-by-cell chained assignment
# (df[col][row] = 1), which is slow and not guaranteed to write to the frame.
starbucks_user_divide[indicator_cols] = (starbucks_user_divide[indicator_cols] != 0).astype('int64')

starbucks_user_divide.head()

Unnamed: 0,customer_id,male,offer_completed,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
0,0009655768c64bdeb2e877511632db8f,1,5,0,0,1,0,0,0,0,...,0,1,1,1,1,1,1,1,1,1
1,0020c2b971eb4e9188eac86d93036a77,0,3,0,0,0,0,1,0,0,...,0,1,1,1,1,1,1,1,1,1
2,0020ccbbb6d84e358d3414a3ff76cffd,0,4,0,1,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
3,003d66b6608740288d6cc97a6903f4f0,0,4,0,1,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,00426fe3ffde4c6b9cb9ad6d077a13ea,0,2,1,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,1


In [None]:
# Separate new users from existing users: anyone with a non-zero
# 'offer_completed' total counts as an existing user.
completed_any_offer = starbucks_user_divide['offer_completed'] != 0
new_users = starbucks_user_divide[~completed_any_offer]
existed_users = starbucks_user_divide[completed_any_offer]

In [None]:
# existed_users / new_users already contain exactly the wanted rows. Take independent
# copies rather than re-selecting from starbucks_user_divide with .iloc and index
# labels, which only returns the right rows while labels happen to equal positions.
# The copies also make the later in-place edits on these frames unambiguous.
exist_df = existed_users.copy()
new_df = new_users.copy()

In [None]:
# Key existing customers by id and drop the completion count, which has served its
# purpose of separating the two customer groups.
exist_df = exist_df.set_index('customer_id').drop(columns='offer_completed')
exist_df.head()

Unnamed: 0_level_0,male,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,mid_income,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,1,0,0,1,0,0,0,0,0,1,...,0,1,1,1,1,1,1,1,1,1
0020c2b971eb4e9188eac86d93036a77,0,0,0,0,0,1,0,0,0,0,...,0,1,1,1,1,1,1,1,1,1
0020ccbbb6d84e358d3414a3ff76cffd,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
003d66b6608740288d6cc97a6903f4f0,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
00426fe3ffde4c6b9cb9ad6d077a13ea,0,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,0,1,1,1,1


In [None]:
# Key new customers by id and drop the completion count (always 0 for this group)
new_df = new_df.set_index('customer_id').drop(columns='offer_completed')
new_df.head()

Unnamed: 0_level_0,male,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,mid_income,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00857b24b13f4fe0ad17b605f00357f5,1,0,0,0,0,0,0,1,1,0,...,1,1,0,1,1,1,1,0,1,1
00aee28bbb3848dd8a31f0c91dc267dd,1,0,1,0,0,0,0,0,1,0,...,1,1,1,0,1,0,1,0,1,1
00b901d68f8f4fd68075184cd0f772d2,0,0,0,0,0,0,1,0,0,1,...,1,0,1,1,1,1,0,1,1,1
00c32a104f0c4065b5b552895fb22e34,1,0,0,1,0,0,0,0,1,0,...,1,0,1,1,1,0,1,1,1,1
00ceaf16a40341e6996d543d04daa2c2,1,0,0,1,0,0,0,0,1,0,...,1,1,1,1,1,0,1,1,1,1


## For Existing Customers:

* A customer's review after completing an offer can be a determinant of subsequent coffee consumption, so the review is turned into a customer feature.
* Since customers' reviews of Starbucks are currently unavailable, the plan of execution is:
          ** Retrieve reviews of Starbucks from Yelp
          ** Apply sentiment analysis to each review and score it within the range [-1, 1]
          ** Randomly assign the scores to the existing customers


Sentiment Analysis

In [None]:
#!pip install textblob
from textblob import TextBlob 

In [None]:
# Read the Yelp review texts for Starbucks from a CSV in the working directory;
# the text is in the 'reviews' column.
review = pd.read_csv('review_starbucks_from_yelp.csv')
review.head()

Unnamed: 0.1,Unnamed: 0,reviews
0,0,Bad 1st experience! Charged for something I di...
1,1,I'm not sure why this place doesn't have excel...
2,2,WHY SO EXPENSIVE?!?! \n\nI know the reason why...
3,3,really? TWO AND HALF inches room from the top ...
4,4,I stopped in here tonight for some buzz to kee...


In [None]:
# Example: sentiment polarity of the first review (the value displayed below)
message_text = review['reviews'][0]
analysis = TextBlob(message_text)
first_review_polarity = analysis.sentiment.polarity
first_review_polarity

-0.17499999999999996

In [None]:
# Score every review with TextBlob's sentiment polarity and store it beside the text
review['scores'] = review['reviews'].map(lambda text: TextBlob(text).sentiment.polarity)
# 'Unnamed: 0' is a leftover row-number column from the CSV
review = review.drop(columns='Unnamed: 0')
review.head()

Unnamed: 0,reviews,scores
0,Bad 1st experience! Charged for something I di...,-0.175
1,I'm not sure why this place doesn't have excel...,0.389286
2,WHY SO EXPENSIVE?!?! \n\nI know the reason why...,0.172786
3,really? TWO AND HALF inches room from the top ...,0.196065
4,I stopped in here tonight for some buzz to kee...,0.236111


In [None]:
# Give every existing customer the sentiment score of one randomly drawn review
# (sampling with replacement). The fixed seed makes the assignment identical on every
# Restart & Run All, which the unseeded global generator did not.
random_arr = np.random.RandomState(42).choice(len(review), len(exist_df))
# Positional lookup in one vectorised step instead of a per-element Python loop
exist_df['review_score'] = review['scores'].values[random_arr]
exist_df.head()

Unnamed: 0_level_0,male,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,mid_income,...,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian,review_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,1,0,0,1,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,-0.5
0020c2b971eb4e9188eac86d93036a77,0,0,0,0,0,1,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0.25
0020ccbbb6d84e358d3414a3ff76cffd,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,0.1
003d66b6608740288d6cc97a6903f4f0,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,0.194762
00426fe3ffde4c6b9cb9ad6d077a13ea,0,1,0,0,0,0,0,0,0,1,...,1,1,1,1,0,1,1,1,1,-0.406901


In [None]:
#creating groups of different attitude toward Starbucks
# The five bands cover [-1, 1] without gaps or overlaps:
#   strong_negative [-1, -0.55], weak_negative (-0.55, -0.1], neutral (-0.1, 0.1],
#   weak_positive (0.1, 0.55], strong_positive (0.55, 1].
# The strong_negative band now includes -0.55 itself; before, a score of exactly
# -0.55 matched no band and left the customer with all five flags at 0.
review_score = exist_df['review_score']
exist_df['strong_positive'] = ((review_score > 0.55) & (review_score <= 1)).astype('int64')
exist_df['weak_positive'] = ((review_score > 0.1) & (review_score <= 0.55)).astype('int64')
exist_df['neutral'] = ((review_score > -0.1) & (review_score <= 0.1)).astype('int64')
exist_df['weak_negative'] = ((review_score > -0.55) & (review_score <= -0.1)).astype('int64')
exist_df['strong_negative'] = ((review_score >= -1) & (review_score <= -0.55)).astype('int64')

#drop review_score
exist_df = exist_df.drop(columns='review_score')

In [None]:
exist_df.head()

Unnamed: 0_level_0,male,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,mid_income,...,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian,strong_positive,weak_positive,neutral,weak_negative,strong_negative
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,1,0,0,1,0,0,0,0,0,1,...,1,1,1,1,1,0,0,0,1,0
0020c2b971eb4e9188eac86d93036a77,0,0,0,0,0,1,0,0,0,0,...,1,1,1,1,1,0,1,0,0,0
0020ccbbb6d84e358d3414a3ff76cffd,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,0,0,1,0,0
003d66b6608740288d6cc97a6903f4f0,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,0,1,0,0,0
00426fe3ffde4c6b9cb9ad6d077a13ea,0,1,0,0,0,0,0,0,0,1,...,0,1,1,1,1,0,0,0,1,0


In [None]:
# Existing-customer feature matrix: one row per customer in exist_df, in index order.
# A copy of the frame is kept so row positions can be mapped back to customer ids.
starbucks_exist_user_matrix = exist_df.to_numpy()
starbucks_existed_user = exist_df.copy()
starbucks_exist_user_matrix

array([[1, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [None]:
exist_df.columns

Index(['male', 'age_15to20', 'age_20to30', 'age_30to40', 'age_40to50',
       'age_50to60', 'age_60to70', 'age_morethan70', 'low_income',
       'mid_income', 'high_income', 'mbr_lessthan_1_year', 'mbr_1to2_years',
       'mbr_2to3_years', 'mbr_3to4_years', 'mbr_morethan_4_years',
       'is_Married', 'High_school_or_less', 'is_Bachelor', 'is_Master',
       'is_Full-time', 'is_Part-time', 'is_Unemployed', 'is_Student',
       'is_Retired', 'is_American_indian', 'is_Asian', 'is_Black', 'is_White',
       'is_native_hawaiian', 'strong_positive', 'weak_positive', 'neutral',
       'weak_negative', 'strong_negative'],
      dtype='object')

## For New Customers:

In [None]:
# Build the comparison frame from ALL customers, leaving out the membership-length
# flags and 'offer_completed', since new customers have no such features to compare.
membership_flags = [
    'mbr_lessthan_1_year', 'mbr_1to2_years', 'mbr_2to3_years',
    'mbr_3to4_years', 'mbr_morethan_4_years',
]
whole_user_df = (
    starbucks_user_divide
    .drop(columns=membership_flags, errors='ignore')
    .drop(columns='offer_completed')
    .set_index('customer_id')
)
whole_user_df.head()

Unnamed: 0_level_0,male,age_15to20,age_20to30,age_30to40,age_40to50,age_50to60,age_60to70,age_morethan70,low_income,mid_income,...,is_Full-time,is_Part-time,is_Unemployed,is_Student,is_Retired,is_American_indian,is_Asian,is_Black,is_White,is_native_hawaiian
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,1,0,0,1,0,0,0,0,0,1,...,0,1,1,1,1,1,1,1,1,1
0020c2b971eb4e9188eac86d93036a77,0,0,0,0,0,1,0,0,0,0,...,0,1,1,1,1,1,1,1,1,1
0020ccbbb6d84e358d3414a3ff76cffd,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
003d66b6608740288d6cc97a6903f4f0,0,0,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
00426fe3ffde4c6b9cb9ad6d077a13ea,0,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,0,1,1,1,1


In [None]:
whole_user_df.columns

Index(['male', 'age_15to20', 'age_20to30', 'age_30to40', 'age_40to50',
       'age_50to60', 'age_60to70', 'age_morethan70', 'low_income',
       'mid_income', 'high_income', 'is_Married', 'High_school_or_less',
       'is_Bachelor', 'is_Master', 'is_Full-time', 'is_Part-time',
       'is_Unemployed', 'is_Student', 'is_Retired', 'is_American_indian',
       'is_Asian', 'is_Black', 'is_White', 'is_native_hawaiian'],
      dtype='object')

In [None]:
# Feature matrix used when matching new customers: one row per customer in
# whole_user_df (i.e. every customer), in index order.
starbucks_new_user_matrix = whole_user_df.to_numpy()
starbucks_new_user_matrix

array([[1, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 1, ..., 1, 1, 1],
       ...,
       [1, 0, 0, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]], dtype=int64)

## Entering customer information through user input
* If you enter 'Y' for "Does This Customer Has a Customer ID (Y or N)", please enter a customer ID present in starbucks_existed_user dataframe
* If you enter 'N', please enter the details as asked

In [None]:
# Define the Cosine Similarity function
def cosine_similarity(u, v, weights=None):
    """Return the (optionally weighted) cosine similarity of two 1-D vectors.

    Parameters
    ----------
    u, v : array-like
        Feature vectors of equal length.
    weights : array-like or None, optional
        Per-feature weights passed to ``scipy.spatial.distance.cosine``.
        ``None`` (the default) weights every feature equally.

    Returns
    -------
    float
        ``1 - cosine distance``. If either vector has zero (weighted) norm the
        similarity is undefined; 0.0 is returned instead of NaN so that a later
        arg-max over similarities is not hijacked by a NaN entry.
    """
    u_arr = np.asarray(u, dtype=float)
    v_arr = np.asarray(v, dtype=float)
    w_arr = 1.0 if weights is None else np.asarray(weights, dtype=float)

    # A zero-norm vector would make the cosine a 0/0 division.
    if np.sum(w_arr * u_arr * u_arr) == 0 or np.sum(w_arr * v_arr * v_arr) == 0:
        return 0.0

    return 1 - spatial.distance.cosine(u_arr, v_arr, weights)

# Customer ids that have completed at least one offer. The position of an id in
# this list is used below as its row number in starbucks_exist_user_matrix.
user_id_lst = starbucks_existed_user.index.to_list()

# Customer ids that exist but have not completed any offer yet. The position of
# an id in this list is used below as its row number in starbucks_new_user_matrix.
# NOTE(review): starbucks_new_user_matrix is built from whole_user_df, not from
# new_df - confirm that both frames list customers in the same order.
new_id_lst = new_df.index.to_list()

# Positions inside the feature vector y, following whole_user_df.columns:
#   0 male | 1-7 age bands | 8-10 income bands | 11 is_Married |
#   12-14 education | 15-19 employment | 20-24 race
MALE_COLUMN = 0
FIRST_AGE_COLUMN = 1
# Inclusive upper bound of every age band except the open-ended last one:
# 15-20, 21-30, 31-40, 41-50, 51-60, 61-70, 71 and older.
AGE_BAND_UPPER_BOUNDS = (20, 30, 40, 50, 60, 70)
LOW_INCOME_COLUMN = 8
MID_INCOME_COLUMN = 9
HIGH_INCOME_COLUMN = 10
MARRIED_COLUMN = 11
# Menu choice -> feature column for the one-hot encoded questions.
EDUCATION_COLUMNS = {1: 12, 2: 13, 3: 14}
EMPLOYMENT_COLUMNS = {1: 15, 2: 16, 3: 17, 4: 18, 5: 19}
RACE_COLUMNS = {1: 20, 2: 21, 3: 22, 4: 23, 5: 24}


def _ask_int(prompt, is_valid, label):
    """Prompt until the user types an integer accepted by ``is_valid``.

    Parameters
    ----------
    prompt : str
        Text shown to the user.
    is_valid : callable
        Takes the parsed integer and returns True when it is acceptable.
    label : str
        Name of the field, inserted into the "Invalid input" message.

    Returns
    -------
    int
        The first accepted answer.
    """
    while True:
        try:
            answer = int(input(prompt))
        except ValueError:
            # Not an integer at all: report it the same way as an out-of-range value.
            print('Invalid input for {}, please try again !'.format(label))
            continue
        if is_valid(answer):
            return answer
        print('Invalid input for {}, please try again !'.format(label))


# Ask whether the customer has a customer id. Like every other question in this
# cell, keep asking until the answer is usable, so that x and y are always set.
while True:
    has_id_or_not = input('Does This Customer Has a Customer ID (Y or N) : ').strip().upper()
    if has_id_or_not in ('Y', 'N'):
        break
    print('Invalid input for has_id_or_not, please try again !')

if has_id_or_not == 'Y':
    # Ask for the id until it is found in one of the two id lists.
    while True:
        cust_id = input('Enter Existing Customer ID : ').strip()
        if cust_id in user_id_lst or cust_id in new_id_lst:
            break
        print('Invalid input for Customer ID, please try again !')

    # y is the customer's own row; x is every other row of the same matrix, so
    # the customer can never be matched with themselves.
    if cust_id in user_id_lst:
        indices = user_id_lst.index(cust_id)
        x = np.delete(starbucks_exist_user_matrix, indices, axis=0)
        y = starbucks_exist_user_matrix[indices]
    else:
        indices = new_id_lst.index(cust_id)
        x = np.delete(starbucks_new_user_matrix, indices, axis=0)
        y = starbucks_new_user_matrix[indices]

else:
    # Brand-new customer: compare against the full matrix and build the feature
    # vector from the answers below, starting from all zeros.
    x = starbucks_new_user_matrix
    y = np.zeros(starbucks_new_user_matrix.shape[1], dtype=np.int32)

    # ask customer gender
    while True:
        cust_gender = input('Enter Customer Gender M or F : ').strip().upper()
        if cust_gender in ('M', 'F'):
            break
        print('Invalid input for gender, please try again !')
    y[MALE_COLUMN] = 1 if cust_gender == 'M' else 0

    # ask customer age
    cust_age = _ask_int(
        'Enter Customer Age (should be at least 15 years old)  : ',
        lambda age: age >= 15,
        'age',
    )
    # The number of upper bounds the age exceeds is the offset of its band.
    y[FIRST_AGE_COLUMN + sum(cust_age > bound for bound in AGE_BAND_UPPER_BOUNDS)] = 1

    # ask customer annual income
    cust_income = _ask_int(
        'Enter Customer Annual Income (in $) : ',
        lambda income: income > 0,
        'income',
    )
    if cust_income < 60000:
        y[LOW_INCOME_COLUMN] = 1
    elif cust_income < 90000:
        y[MID_INCOME_COLUMN] = 1
    else:
        y[HIGH_INCOME_COLUMN] = 1

    # ask customer marital status
    cust_mar = _ask_int(
        'Enter Customer Marital Status (Single - 1 / Married - 2) : ',
        lambda choice: choice in (1, 2),
        'marital status',
    )
    # The column is is_Married, so only the "Married" answer (2) sets it.
    y[MARRIED_COLUMN] = 1 if cust_mar == 2 else 0

    # ask customer education background
    cust_edu = _ask_int(
        'Enter Customer Education Background (High School or Less - 1/Bachelor - 2/Master or Above - 3) : ',
        lambda choice: choice in EDUCATION_COLUMNS,
        'education background',
    )
    y[EDUCATION_COLUMNS[cust_edu]] = 1

    # ask customer employment
    cust_emp = _ask_int(
        'Enter Customer Employment (Full_time - 1/Part_time - 2/Unemployment - 3/Student - 4/Retired - 5) : ',
        lambda choice: choice in EMPLOYMENT_COLUMNS,
        'employment',
    )
    y[EMPLOYMENT_COLUMNS[cust_emp]] = 1

    # ask customer race
    cust_race = _ask_int(
        'Enter Customer Race (American Indian or Alaska Native - 1/Asian - 2/Black - 3/White - 4/Native Hawaiin or other Pacific Island - 5) : ',
        lambda choice: choice in RACE_COLUMNS,
        'race',
    )
    y[RACE_COLUMNS[cust_race]] = 1

    
# Compute similarity with weights, find maximum value.
# Each weight list needs one entry per column of the matrix it is applied to:
# weights_user_exists (35 entries) is used with rows of starbucks_exist_user_matrix,
# weights_user_not_exists (25 entries) with rows of starbucks_new_user_matrix.
weights_user_exists = [2,3,3,3,3,3,3,3,5,5,5,2,2,2,2,2,2,3,3,3,5,5,5,5,5,1,1,1,1,1,4,4,4,4,4]
weights_user_not_exists = [2,3,3,3,3,3,3,3,5,5,5,2,3,3,3,5,5,5,5,5,1,1,1,1,1]

# Work out which input path produced x and y.
id_in_existing = has_id_or_not == 'Y' and cust_id in user_id_lst
id_in_new = has_id_or_not == 'Y' and not id_in_existing and cust_id in new_id_lst
if not (id_in_existing or id_in_new or has_id_or_not == 'N'):
    # Without this guard the cell would fail with a NameError or, worse, quietly
    # reuse x and y left over from an earlier run of the cell.
    raise ValueError('No valid customer was entered, please re-run the cell and try again !')

# Only an id found in user_id_lst is compared against starbucks_exist_user_matrix;
# every other path uses starbucks_new_user_matrix, whose rows come from whole_user_df.
if id_in_existing:
    weights = weights_user_exists
    matched_index = starbucks_existed_user.index
else:
    weights = weights_user_not_exists
    matched_index = whole_user_df.index

sims = np.apply_along_axis(cosine_similarity, 1, x, y, weights)

# Find the best matching customer: first row with the highest non-NaN similarity.
usr_idx = np.nanargmax(sims)
mx = sims[usr_idx]

# For a customer with an id, x is the matrix with row `indices` deleted, so every
# row at or after that position sits one place earlier in x than in the index.
if has_id_or_not == 'Y' and usr_idx >= indices:
    matched_row = usr_idx + 1
else:
    matched_row = usr_idx
matched_customer_id = matched_index[matched_row]

# info of new customer and matched customer.
print(y[:])
print(x[usr_idx, :])
print('\nCosine Similarity(y, x[{0:d}]) = {1:4.3f}'.format(usr_idx, mx), end='\n\n')

if id_in_existing:
    print('The Existing Customer is most similar to customer id : {} \n'.format(matched_customer_id))
else:
    print('The New customer is most similar to customer id : {} \n'.format(matched_customer_id))

# storing the offer IDs of the completed offers of the matched customer in a list
starbucks_user_completed_offers = list(
    starbucks_user_offer.loc[starbucks_user_offer['customer_id'] == matched_customer_id, 'offer_id'].values
)

# Collect the description of each of those offers, in the order they were
# completed. DataFrame.append no longer exists in pandas 2.x, so the pieces are
# gathered first and concatenated once.
offer_columns = ['offer_type','offer_validity','minimum_amount_req_for_offer_completion']
known_offer_ids = starbucks_offers['id'].values
offer_frames = [
    starbucks_offers.loc[starbucks_offers['id'] == offer_id, offer_columns]
    for offer_id in starbucks_user_completed_offers
    if offer_id in known_offer_ids
]
if offer_frames:
    recommended_offers = pd.concat(offer_frames)
else:
    # The matched customer has no offer listed in starbucks_offers.
    recommended_offers = pd.DataFrame(columns=offer_columns)


print('The Recommended Offer(s) for the Customer ->  \n')
recommended_offers

Does This Customer Has a Customer ID (Y or N) : N
Enter Customer Gender M or F : M
Enter Customer Age (should be at least 15 years old)  : 45
Enter Customer Annual Income (in $) : 66000
Enter Customer Marital Status (Single - 1 / Married - 2) : 2
Enter Customer Education Background (High School or Less - 1/Bachelor - 2/Master or Above - 3) : 3
Enter Customer Employment (Full_time - 1/Part_time - 2/Unemployment - 3/Student - 4/Retired - 5) : 1
Enter Customer Race (American Indian or Alaska Native - 1/Asian - 2/Black - 3/White - 4/Native Hawaiin or other Pacific Island - 5) : 1
The New customer is most similar to customer id : aab351e60ea5439b8dc2876e6e1145c6 

The Recommended Offer(s) for the Customer ->  



Unnamed: 0,offer_type,offer_validity,minimum_amount_req_for_offer_completion
8,bogo,5 days,5 USD
7,informational,3 days,0 USD
2,informational,4 days,0 USD
