In [1]:
# Load libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from scipy.stats import pearsonr

In [2]:
# Load the raw CSV into a DataFrame and preview its first rows.
dataFile = "data/Shining32.csv"
data = pd.read_csv(filepath_or_buffer=dataFile)
data.head(n=5)

Unnamed: 0,buyerid,age,education,inc,gender,location,purchase,brandid,mint,white,fluoride,kids,size,discount,familypack,priceperoz,priceperpack,Chosen,SecondChoice
0,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,Yes,Yes,No,No,4.8,1,0,0.92,4.42,0,
1,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,Yes,No,Yes,No,3.9,0,1,0.96,3.74,1,
2,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,No,Yes,Yes,Yes,3.9,1,0,1.04,4.06,0,
3,1,56,College,79523.203,Female,Brooklyn,Online,Crest,Yes,Yes,No,No,4.8,1,0,0.92,4.42,0,
4,1,56,College,79523.203,Female,Brooklyn,Online,Crest,No,No,Yes,Yes,4.8,1,0,1.01,4.85,0,


In [3]:
# Processing
#
# Turns the raw frame into a modelling frame: numeric 0/1 flags, an integer
# product id per distinct (brand, mint, white, fluoride, kids) combination,
# and integer dummy columns for the categorical consumer/product fields.
#
# NOTE: this cell is not idempotent. It joins dummy columns onto `data`, so
# re-running it without re-loading `data` fails on overlapping column names.

# Convert yes/no to 1/0.
# The mapping is deliberately NOT called `dict`: that would shadow the
# builtin for every later cell in the kernel.
yesNoMap = {"Yes": 1, "No": 0}
flagColumns = ["mint", "white", "fluoride", "kids"]
# infer_objects() makes sure the replaced columns end up numeric even on
# pandas versions where replace() no longer downcasts object columns.
data = data.replace({col: yesNoMap for col in flagColumns}).infer_objects()

# Define product id
productColumns = ['brandid', 'mint', 'white', 'fluoride', 'kids']
productGroups = data.groupby(productColumns)
uniqueProducts = productGroups.size().reset_index().rename(columns={0: 'count'})
# ngroup() numbers the groups in the same (sorted-key) order as the rows of
# `uniqueProducts`, so row i of `uniqueProducts` has productId i + 1. This
# works for any number of products and does not depend on a boolean mask
# passed to positional `.iloc` or on `productId` being the last column.
data["productId"] = productGroups.ngroup() + 1

# Add dummies
# dtype=int keeps the indicators numeric (0/1); newer pandas would otherwise
# return bool columns, which formula-based models treat as categorical.
dummySpecs = [
    ("education", "ed"),
    ("location", "loc"),
    ("gender", "gen"),
    ("purchase", "purchase"),
    ("brandid", "brand"),
]
for sourceColumn, prefix in dummySpecs:
    data = data.join(pd.get_dummies(data[sourceColumn], prefix=prefix, dtype=int))

# Rename some columns (remove spaces so the names are usable in formulas)
data = data.rename({'ed_High School':'ed_HighSchool', 'ed_More than College':'ed_MoreCollege',
     'purchase_In Store':'purchase_InStore'}, axis='columns')

data.head(12)

Unnamed: 0,buyerid,age,education,inc,gender,location,purchase,brandid,mint,white,...,loc_Queens,gen_Female,gen_Male,gen_Other,purchase_InStore,purchase_Online,brand_Aquafresh,brand_Colgate,brand_Crest,brand_Sensodyne
0,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,1,1,...,0,1,0,0,0,1,0,1,0,0
1,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,1,0,...,0,1,0,0,0,1,0,1,0,0
2,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,0,1,...,0,1,0,0,0,1,0,1,0,0
3,1,56,College,79523.203,Female,Brooklyn,Online,Crest,1,1,...,0,1,0,0,0,1,0,0,1,0
4,1,56,College,79523.203,Female,Brooklyn,Online,Crest,0,0,...,0,1,0,0,0,1,0,0,1,0
5,1,56,College,79523.203,Female,Brooklyn,Online,Sensodyne,1,0,...,0,1,0,0,0,1,0,0,0,1
6,1,56,College,79523.203,Female,Brooklyn,Online,Sensodyne,0,1,...,0,1,0,0,0,1,0,0,0,1
7,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,1,0,...,0,1,0,0,0,1,1,0,0,0
8,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,0,1,...,0,1,0,0,0,1,1,0,0,0
9,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,0,1,...,0,1,0,0,0,1,1,0,0,0


In [4]:
# Integer market code for each location label; labels missing from the
# mapping come out of Series.map as NaN.
dictBorough = {
    "Brooklyn": 1,
    "Manhattan": 2,
    "Queens": 3,
    "Other": 4,
}
data["market"] = data.location.map(dictBorough)
data.head(n=20)

Unnamed: 0,buyerid,age,education,inc,gender,location,purchase,brandid,mint,white,...,gen_Female,gen_Male,gen_Other,purchase_InStore,purchase_Online,brand_Aquafresh,brand_Colgate,brand_Crest,brand_Sensodyne,market
0,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,1,1,...,1,0,0,0,1,0,1,0,0,1
1,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,1,0,...,1,0,0,0,1,0,1,0,0,1
2,1,56,College,79523.203,Female,Brooklyn,Online,Colgate,0,1,...,1,0,0,0,1,0,1,0,0,1
3,1,56,College,79523.203,Female,Brooklyn,Online,Crest,1,1,...,1,0,0,0,1,0,0,1,0,1
4,1,56,College,79523.203,Female,Brooklyn,Online,Crest,0,0,...,1,0,0,0,1,0,0,1,0,1
5,1,56,College,79523.203,Female,Brooklyn,Online,Sensodyne,1,0,...,1,0,0,0,1,0,0,0,1,1
6,1,56,College,79523.203,Female,Brooklyn,Online,Sensodyne,0,1,...,1,0,0,0,1,0,0,0,1,1
7,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,1,0,...,1,0,0,0,1,1,0,0,0,1
8,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,0,1,...,1,0,0,0,1,1,0,0,0,1
9,1,56,College,79523.203,Female,Brooklyn,Online,Aquafresh,0,1,...,1,0,0,0,1,1,0,0,0,1


In [5]:
# Fit models that explain the characteristics of the chosen product from
# consumer attributes.

# Keep only the rows of products that were chosen.
dataChosen = data.loc[data['Chosen'] == 1].copy()

# Normalization: min-max scale `size`, and divide `inc` by 1000.
sizeMin = dataChosen["size"].min()
sizeMax = dataChosen["size"].max()
dataChosen["sizeNorm"] = (dataChosen["size"] - sizeMin) / (sizeMax - sizeMin)
dataChosen["inc"] = dataChosen["inc"] / 1000

# Right-hand side shared by every model: consumer attributes.
regressors = (
    '~ age + inc + ed_College + ed_MoreCollege +'
    'loc_Brooklyn + loc_Manhattan + loc_Other +'
    'gen_Female + gen_Other + purchase_InStore'
)


def _fitLogit(response):
    """Fit a logit of `response` on the shared consumer-attribute regressors."""
    return smf.logit(response + regressors, data=dataChosen).fit()


# Predict the chosen brand (one binary model per brand dummy).
mBrand_1 = _fitLogit('brand_Colgate')
mBrand_2 = _fitLogit('brand_Aquafresh')
mBrand_3 = _fitLogit('brand_Sensodyne')
mBrand_4 = _fitLogit('brand_Crest')

# Predict the other chosen characteristics.
mChrc_1 = _fitLogit('mint')
mChrc_2 = _fitLogit('white')
mChrc_3 = _fitLogit('fluoride')
mChrc_4 = _fitLogit('kids')
mChrc_5 = _fitLogit('sizeNorm')
mChrc_6 = _fitLogit('discount')
mChrc_7 = _fitLogit('familypack')
# Price per ounce is modelled with OLS rather than a logit.
mChrc_8 = smf.ols('priceperoz' + regressors, data=dataChosen).fit()

Optimization terminated successfully.
         Current function value: 0.598719
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.377925
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.313782
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.237058
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.154480
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.537584
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.618602
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.079393
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.565306
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.342223
  

In [41]:
# Correlate the characteristics of each buyer's first choice with those of
# their second choice.

# Keep only rows where SecondChoice is recorded (non-missing).
dataChosen2 = data[data['SecondChoice'].notna()].copy()

characteristics = ['brand_Colgate', 'brand_Aquafresh', 'brand_Sensodyne',
        'brand_Crest', 'mint', 'white', 'fluoride',
        'kids', 'size', 'discount', 'familypack', 'priceperoz',
        'priceperpack']
dataChosen2 = dataChosen2[['buyerid','Chosen','SecondChoice']+characteristics]

# normalize: z-score every characteristic using the mean and (population)
# standard deviation computed over ALL rows of `data`, not just this subset.
# dtype=float avoids an object array when the columns have mixed dtypes.
data_norm = data[characteristics].copy()
means = np.expand_dims(data_norm.to_numpy(dtype=float).mean(axis=0),axis=0)
sds = np.expand_dims(data_norm.to_numpy(dtype=float).std(axis=0),axis=0)

norm_data = (dataChosen2[characteristics].to_numpy(dtype=float) - means)/sds

dataChosen2[characteristics] = norm_data

firstChoice = dataChosen2[dataChosen2["Chosen"]>0]
secondChoice = dataChosen2[dataChosen2["SecondChoice"]>0]

# The two frames are paired row-by-row by position below, so they must at
# least have the same number of rows.
# NOTE(review): this also assumes both frames list buyers in the same order
# with exactly one row per buyer; confirm against the data.
if len(firstChoice) != len(secondChoice):
    raise ValueError(
        "Cannot pair first and second choices: %d first-choice rows vs %d "
        "second-choice rows" % (len(firstChoice), len(secondChoice))
    )

corr = []
pval = []
for ch in characteristics:
    # pearsonr returns both the coefficient and its p-value, so a separate
    # np.corrcoef call is not needed.
    r, p = pearsonr(firstChoice[ch].to_numpy(), secondChoice[ch].to_numpy())
    corr.append(np.round(r, 4))
    pval.append(np.round(p, 4))
correlations = pd.DataFrame({'Characteristic': characteristics,
                            'Correlation': corr,
                            'p-value': pval})
correlations = correlations.sort_values(by='p-value')
correlations

Unnamed: 0,Characteristic,Correlation,p-value
6,fluoride,-0.2557,0.0
12,priceperpack,0.1913,0.0
2,brand_Sensodyne,-0.1466,0.0009
0,brand_Colgate,-0.1106,0.0123
8,size,0.108,0.0146
5,white,-0.099,0.0252
1,brand_Aquafresh,-0.098,0.0267
11,priceperoz,0.0673,0.1289
9,discount,0.0473,0.2861
3,brand_Crest,-0.0436,0.325


In [48]:
# Summary statistics of the second-choice rows. The characteristic columns
# were standardised when `dataChosen2` was built, so they are in z-score
# units relative to the full dataset.
secondChoice.describe()

Unnamed: 0,buyerid,Chosen,SecondChoice,brand_Colgate,brand_Aquafresh,brand_Sensodyne,brand_Crest,mint,white,fluoride,kids,size,discount,familypack,priceperoz,priceperpack
count,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0
mean,5026.031311,0.027397,1.0,0.344622,-0.125123,-0.108611,-0.142857,0.90998,-0.242073,-0.181996,-0.594868,-0.486296,0.103234,0.136845,-0.589135,-0.747557
std,2960.683833,0.163398,0.0,1.088285,0.936399,0.909344,0.875675,0.415058,1.020914,0.984263,0.356564,0.985812,0.914308,1.049771,0.772869,0.744872
min,2.0,0.0,1.0,-0.654654,-0.654654,-0.5,-0.5,-1.0,-1.224745,-1.0,-0.654654,-1.234495,-2.001313,-0.65654,-1.584121,-1.842154
25%,2271.0,0.0,1.0,-0.654654,-0.654654,-0.5,-0.5,1.0,-1.224745,-1.0,-0.654654,-1.234495,0.499672,-0.65654,-0.956549,-1.402307
50%,5182.0,0.0,1.0,-0.654654,-0.654654,-0.5,-0.5,1.0,-1.224745,-1.0,-0.654654,-1.234495,0.499672,-0.65654,-0.851954,-0.701809
75%,7659.0,0.0,1.0,1.527525,-0.654654,-0.5,-0.5,1.0,0.816497,1.0,-0.654654,0.810048,0.499672,1.523136,0.193999,-0.131637
max,9989.0,1.0,1.0,1.527525,1.527525,2.0,2.0,1.0,0.816497,1.0,1.527525,0.810048,0.499672,1.523136,1.553739,1.741785


In [49]:
# Mean element-wise product of the standardised fluoride values of the
# positionally paired first-choice and second-choice rows.
fluorideFirst = firstChoice["fluoride"].to_numpy()
fluorideSecond = secondChoice["fluoride"].to_numpy()
np.multiply(fluorideFirst, fluorideSecond).mean()

-0.30332681017612523

In [25]:
# Side-by-side regression table: the four brand models first, then the
# eight product-characteristic models, with significance stars.
brandModels = [mBrand_1, mBrand_2, mBrand_3, mBrand_4]
characteristicModels = [mChrc_1, mChrc_2, mChrc_3, mChrc_4,
                        mChrc_5, mChrc_6, mChrc_7, mChrc_8]
resultsAlll = summary_col(brandModels + characteristicModels, stars=True)
print(resultsAlll)


                 brand_Colgate brand_Aquafresh brand_Sensodyne brand_Crest    mint      white     fluoride    kids    sizeNorm   discount  familypack priceperoz
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Intercept        0.1102        -1.7976***      -1.6076***      -1.3702**   3.4020***  0.2520     -0.7704** -2.5050** -1.9591*** 3.4508***  0.3148     0.9023*** 
                 (0.3484)      (0.4805)        (0.5336)        (0.6373)    (0.8259)   (0.3716)   (0.3395)  (1.2395)  (0.3656)   (0.5132)   (0.3204)   (0.0101)  
R-squared                                                                                                                                             0.0016    
R-squared Adj.                                                                                                                                        0.0006    
age              0.0093        -0

In [27]:
# Emit the combined regression table as LaTeX source (a `table` environment
# wrapping a `tabular`) for pasting into a document.
print(resultsAlll.as_latex())

\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{lllllllllllll}
\hline
                  & brand\_Colgate & brand\_Aquafresh & brand\_Sensodyne & brand\_Crest & mint       & white      & fluoride  & kids      & sizeNorm   & discount   & familypack & priceperoz  \\
\hline
Intercept         & 0.1102         & -1.7976***       & -1.6076***       & -1.3702**    & 3.4020***  & 0.2520     & -0.7704** & -2.5050** & -1.9591*** & 3.4508***  & 0.3148     & 0.9023***   \\
                  & (0.3484)       & (0.4805)         & (0.5336)         & (0.6373)     & (0.8259)   & (0.3716)   & (0.3395)  & (1.2395)  & (0.3656)   & (0.5132)   & (0.3204)   & (0.0101)    \\
R-squared         &                &                  &                  &              &            &            &           &           &            &            &            & 0.0016      \\
R-squared Adj.    &                &                  &                  &              &            &            &           &   

: 