In [4]:
# Author - Rishabh Samdarshi , Pricing Research Group- ASU 
# Code for simulating price based choices for multiple choices available 


import numpy as np
import random as rd
import pandas as pd
from sklearn import metrics
from numpy.random import rand
from numpy.random import seed
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [79]:
randomArrayGenerator = lambda x,y: x*(1+rd.uniform(-y,y)*0.01) 


def priceArrayGenerator(variationLimit,truePriceVector):
    randomArray = tuple(map(randomArrayGenerator, truePriceVector,np.repeat(variationLimit,len(truePriceVector))))
    return randomArray
    
def priceSampleGenerator(numberOfVectorsNeeded,variationLimit,truePriceVector,a=[]):
    for i in range(numberOfVectorsNeeded):
        a.append(priceArrayGenerator(variationLimit,truePriceVector))
    a = tuple(a)
    priceVectorDict = dict(enumerate(a,1))
    return priceVectorDict

def choiceProbabilty(arbitraryPriceVector, aCoefficientVector, priceSensitivityVector,choice = []):
    choiceExpExpression = list(map(lambda x,y,z: np.exp(x+(y*z)),aCoefficientVector,priceSensitivityVector,arbitraryPriceVector))
    noChoiceExpression = [1/(1+sum(choiceExpExpression))]
    choiceProbabilityVector = noChoiceExpression + [x/(1+sum(choiceExpExpression)) for x in choiceExpExpression]
    print (sum(choiceProbabilityVector))
    return choiceProbabilityVector

def choiceSimulator(choiceProbabilityVector,sampleCount,seedValue=1):
    cumulativeChoice = np.cumsum(choiceProbabilityVector)
    seed(seedValue) 
    randomProbabilityValues = list(rand(sampleCount))
    outCounter = 0 
    inCounter = 0 
    choices = []
    for i in randomProbabilityValues:
        inCounter = 1
        for j in cumulativeChoice: 
            if (i < j): 
                choice = inCounter
                break
            elif (inCounter < len(cumulativeChoice)-1):
                  inCounter = inCounter+1
            else:
                inCounter = 0 
                break
        choices.append(inCounter)
    simulatedChoices = pd.DataFrame({"Probability Value" : randomProbabilityValues, "Choice" : choices})
    return simulatedChoices


In [80]:
truePriceVectorVal = np.array([1,1.2,1.3,1.4,2])
aCoefficientVectorVal = [1.001,1.002,1.003,1.004,1.005]
priceSensitivityVectorVal = [1,1,1,1,1]

In [81]:

truePriceVectorVal = np.array([1,1.2,1.3,1.4,2])
aCoefficientVectorVal = [1.001,1.002,1.003,1.004,1.005]
priceSensitivityVectorVal = [1,1,1,1,1]
choiceVectorDemo = choiceProbabilty(truePriceVectorVal, aCoefficientVectorVal, priceSensitivityVectorVal)
choiceVectorDemo

print(np.cumsum(choiceVectorDemo))
choiceSimulator(choiceVectorDemo,10,seedValue=1)

1.0
[0.01703657 0.14304671 0.29710982 0.46754624 0.65609607 1.        ]


Unnamed: 0,Probability Value,Choice
0,0.417022,4
1,0.720324,0
2,0.000114,1
3,0.302333,4
4,0.146756,3
5,0.092339,2
6,0.18626,3
7,0.345561,4
8,0.396767,4
9,0.538817,5


In [84]:
truePriceVectorVal = np.array([1,1.2,1.3])
priceSample = priceSampleGenerator(10,10,truePriceVectorVal)
totalRandomChoiceSimulationKeys = 5
aCoefficientVectorVal = [1.001,1.002,1.003]
priceSensitivityVectorVal = [1,1,1]
sampleCountVal = 1000
simulatedDataFrame = pd.DataFrame()
sampleNumber = 0 

for i in range(totalRandomChoiceSimulationKeys):
# random key selection for as many times mentioned, set at 5 by default 
    sampleNumber = sampleNumber + 1
    keyForPriceDict = rd.randint(1,len(priceSample)) 
    # Randomly is taking one of the keys for chosing a price vector from priceSample
    
    arbitraryPriceVectorVal = priceSample[keyForPriceDict]
    
    #Takes a price vector for the given key 
    
    choiceProbabilityVectorVal = choiceProbabilty(arbitraryPriceVectorVal, aCoefficientVectorVal, priceSensitivityVectorVal)
    #Calculates the choice probability vector for the chosen price vector in the previous line 
    
    simulatedModelDataFrameRaw = choiceSimulator(choiceProbabilityVectorVal,sampleCountVal)
    # simulation for the raw dataframe of the vector for the same 
    
    priceColumnCounter = 1
    priceValueDict = {}
    for j in arbitraryPriceVectorVal:
        simulatedModelDataFrameRaw[priceColumnCounter] =  np.repeat(j,sampleCountVal)
        # for creating the price value columns
        priceColumnCounter = priceColumnCounter+1

    simulatedModelDataFrameRaw["keyID_SampleID"] = [i + j + k for i, j, k in zip(str(keyForPriceDict)*sampleCountVal, ["_"]*sampleCountVal, str(i+1)*sampleCountVal)]
    simulatedDataFrame = pd.concat([simulatedDataFrame,simulatedModelDataFrameRaw],ignore_index=True)
    
    
colNameUpdate = [(i,'Price_of_'+str(i)) for i in simulatedDataFrame.iloc[:, 2:len(truePriceVectorVal)+2].columns.values]
simulatedDataFrame.rename(columns = dict(colNameUpdate), inplace=True)

simulatedDataFrame






# train_x, test_x, train_y, test_y = train_test_split(simulatedDfForMNL[list(simulatedDfForMNL)[:-1]],
#                                                                       simulatedDfForMNL[list(simulatedDfForMNL)[-1]], train_size=0.7)


                                                                      

# mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(train_x, train_y)
    


0.9999999999999999
1.0
1.0
0.9999999999999999
1.0


Unnamed: 0,Probability Value,Choice,Price_of_1,Price_of_2,Price_of_3,keyID_SampleID
0,0.417022,3,1.018364,1.170158,1.279425,5_1
1,0.720324,0,1.018364,1.170158,1.279425,5_1
2,0.000114,1,1.018364,1.170158,1.279425,5_1
3,0.302333,2,1.018364,1.170158,1.279425,5_1
4,0.146756,2,1.018364,1.170158,1.279425,5_1
5,0.092339,2,1.018364,1.170158,1.279425,5_1
6,0.186260,2,1.018364,1.170158,1.279425,5_1
7,0.345561,3,1.018364,1.170158,1.279425,5_1
8,0.396767,3,1.018364,1.170158,1.279425,5_1
9,0.538817,3,1.018364,1.170158,1.279425,5_1


In [94]:
priceOfChoice = []
counter = 0 
for i in len(simulatedDataFrame):
    try: 
        simulatedDataFrame.loc[counter,:]["Price_of_" + str(simulatedDataFrame.loc[counter,:]["Choice"])]
        counter = counter + 1
    except: 
        
    

1.1701581863443145

In [21]:
from sklearn.linear_model import LogisticRegression

simulatedDfForMNL = simulatedDataFrame.drop(["Probability Value"], axis = 1)
simulatedDfForMNL = simulatedDfForMNL[[c for c in simulatedDfForMNL if c not in ["Choice"]]  + ['Choice']]
X = simulatedDfForMNL[list(simulatedDfForMNL)[:-2]]
X["Qualitative"] = np.ones(X.shape[0])
Y = simulatedDfForMNL[list(simulatedDfForMNL)[-1]]
# mlogit_mod = sm.MNLogit(Y,X)
# mlogit_res = mlogit_mod.fit(maxiter = 10000)
# print(mlogit_res.summary())

clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X,Y)
Result = pd.DataFrame(clf.coef_,X.columns)



# for keyId in set(simulatedDfForMNL["keyID_SampleID"]):
#     keySubset = simulatedDfForMNL[simulatedDfForMNL["keyID_SampleID"] == keyId]
#     X = keySubset[list(keySubset)[:-2]]
#     X["intercept"] = np.ones(X.shape[0])
#     print (X.head(10))
#     Y = keySubset[list(keySubset)[-1]]
#     print (Y.head(10))
#     mlogit_mod = sm.MNLogit(Y,X)
#     mlogit_res = mlogit_mod.fit(maxiter = 10000)
#     print(mlogit_res.summary())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
print (pd.DataFrame(clf.coef_,X.columns))

                    0         1         2         3
Price_of_1  -0.040051 -0.112445 -0.032410 -0.024658
Price_of_2  -0.064315 -0.212392 -0.091899 -0.002349
Price_of_3   0.029441  0.480808 -0.225549  0.014170
Qualitative  0.074926 -0.155970  0.349857  0.012837


In [25]:
mlogit_mod = sm.MNLogit(Y,X)
mlogit_res = mlogit_mod.fit(maxiter = 10000)
print(mlogit_res.summary())

         Current function value: 1.233920
         Iterations: 10000


  bse = np.sqrt(np.diag(self.cov_params()))


                          MNLogit Regression Results                          
Dep. Variable:                 Choice   No. Observations:                 5000
Model:                        MNLogit   Df Residuals:                     4991
Method:                           MLE   Df Model:                            6
Date:                Thu, 14 Nov 2019   Pseudo R-squ.:               0.0004587
Time:                        12:48:29   Log-Likelihood:                -6169.6
converged:                      False   LL-Null:                       -6172.4
Covariance Type:            nonrobust   LLR p-value:                    0.4620
   Choice=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Price_of_1      0.6002        nan        nan        nan         nan         nan
Price_of_2     -0.2601        nan        nan        nan         nan         nan
Price_of_3     -0.1319        nan        nan    

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [2]:
import statsmodels.api as sm
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)



In [49]:
import pandas as pd
anes_data = sm.datasets.anes96.load()
pd.DataFrame(anes_data.exog).head(10)

Unnamed: 0,0,1,2,3,4
0,-2.302585,7.0,36.0,3.0,1.0
1,5.24755,3.0,20.0,4.0,1.0
2,3.437208,2.0,24.0,6.0,1.0
3,4.420045,3.0,28.0,6.0,1.0
4,6.461624,5.0,68.0,6.0,1.0
5,4.701389,3.0,21.0,4.0,1.0
6,4.60617,5.0,77.0,4.0,1.0
7,3.437208,5.0,21.0,4.0,1.0
8,5.193512,4.0,31.0,4.0,1.0
9,7.93741,3.0,39.0,3.0,1.0


In [25]:
spector_data = sm.datasets.spector.load_pandas()

In [48]:
anes_data = sm.datasets.anes96.load()
anes_exog = anes_data.exog
anes_exog = sm.add_constant(anes_exog, prepend=False)
# print(anes_data.exog[:5,:])
print(np.unique(anes_data.endog))
mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)
mlogit_res = mlogit_mod.fit()
print(mlogit_res.summary())

[0. 1. 2. 3. 4. 5. 6.]
Optimization terminated successfully.
         Current function value: 1.548647
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                  944
Model:                        MNLogit   Df Residuals:                      908
Method:                           MLE   Df Model:                           30
Date:                Mon, 16 Dec 2019   Pseudo R-squ.:                  0.1648
Time:                        01:34:32   Log-Likelihood:                -1461.9
converged:                       True   LL-Null:                       -1750.3
Covariance Type:            nonrobust   LLR p-value:                1.822e-102
       y=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0115      0.034     -0.336      0.736      -0.079       0.056
x2    