In [362]:
"""
This script creates artificial data for a discrete choice problem.
Assume there are three modes of transportation to choose from. Six fixed
variables were designed as significant and five as non-significant.
Seven random variables (five normal, one uniform, one triangular) were
designed as significant.
Three normal variables were correlated.
Two normal variables were non-linearly transformed.
"""

'\nThis script creates artificial data for a discrete choice problem.\nAssume there are three modes of transportation to choose from. Six fixed\nvariables were designed as significant and five as non-significant.\nSeven random variables (five normal, one uniform, one triangular) were\ndesigned as significant.\nThree normal variables were correlated.\nTwo normal variables were non-linearly transformed.\n'

In [363]:
import numpy as np
import pandas as pd
import scipy.stats as ss
from scipy.special import boxcox, inv_boxcox
from searchlogit import MultinomialLogit, MixedLogit
import statsmodels.api as sm

In [364]:
def noise(n_obs, perc=1, random_state=None):
    """Draw ``n_obs`` samples from a standard normal distribution.

    Parameters
    ----------
    n_obs : int
        Number of samples to draw.
    perc : optional
        Accepted for interface compatibility; not used by the body.
    random_state : optional
        Object exposing ``normal(loc, scale, size)`` (e.g. a
        ``np.random.RandomState``). Any falsy value falls back to the
        global ``np.random`` module.

    Returns
    -------
    numpy.ndarray
        Array of ``n_obs`` draws from N(0, 1).
    """
    rng = random_state if random_state else np.random
    return rng.normal(0, 1, n_obs)

In [365]:
def random_col(N, P, J, random_state=None):
    """Generate one synthetic data column: a tiled base level plus Gaussian noise.

    ``P`` base levels are drawn from {0.5, 0.6, ..., 2.4}, tiled ``N*P``
    times, and standard-normal noise scaled by 0.5 is added.

    Parameters
    ----------
    N, P, J : int
        Size parameters. The tiled base vector has length ``P*N*P`` and the
        noise vector has length ``N*P*J``; the two only add when these
        shapes broadcast (e.g. ``P == J``).
    random_state : optional
        Object exposing ``randint`` and ``normal`` (e.g. a
        ``np.random.RandomState``). ``None`` falls back to the global
        ``np.random`` module instead of raising ``AttributeError``.

    Returns
    -------
    numpy.ndarray
        1-D float array with the generated column.
    """
    if random_state is None:
        random_state = np.random

    # Base levels are drawn before the noise so the RNG call order is unchanged.
    rand_nums = random_state.randint(low=5, high=25, size=(P,)) / 10
    base = np.tile(rand_nums, N*P)

    # Standard-normal noise, drawn directly from the same generator.
    gaussian_noise = random_state.normal(0, 1, N*P*J)
    return base + 0.5*gaussian_noise

def generate_random_df(N, P, J, num_fixed=0, num_isvars=0, num_randvars=0, random_state=None):
    """Build a synthetic explanatory-variable DataFrame.

    Columns are created in this order: ``added_fixed<i>``, then
    ``added_isvar<i>`` (plus ``added_isvar<i>.<j>`` copies for j = 2..J),
    then ``added_random<i>``. All values are rescaled jointly to [0, 1]
    using the minimum and maximum over the whole frame (not per column),
    and a ``const`` column is appended last via ``sm.add_constant``.

    Parameters
    ----------
    N, P, J : int
        Size parameters forwarded to ``random_col``; the isvar columns are
        built from ``N*P`` draws, each repeated ``J`` times.
    num_fixed, num_isvars, num_randvars : int
        Number of columns of each kind to generate.
    random_state : optional
        Object exposing ``randint``, ``normal`` and ``random`` (e.g. a
        ``np.random.RandomState``). ``None`` falls back to the global
        ``np.random`` module instead of raising ``AttributeError``.

    Returns
    -------
    df : pandas.DataFrame
        The rescaled columns followed by ``const``.
    varnames : list of str
        Base names of the generated variables. The ``.<j>`` copies of the
        isvar columns are not listed.
    """
    if random_state is None:
        random_state = np.random

    df = pd.DataFrame()

    varnames = []
    for i in range(num_fixed):
        coef_name = 'added_fixed' + str(i+1)
        varnames.append(coef_name)
        df[coef_name] = random_col(N, P, J, random_state=random_state)

    for i in range(num_isvars):
        coef_name = 'added_isvar' + str(i+1)
        varnames.append(coef_name)
        # One draw per (individual, choice situation), repeated for each
        # alternative; every per-alternative column receives the same values.
        col_vals = np.repeat(random_state.random(N*P)*100, J)
        for j in range(J):
            if j == 0:
                df[coef_name] = col_vals
            else:
                df[coef_name + "." + str(j+1)] = col_vals

    for i in range(num_randvars):
        coef_name = 'added_random' + str(i+1)
        varnames.append(coef_name)
        df[coef_name] = random_col(N, P, J, random_state=random_state)

    # Min-max scale with a single global (min, max) pair over all columns.
    df_interp = np.interp(df.values, (df.values.min(), df.values.max()), (0,1))
    df = pd.DataFrame(df_interp, columns = df.columns)
    df = sm.add_constant(df, prepend=False)
    return df, varnames


In [366]:
# Seed the global NumPy generator.
np.random.seed(0)

# Problem dimensions.
N = 2500  # Number of observations
P = 1  # Number of choices per individual
J = 1  # Number of alternatives

# Number of generated variables of each kind.
num_fixed = 11
num_isvars = 0
num_nonsig = 8
num_randvars = 3

# Dedicated generator passed explicitly to the data-generation helper.
random_state = np.random.RandomState(2)

df, varnames = generate_random_df(
    N, P, J,
    num_fixed=num_fixed,
    num_isvars=num_isvars,
    num_randvars=num_randvars,
    random_state=random_state,
)






In [367]:
# Preview the first ten rows of the generated DataFrame.
df.head(10)

Unnamed: 0,added_fixed1,added_fixed2,added_fixed3,added_fixed4,added_fixed5,added_fixed6,added_fixed7,added_fixed8,added_fixed9,added_fixed10,added_fixed11,added_random1,added_random2,added_random3,const
0,0.467868,0.568597,0.688279,0.406144,0.564087,0.821708,0.521241,0.586324,0.571353,0.524014,0.426019,0.524577,0.051612,0.845504,1.0
1,0.531144,0.517127,0.707921,0.270024,0.41508,0.65701,0.564002,0.616189,0.559772,0.429366,0.588756,0.53294,0.369221,0.785749,1.0
2,0.387974,0.539561,0.687525,0.483609,0.592859,0.759468,0.553827,0.402048,0.588376,0.421159,0.448327,0.785196,0.35996,0.666975,1.0
3,0.42738,0.647022,0.523592,0.259232,0.524342,0.813557,0.54599,0.54898,0.447392,0.36277,0.452281,0.548662,0.184152,0.665827,1.0
4,0.405862,0.555099,0.669269,0.343465,0.587582,0.447511,0.612323,0.449593,0.699723,0.243164,0.678353,0.480311,0.210272,0.72748,1.0
5,0.503489,0.640211,0.724305,0.232997,0.42002,0.47999,0.547605,0.676789,0.663659,0.507904,0.460546,0.545232,0.448895,0.610747,1.0
6,0.680897,0.507313,0.622094,0.441843,0.46245,0.573206,0.533681,0.560468,0.625889,0.702195,0.416505,0.766813,0.312212,0.782205,1.0
7,0.446395,0.532131,0.680856,0.389386,0.40266,0.634301,0.508989,0.466617,0.671382,0.345605,0.499675,0.539454,0.137977,0.796997,1.0
8,0.521149,0.5634,0.707182,0.306031,0.480871,0.762507,0.521639,0.487345,0.506256,0.149819,0.347014,0.547008,0.372885,0.630852,1.0
9,0.558967,0.593495,0.865758,0.531721,0.521892,0.782302,0.628417,0.573644,0.589021,0.242862,0.396752,0.510883,0.461662,0.72058,1.0


In [368]:
# Define coefficients (betas)
assert 0 <= num_nonsig <= num_fixed, "num_nonsig must be between 0 and num_fixed"

# Fixed betas: random sign times a magnitude drawn from [0.25, 1).
fixed_coefs = np.array([
    random_state.choice([-1, 1]) * random_state.uniform(.25, 1)
    for _ in range(num_fixed)
])

# Zero out the last `num_nonsig` fixed betas. The start index is computed
# from the front because a `-num_nonsig:` slice with num_nonsig == 0 is
# `[0:]`, which would zero every coefficient.
first_nonsig = num_fixed - num_nonsig
fixed_coefs[first_nonsig:] = 0

# Rename the zeroed variables to 'nonsig<k>' in both `varnames` and `df`.
old_coef_names = varnames[first_nonsig:num_fixed]
nonsig_names = ['nonsig' + str(i+1) for i in range(num_nonsig)]
varnames[first_nonsig:num_fixed] = nonsig_names
df = df.rename(columns=dict(zip(old_coef_names, nonsig_names)))

fixed_coefs = list(fixed_coefs)

# Alternative-specific betas: one uniform(0, 1) draw per (variable,
# alternative) pair, taken from the global NumPy RNG; the first alternative
# of every variable is then set to 0.
isvar_coefs = np.array([0 + np.random.uniform(0, 1) for _ in range(num_isvars * J)])
isvar_coefs[::J] = 0
isvar_coefs = list(isvar_coefs)

In [369]:
# Show the fixed betas as they are before the manual override below.
print(fixed_coefs)
# Pin the first fixed beta to a hand-picked value; the printout above does
# not reflect this change.
fixed_coefs[0] = .9

print(isvar_coefs)

[0.660704799826025, -0.36323172324238295, -0.7625729604822207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[]


In [370]:
# Random-coefficient means: magnitude drawn from [0.5, 1.5) with a sign that
# is positive four times out of five, so |mean| is never below 0.5.
random_coefs_mean = [random_state.choice([-1,1, 1, 1, 1]) * random_state.uniform(.5, 1.5) for i in range(num_randvars)]
random_coefs_sd = [random_state.uniform(1.0, 1.5) for i in range(num_randvars)]

# NOTE(review): the "sd" draws are placed directly on the diagonal of the
# covariance matrix, i.e. they act as variances in multivariate_normal below.
# Confirm whether sd**2 was intended.
cov_mat = np.diag(random_coefs_sd)
# Off-diagonal covariances; indices up to 2 mean num_randvars must be >= 3.
cov_mat[0, 1] = cov_mat[1, 0] = 0.25
cov_mat[0, 2] = cov_mat[2, 0] = 0.4
cov_mat[1, 2] = cov_mat[2, 1] = 0.5

random_coefs_uniform_a = 0
random_coefs_uniform_b = random_state.uniform(1, 2)

random_coefs_tri_left = 0
random_coefs_tri_right = random_state.uniform(1, 2)
random_coefs_tri_mode = random_coefs_tri_right/2

# One row of coefficient draws per observation. Rows are written into a
# preallocated array rather than grown with np.append, which re-copies the
# whole array on every iteration.
coef_draws = np.empty((N, num_randvars))

for i in range(N):
    res_normal = random_state.multivariate_normal(random_coefs_mean, cov_mat)
    # NOTE(review): despite their names, the next two draws come from
    # `normal`, and because `res_normal` already supplies `num_randvars`
    # values they are never selected below. The calls are kept so the
    # random_state stream (and therefore the generated data) is unchanged.
    res_uniform = np.array([random_state.normal(random_coefs_uniform_a, random_coefs_uniform_b)])
    res_triangular = np.array([random_state.normal(random_coefs_tri_left, 1)])
    res = np.concatenate((res_normal, res_uniform, res_triangular))

    coef_draws[i] = res[:num_randvars]

# Repeat each observation's coefficient P*J times, one array per variable.
rand_coefs = [np.repeat(coef_draws[:, r], P*J) for r in range(num_randvars)]


In [371]:
# Print the means, the "sd" draws and the covariance matrix, one per line.
print(random_coefs_mean, random_coefs_sd, cov_mat, sep="\n")

[0.891588498295317, 1.178326389199667, 0.8270550359608905]
[1.451454056306701, 1.0375393330086704, 1.3259607180851236]
[[1.45145406 0.25       0.4       ]
 [0.25       1.03753933 0.5       ]
 [0.4        0.5        1.32596072]]


In [372]:
# Display the (a, b) parameters stored for the "uniform" coefficient.
random_coefs_uniform_a, random_coefs_uniform_b

(0, 1.2211723077934675)

In [373]:
# Display the (left, mode, right) parameters stored for the "triangular" coefficient.
random_coefs_tri_left, random_coefs_tri_mode, random_coefs_tri_right

(0, 0.8188426980957979, 1.6376853961915958)

In [374]:
# Display the covariance matrix passed to multivariate_normal.
cov_mat

array([[1.45145406, 0.25      , 0.4       ],
       [0.25      , 1.03753933, 0.5       ],
       [0.4       , 0.5       , 1.32596072]])

In [375]:
# Expand every coefficient to one value per data row (P*N*J rows in total).
B_fixed = [np.repeat(beta, P*N*J) for beta in fixed_coefs]
B_const = [np.repeat(-1, P*N*J)]  # the constant column gets a beta of -1

# Alternative-specific betas: the J values of each variable, tiled over rows.
B_isvar = [np.tile(isvar_coefs[k*J:(k+1)*J], P*N) for k in range(num_isvars)]

# Stack the non-empty coefficient groups and transpose so that rows are
# observations and columns are variables, ready for an element-wise product.
B = np.vstack([
    group
    for group in (B_fixed, B_isvar, rand_coefs, B_const)
    if len(group) > 0
]).T

In [376]:
# Visualise values after non-linear transformation
# import matplotlib.pyplot as plt
# plt.hist(inv_boxcox(df['added_random4'], 0.4), bins=30)

In [377]:
# Multiply and generate probability
isvars = ['added_isvar' + str(i+1) for i in range(num_isvars)]

X = df.values[:, 0:]  # Extract only necessary columns
XB = (X*B).sum(axis=1).reshape(N*P, J)
eps = np.random.gumbel(0, 1, (N*P, J))
eXB = np.exp(XB).ravel()

# Use monte carlo simulation to predict choice
# y = np.apply_along_axis(lambda p: np.eye(J, dtype='int64')[np.argmax(p)], 1, prob).reshape(N*P*J,)
# y = y.reshape(N*P*J,)



df['Y'] = eXB.astype(int)

print(max(df['Y']))

# Save to CSV
df.to_csv("C:/Users/n9471103/source/repos/HS_BIC/artificial_mixed_corr_2023_MOOF.csv", index=False)

260
