In [1]:
import sys, os, copy, itertools

In [2]:
import numpy as np

In [3]:
import pandas as pd
import statsmodels.api as st
import statsmodels.formula.api as smf

In [4]:
import matplotlib.pyplot as plt

In [5]:
INDIR = r'Q:\Model Research\Swiftly\processed\v0.6'
INFILE = r'choices_with_valid_rideshare.csv'
choices = pd.read_csv(os.path.join(INDIR,INFILE))

In [6]:
# drop duplicate listings within a search_id
choices = choices.drop_duplicates(subset=['search_id','travel_provider','travel_product'])

In [7]:
# filter to just records where a choice is made
chosen_search_ids = choices.loc[choices['chosen']==1,'search_id'].tolist()
choices = choices[choices['search_id'].isin(chosen_search_ids)]

In [8]:
# clean data using 'drop' flag
choices['drop'] = 0
# flag records with invalid walk time or travel time
choices.loc[(choices['travel_mode'] == 'transit') & ((choices['travel_initial_walk_time'] == -1) | (pd.isnull(choices['travel_initial_walk_time']))),'drop'] = 1
choices.loc[choices['travel_time'] == 0,'drop'] = 1
# flag records with both trip-ends outside of 9 counties
choices.loc[(choices['county_start']=='Other') & (choices['county_end']=='Other'),'drop'] = 1
# flag records with invalid cost
choices.loc[(choices['travel_mode'] == 'transit') & (choices['travel_price']==-1),'drop'] = 1
# drop flagged records
choices = choices[choices['drop']==0]

In [9]:
choices.loc[:,['start_is_near_home','end_is_near_home','start_is_near_work','end_is_near_work']].drop_duplicates()

Unnamed: 0,start_is_near_home,end_is_near_home,start_is_near_work,end_is_near_work
16,1,0,0,0
51,0,0,0,0
140,0,1,0,0
160,0,0,0,1
275,1,0,0,1
331,0,0,-1,-1
409,-1,-1,-1,-1
970,0,0,1,0
1268,1,0,-1,-1
1399,0,1,1,0


In [10]:
#ucost = choices[(choices['travel_mode']=='rideshare') & (choices['chosen']==1)]['travel_distance']
#ln_ucost = np.log(ucost)
#fig = plt.figure()
#ax = ucost.plot.hist(bins=25)
#fig2 = plt.figure()
#ax2 = ln_ucost.plot.hist(bins=50)
#plt.show()

In [11]:
# flag 'travel_provider' if they're infrequently or never chosen.
transit_options = choices[choices['travel_mode']=='transit']
#transit_choice_freq = transit_options.groupby(['travel_provider'])['chosen'].sum()
#unchosen = transit_choice_freq[transit_choice_freq == 0]
#choices.loc[choices['travel_provider'].isin(unchosen.index.tolist()),'drop'] = 1
#infrequent = transit_choice_freq[transit_choice_freq.between(1,100)]
#infreq_search_ids = choices.loc[(choices['chosen']==1) & (choices['travel_provider'].isin(infrequent.index.tolist())),'search_id']
#choices.loc[choices['travel_provider'].isin(infrequent.index.tolist()),'drop'] = 1
# flag the entire search if the chosen provider is an infrequent is the chosen mode
#choices.loc[choices['search_id'].isin(infreq_search_ids.tolist()),'drop'] = 1

In [12]:
#choices = choices[choices['drop']==0]

In [13]:
# get walk and wait time in minutes to next 3 arrivals
choices['search_time'] = pd.to_datetime(choices['timestamp'])

choices['arr1'] = pd.to_datetime(choices['travel_arrival_time_1'])
choices['arr2'] = pd.to_datetime(choices['travel_arrival_time_2'])
choices['arr3'] = pd.to_datetime(choices['travel_arrival_time_3'])

choices['wait1'] = choices['arr1'] - choices['search_time']
choices['wait2'] = choices['arr2'] - choices['search_time']
choices['wait3'] = choices['arr3'] - choices['search_time']

choices['wait1'] = choices['wait1'].map(lambda x:x.total_seconds() / 60.0)
choices['wait2'] = choices['wait2'].map(lambda x:x.total_seconds() / 60.0)
choices['wait3'] = choices['wait3'].map(lambda x:x.total_seconds() / 60.0)

choices['walk_time'] = choices['travel_initial_walk_time'] / 60.0

In [14]:
# check that wait times are set correctly
choices.loc[choices['wait1']<choices['walk_time'],['walk_time','wait1','wait2','wait3']]

Unnamed: 0,walk_time,wait1,wait2,wait3
141,8.383333,6.566667,22.383333,27.183333
157,8.000000,5.716667,31.916667,
216,8.500000,0.483333,17.833333,39.100000
282,3.950000,3.483333,9.016667,17.983333
301,4.933333,2.333333,12.366667,17.866667
340,7.516667,2.766667,12.033333,23.050000
349,4.100000,2.300000,11.833333,22.566667
441,3.466667,2.416667,13.616667,21.500000
482,2.150000,-0.250000,,
626,8.483333,8.100000,15.050000,24.133333


In [15]:
# calc headways in minutes
choices['h1_2'] = choices['wait2'] - choices['wait1']
choices['h2_3'] = choices['wait3'] - choices['wait2']
choices['h3_3'] = choices['wait3'] - choices['wait1']

In [16]:
# pick first wait time that the user has time to walk to
choices['wait'] = choices['wait1']
choices.loc[(choices['wait'] < choices['walk_time']) & (pd.notnull(choices['wait2'])),'wait'] = choices['wait2']
choices.loc[(choices['wait'] < choices['walk_time']) & (pd.notnull(choices['wait3'])),'wait'] = choices['wait3']
choices['wait'] = choices['wait'] - choices['walk_time']
choices['headway'] = 99
choices.loc[pd.notnull(choices['h1_2']),'headway'] = choices['h1_2']
choices['n_arrivals'] = 0
choices.loc[pd.notnull(choices['arr1']),'n_arrivals'] += 1
choices.loc[pd.notnull(choices['arr2']),'n_arrivals'] += 1
choices.loc[pd.notnull(choices['arr3']),'n_arrivals'] += 1

In [17]:
choices.loc[choices['travel_product']=='pool','travel_product'] = 'uberpool'

In [18]:
choices.loc[choices['travel_provider']=='Uber',['travel_provider','travel_product']].drop_duplicates()

Unnamed: 0,travel_provider,travel_product
16,Uber,uberxl
19,Uber,ubertaxi
20,Uber,assist
22,Uber,uberselect
23,Uber,uberblack
24,Uber,uberx
26,Uber,ubersuv
66,Uber,uberpool
199,Uber,uberwav


In [19]:
# classify uber products into 4 types
choices['uber_mode_class'] = np.nan
choices.loc[choices['travel_product'].isin(['uberx','uberxl']),'uber_mode_class'] = 'basic'
choices.loc[choices['travel_product'].isin(['uberselect','uberblack','ubersuv']),'uber_mode_class'] = 'premium'
choices.loc[choices['travel_product']=='uberpool','uber_mode_class'] = 'pool'
choices.loc[choices['travel_product']=='ubertaxi','uber_mode_class'] = 'taxi'
choices.loc[(pd.isnull(choices['uber_mode_class'])) & (choices['travel_provider']=='Uber'),'uber_mode_class'] = 'other'

In [20]:
# classify uber products into 2 types
choices['uber_mode_class2'] = np.nan
choices.loc[choices['travel_product'].isin(['uberx','uberxl','pool','taxi']),'uber_mode_class2'] = 'basic'
choices.loc[choices['travel_product'].isin(['uberselect','uberblack','ubersuv']),'uber_mode_class2'] = 'premium'
choices.loc[(pd.isnull(choices['uber_mode_class'])) & (choices['travel_provider']=='Uber'),'uber_mode_class2'] = 'basic'

In [21]:
user_info = pd.read_csv(r'Q:\Model Research\Swiftly\processed\v0.6\user_data\users_with_demographics.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
print len(choices)

382018


In [23]:
user_info = user_info.set_index('user_id')
choices = choices.set_index('user_id')
choices['master_user_id'] = user_info['master_user_id']
choices['gender'] = user_info['gender']
choices['age'] = user_info['age']
choices['age_cat'] = user_info['age_cat']
choices['hhinc_cat'] = user_info['hhinc_cat']
choices['race'] = user_info['race']
choices['employ'] = user_info['employ']
choices['hhinc'] = user_info['hhinc']
choices['hhsize'] = user_info['hhsize']
choices['hhvehs'] = user_info['hhvehs']
choices['female'] = 0
choices.loc[choices['gender']==2,'female'] = 1
choices = choices.reset_index()

In [24]:
print len(choices)

382018


In [25]:
# set up estimation set.
estimation_set = pd.DataFrame(index=choices['search_id'].drop_duplicates())
chosen_mode = choices[choices['chosen']==1]
chosen_mode = chosen_mode.set_index('search_id')
estimation_set['chosen_mode'] = chosen_mode['travel_mode']
estimation_set['chosen_mode2'] = chosen_mode['travel_mode']
estimation_set['chosen_mode3'] = chosen_mode['travel_mode']
estimation_set.loc[estimation_set['chosen_mode']=='rideshare','chosen_mode'] = chosen_mode['uber_mode_class']
estimation_set.loc[estimation_set['chosen_mode2']=='rideshare','chosen_mode2'] = chosen_mode['uber_mode_class2']
estimation_set.loc[estimation_set['chosen_mode3']=='rideshare','chosen_mode3'] = "uber"

estimation_set = estimation_set.dropna()
estimation_set.groupby(['chosen_mode3']).count().max(axis=1)

chosen_mode3
biking       107
scoot        348
transit    19189
uber         779
walking     1571
dtype: int64

In [26]:
estimation_set['start_is_near_home'] = chosen_mode['start_is_near_home'].replace(-1, np.nan)
estimation_set['start_is_near_work'] = chosen_mode['start_is_near_work'].replace(-1, np.nan)
estimation_set['end_is_near_home'] = chosen_mode['end_is_near_home'].replace(-1, np.nan)
estimation_set['end_is_near_work'] = chosen_mode['end_is_near_work'].replace(-1, np.nan)

# add demographcis to estimation set
estimation_set['master_user_id'] = chosen_mode['master_user_id']
estimation_set['age_cat'] = chosen_mode['age_cat']
estimation_set['age'] = chosen_mode['age']
estimation_set['hhinc_cat'] = chosen_mode['hhinc_cat']
estimation_set['hhinc'] = chosen_mode['hhinc']
estimation_set['employ'] = chosen_mode['employ']
estimation_set['hhvehs'] = chosen_mode['hhvehs']
estimation_set['female'] = chosen_mode['female']

# convert to dummies
estimation_set = pd.get_dummies(estimation_set,columns=['master_user_id','age_cat','hhinc_cat','employ'], dummy_na=True, drop_first=True)

# get dummy columns
user_dummy_cols = []
age_dummy_cols = []
inc_dummy_cols = []
employ_dummy_cols = []
for col in estimation_set.columns.tolist():
    if col[:14] == 'master_user_id':
        user_dummy_cols.append(col)
    elif col[:7] == 'age_cat':
        age_dummy_cols.append(col)
    elif col[:9] == 'hhinc_cat':
        inc_dummy_cols.append(col)
    elif col[:6] == 'employ':
        employ_dummy_cols.append(col)
        
estimation_set['master_user_id'] = chosen_mode['master_user_id']
print age_dummy_cols
print inc_dummy_cols
print employ_dummy_cols

['age_cat_18-24', 'age_cat_25-34', 'age_cat_35-44', 'age_cat_45-54', 'age_cat_55-64', 'age_cat_65+', 'age_cat_nan']
['hhinc_cat_$25-$50', 'hhinc_cat_$50-$75', 'hhinc_cat_$75+', 'hhinc_cat_nan']
['employ_2.0', 'employ_3.0', 'employ_4.0', 'employ_5.0', 'employ_nan']


In [27]:
transit_options = choices[choices['travel_mode']=='transit']

In [28]:
transit_options.groupby(['travel_provider'])['chosen'].sum()

travel_provider
Alcatraz Hornblower Ferry                            0.0
Altamont Commuter Express                            0.0
Bay Area Rapid Transit                            1917.0
Bear Transit - UC Berkeley Shuttle                   6.0
Blue & Gold Fleet                                    1.0
Caltrain                                           109.0
Commute.org Shuttle                                  6.0
County Connection                                   13.0
Golden Gate Ferry                                    0.0
Marin Transit                                        6.0
Mountain View Community Shuttle                      3.0
PresidiGo Shuttle                                   37.0
SamTrans                                           151.0
San Francisco Bay Ferry                              2.0
San Francisco Municipal Transportation Agency    16428.0
Santa Cruz Metro                                     5.0
SolTrans                                             2.0
VTA            

In [29]:
trn_sch_group = transit_options.groupby(['search_id'])
best_transit = transit_options.loc[trn_sch_group['travel_time'].idxmin(),:]
best_transit = best_transit.set_index('search_id')

In [30]:
estimation_set['transit_cost'] = best_transit['travel_price']
estimation_set['transit_time'] = best_transit['travel_time'] / 60.0
estimation_set['transit_dist'] = best_transit['travel_distance'] / 5280.0
estimation_set['wait'] = best_transit['wait']
estimation_set['headway'] = best_transit['headway']
estimation_set['walk'] = best_transit['walk_time']
estimation_set['n_arr'] = best_transit['n_arrivals']

In [31]:
n_transit = transit_options.groupby(['search_id']).count().max(axis=1)

In [32]:
estimation_set['n_transit'] = n_transit

In [33]:
estimation_set = estimation_set.dropna(subset=['transit_cost','transit_time','transit_dist'])

In [34]:
estimation_set['wait'] = estimation_set['wait'].fillna(999)

In [35]:
estimation_set

Unnamed: 0_level_0,chosen_mode,chosen_mode2,chosen_mode3,start_is_near_home,start_is_near_work,end_is_near_home,end_is_near_work,age,hhinc,hhvehs,...,employ_nan,master_user_id,transit_cost,transit_time,transit_dist,wait,headway,walk,n_arr,n_transit
search_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
372006,walking,walking,walking,0.0,0.0,0.0,0.0,,,,...,1.0,,2.25,6.916667,0.249432,15.566667,11.600000,0.833333,3.0,1.0
379254,walking,walking,walking,0.0,0.0,1.0,0.0,,,,...,1.0,,2.25,16.650000,0.622159,23.916667,26.200000,8.000000,2.0,4.0
349759,transit,transit,transit,1.0,0.0,0.0,0.0,,,,...,1.0,,2.25,26.233333,0.969697,21.533333,11.650000,0.016667,3.0,1.0
350042,transit,transit,transit,0.0,0.0,1.0,0.0,,,,...,1.0,,2.25,19.516667,0.862121,16.983333,19.250000,0.016667,2.0,3.0
350052,transit,transit,transit,0.0,0.0,0.0,0.0,,,,...,1.0,,2.25,4.900000,0.123485,15.950000,19.666667,0.016667,2.0,3.0
430969,transit,transit,transit,1.0,0.0,0.0,1.0,,,,...,1.0,,2.25,42.016667,1.542424,2.400000,6.850000,3.416667,3.0,3.0
414984,transit,transit,transit,0.0,0.0,0.0,0.0,,,,...,1.0,,2.25,44.183333,2.141098,999.000000,99.000000,3.316667,0.0,2.0
365085,transit,transit,transit,0.0,,0.0,,,,,...,1.0,,2.25,21.533333,0.868939,2.550000,20.033333,3.583333,2.0,3.0
365086,transit,transit,transit,0.0,,0.0,,,,,...,1.0,,2.25,26.333333,1.011553,9.566667,13.666667,4.100000,3.0,3.0
365103,walking,walking,walking,0.0,,0.0,,,,,...,1.0,,2.25,9.150000,0.236174,9.633333,19.700000,1.833333,2.0,3.0


In [36]:
choices.groupby('uber_mode_class')['chosen'].sum()

uber_mode_class
basic      444.0
other       43.0
pool       363.0
premium    335.0
taxi        49.0
Name: chosen, dtype: float64

In [37]:
g = choices.groupby(['travel_provider','travel_product'])
s = g['chosen'].sum()
s.loc[['Uber',]]

travel_provider  travel_product
Uber             assist             37.0
                 uberblack         154.0
                 uberpool          363.0
                 uberselect        103.0
                 ubersuv            78.0
                 ubertaxi           49.0
                 uberwav             6.0
                 uberx             373.0
                 uberxl             71.0
Name: chosen, dtype: float64

In [38]:
uber_grouped = choices[choices['travel_provider']=='Uber'].groupby(['search_id','uber_mode_class'])

In [39]:
best_uber = uber_grouped['travel_price'].idxmin()

In [40]:
choices.loc[best_uber,['search_id','travel_provider','uber_mode_class','travel_product','travel_price','travel_distance']]

Unnamed: 0,search_id,travel_provider,uber_mode_class,travel_product,travel_price,travel_distance
224920,309474,Uber,basic,uberx,6.55,2144.0
224921,309474,Uber,premium,uberselect,13.49,2144.0
224925,309474,Uber,taxi,ubertaxi,6.55,2144.0
100855,309475,Uber,basic,uberx,7.48,3335.0
100856,309475,Uber,premium,uberselect,15.77,3335.0
78546,309477,Uber,basic,uberx,38.95,39564.0
78547,309477,Uber,premium,uberselect,90.36,39564.0
59904,309481,Uber,basic,uberx,22.74,20872.0
59907,309481,Uber,premium,uberselect,51.93,20872.0
310828,309487,Uber,basic,uberx,15.29,10647.0


In [41]:
best_uber_idx = copy.deepcopy(best_uber)

In [42]:
best_uber = choices.loc[best_uber_idx,['search_id','travel_provider','uber_mode_class','travel_product','travel_price','travel_distance','travel_time']]

In [43]:
best_uber

Unnamed: 0,search_id,travel_provider,uber_mode_class,travel_product,travel_price,travel_distance,travel_time
224920,309474,Uber,basic,uberx,6.55,2144.0,393.0
224921,309474,Uber,premium,uberselect,13.49,2144.0,393.0
224925,309474,Uber,taxi,ubertaxi,6.55,2144.0,393.0
100855,309475,Uber,basic,uberx,7.48,3335.0,422.0
100856,309475,Uber,premium,uberselect,15.77,3335.0,422.0
78546,309477,Uber,basic,uberx,38.95,39564.0,1945.0
78547,309477,Uber,premium,uberselect,90.36,39564.0,1945.0
59904,309481,Uber,basic,uberx,22.74,20872.0,1166.0
59907,309481,Uber,premium,uberselect,51.93,20872.0,1166.0
310828,309487,Uber,basic,uberx,15.29,10647.0,1128.0


In [44]:
price = best_uber.pivot(index='search_id',columns='uber_mode_class',values='travel_price')
dist = best_uber.pivot(index='search_id',columns='uber_mode_class',values='travel_distance')
time = best_uber.pivot(index='search_id',columns='uber_mode_class',values='travel_time')

In [45]:
estimation_set['uber_basic_cost'] = price['basic']
estimation_set['uber_basic_dist'] = dist['basic'] / 5280.0
estimation_set['uber_basic_time'] = time['basic'] / 60.0

estimation_set['uber_premium_cost'] = price['premium']
estimation_set['uber_premium_dist'] = dist['premium'] / 5280.0
estimation_set['uber_premium_time'] = time['premium'] / 60.0

estimation_set['uber_taxi_cost'] = price['taxi']
estimation_set['uber_taxi_dist'] = dist['taxi'] / 5280.0
estimation_set['uber_taxi_time'] = time['taxi'] / 60.0

estimation_set['uber_other_cost'] = price['other']
estimation_set['uber_other_dist'] = dist['other'] / 5280.0
estimation_set['uber_other_time'] = time['other'] / 60.0

In [46]:
estimation_set['timestamp'] = choices.groupby('search_id')['timestamp'].first()

In [47]:
dti = pd.DatetimeIndex(estimation_set['timestamp'])

In [48]:
dti = dti.tz_localize('UTC')

In [49]:
dti_uspac =dti.tz_convert('US/Pacific')

In [50]:
estimation_set['dayofweek'] = dti_uspac.dayofweek
estimation_set['time'] = dti_uspac.time
estimation_set['date'] = dti_uspac.date
estimation_set['hour'] = dti_uspac.hour
estimation_set['late_night'] = 0
estimation_set.loc[estimation_set['hour'].isin([22,23,0,1,2,3]),'late_night'] = 1

In [51]:
estimation_set['chosen_mode2'].drop_duplicates()

search_id
372006    walking
349759    transit
409169      basic
385323      scoot
388246    premium
364796     biking
Name: chosen_mode2, dtype: object

In [52]:
estimation_subset = estimation_set[estimation_set['chosen_mode3'].isin(['transit','uber'])]

In [53]:
estimation_subset.groupby('chosen_mode').count().max(axis=1)

chosen_mode
basic        255
premium      202
transit    19189
dtype: int64

In [54]:
estimation_subset.groupby(['chosen_mode3']).count().max(axis=1)

chosen_mode3
transit    19189
uber         457
dtype: int64

In [55]:
estimation_set.columns

Index([u'chosen_mode', u'chosen_mode2', u'chosen_mode3', u'start_is_near_home',
       u'start_is_near_work', u'end_is_near_home', u'end_is_near_work', u'age',
       u'hhinc', u'hhvehs',
       ...
       u'uber_taxi_time', u'uber_other_cost', u'uber_other_dist',
       u'uber_other_time', u'timestamp', u'dayofweek', u'time', u'date',
       u'hour', u'late_night'],
      dtype='object', length=961)

In [56]:
subset2 = estimation_set[estimation_set['chosen_mode3'].isin(['transit','uber'])]
subset2['walkwait'] = subset2['walk'] + subset2['wait']
subset2['ln_wait'] = np.log(subset2['wait']+1)
subset2['ln_walk'] = np.log(subset2['walk']+1)
subset2['ln_walkwait'] = np.log(subset2['walkwait']+1)
subset2['ln_trn_cost'] = np.log(subset2['transit_cost']+1)
subset2['ln_trn_time'] = np.log(subset2['transit_time']+1)
subset2['ln_ubr_time'] = np.log(subset2['uber_basic_time']+1)
subset2['ln_headway'] = np.log(subset2['headway']+1)
subset2['tot_trn_time'] = subset2['walk'] + subset2['wait'] + subset2['transit_time']
subset2['uber_time_save'] = subset2['tot_trn_time'] - subset2['uber_basic_time']
subset2['uber_pct_time_save'] = subset2['uber_time_save'] / subset2['tot_trn_time']
subset2['start_or_end_near_home'] = np.nan
subset2.loc[(subset2['start_is_near_home']==0) & (subset2['end_is_near_home']==0),'start_or_end_near_home'] = 0
subset2.loc[(subset2['start_is_near_home']==1) | (subset2['end_is_near_home']==1),'start_or_end_near_home'] = 1
subset2['start_or_end_near_work'] = np.nan
subset2.loc[(subset2['start_is_near_work']==0) & (subset2['end_is_near_work']==0),'start_or_end_near_work'] = 0
subset2.loc[(subset2['start_is_near_work']==1) | (subset2['end_is_near_work']==1),'start_or_end_near_work'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

In [57]:
exog_cols = ['n_transit','walkwait','ln_ubr_time']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit()
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105802
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19637.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04249
Time:,11:18:10,Log-Likelihood:,-2078.1
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,9.662e-40

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5829,0.052,-11.108,0.0,-0.686 -0.480
walkwait,0.0006,9.82e-05,6.47,0.0,0.000 0.001
ln_ubr_time,0.1644,0.079,2.075,0.038,0.009 0.320
const,-2.901,0.264,-10.997,0.0,-3.418 -2.384


In [58]:
exog_cols = ['n_transit','ln_walkwait','transit_time','late_night']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit()
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105330
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19589.0
Model:,MNLogit,Df Residuals:,19584.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04546
Time:,11:18:10,Log-Likelihood:,-2063.3
converged:,True,LL-Null:,-2161.6
,,LLR p-value:,2.1049999999999998e-41

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5744,0.053,-10.888,0.0,-0.678 -0.471
ln_walkwait,0.135,0.021,6.534,0.0,0.094 0.175
transit_time,0.0046,0.002,2.176,0.03,0.000 0.009
late_night,0.4363,0.167,2.606,0.009,0.108 0.764
const,-2.9335,0.175,-16.788,0.0,-3.276 -2.591


In [59]:
exog_cols = ['n_transit','ln_walkwait','transit_time','transit_cost','uber_basic_time','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit()
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105403
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19584.0
Model:,MNLogit,Df Residuals:,19577.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04499
Time:,11:18:10,Log-Likelihood:,-2064.2
converged:,True,LL-Null:,-2161.5
,,LLR p-value:,2.824e-39

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5868,0.054,-10.862,0.0,-0.693 -0.481
ln_walkwait,0.1352,0.021,6.548,0.0,0.095 0.176
transit_time,0.0036,0.004,0.944,0.345,-0.004 0.011
transit_cost,0.0559,0.029,1.913,0.056,-0.001 0.113
uber_basic_time,0.0106,0.009,1.181,0.238,-0.007 0.028
uber_basic_cost,-0.0162,0.013,-1.283,0.2,-0.041 0.009
const,-2.9744,0.18,-16.504,0.0,-3.328 -2.621


In [60]:
exog_cols = ['n_transit','n_arr']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit()
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105985
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19643.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04064
Time:,11:18:10,Log-Likelihood:,-2082.2
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,4.901e-39

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5897,0.052,-11.276,0.0,-0.692 -0.487
n_arr,-0.2181,0.035,-6.244,0.0,-0.287 -0.150
const,-1.8346,0.14,-13.15,0.0,-2.108 -1.561


In [61]:
exog_cols = ['n_transit','headway']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit()
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106076
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19643.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.03982
Time:,11:18:11,Log-Likelihood:,-2084.0
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,2.896e-38

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5894,0.052,-11.292,0.0,-0.692 -0.487
headway,0.0064,0.001,5.936,0.0,0.004 0.009
const,-2.5223,0.146,-17.265,0.0,-2.809 -2.236


In [62]:
exog_cols = ['n_transit','walkwait','transit_time','uber_basic_time']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105762
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19636.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04285
Time:,11:18:11,Log-Likelihood:,-2077.3
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,3.812e-39

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5733,0.053,-10.853,0.0,-0.677 -0.470
walkwait,0.0006,9.86e-05,6.347,0.0,0.000 0.001
transit_time,0.0043,0.003,1.388,0.165,-0.002 0.010
uber_basic_time,0.0022,0.005,0.412,0.681,-0.008 0.013
const,-2.6325,0.162,-16.251,0.0,-2.950 -2.315


In [63]:
exog_cols = ['n_transit','headway','tot_trn_time','uber_basic_time']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105776
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19636.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04273
Time:,11:18:11,Log-Likelihood:,-2077.5
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,4.991e-39

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5737,0.053,-10.876,0.0,-0.677 -0.470
headway,0.0019,0.002,0.922,0.356,-0.002 0.006
tot_trn_time,0.0005,0.0,2.671,0.008,0.000 0.001
uber_basic_time,0.0068,0.004,1.903,0.057,-0.000 0.014
const,-2.6461,0.167,-15.876,0.0,-2.973 -2.319


In [64]:
# try everything except late_night
exog_cols = ['n_transit','n_arr','walk','wait','headway','transit_time','uber_basic_time','transit_cost','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105631
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19631.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04404
Time:,11:18:11,Log-Likelihood:,-2074.7
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,2.363e-36

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5788,0.054,-10.705,0.0,-0.685 -0.473
n_arr,0.0629,0.202,0.311,0.756,-0.333 0.459
walk,-0.004,0.013,-0.316,0.752,-0.029 0.021
wait,0.0006,0.0,1.992,0.046,8.97e-06 0.001
headway,0.003,0.005,0.641,0.521,-0.006 0.012
transit_time,0.0044,0.004,1.101,0.271,-0.003 0.012
uber_basic_time,0.0095,0.009,1.058,0.29,-0.008 0.027
transit_cost,0.0585,0.029,2.012,0.044,0.002 0.115
uber_basic_cost,-0.0161,0.013,-1.274,0.203,-0.041 0.009
const,-2.9183,0.646,-4.515,0.0,-4.185 -1.651


In [65]:
# drop n_arr
exog_cols = ['n_transit','walk','wait','headway','transit_time','uber_basic_time','transit_cost','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105634
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19632.0
Method:,MLE,Df Model:,8.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04402
Time:,11:18:11,Log-Likelihood:,-2074.8
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,4.8869999999999995e-37

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5794,0.054,-10.719,0.0,-0.685 -0.473
walk,-0.0038,0.013,-0.302,0.762,-0.029 0.021
wait,0.0005,0.0,2.653,0.008,0.000 0.001
headway,0.0017,0.002,0.83,0.406,-0.002 0.006
transit_time,0.0044,0.004,1.099,0.272,-0.003 0.012
uber_basic_time,0.0096,0.009,1.072,0.284,-0.008 0.027
transit_cost,0.0584,0.029,2.01,0.044,0.001 0.115
uber_basic_cost,-0.0161,0.013,-1.277,0.202,-0.041 0.009
const,-2.7244,0.171,-15.967,0.0,-3.059 -2.390


In [66]:
# drop walk
exog_cols = ['n_transit','wait','headway','transit_time','uber_basic_time','transit_cost','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105636
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19633.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04399
Time:,11:18:11,Log-Likelihood:,-2074.8
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,9.391e-38

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5804,0.054,-10.763,0.0,-0.686 -0.475
wait,0.0005,0.0,2.666,0.008,0.000 0.001
headway,0.0017,0.002,0.816,0.415,-0.002 0.006
transit_time,0.0041,0.004,1.054,0.292,-0.003 0.012
uber_basic_time,0.0097,0.009,1.076,0.282,-0.008 0.027
transit_cost,0.0589,0.029,2.03,0.042,0.002 0.116
uber_basic_cost,-0.016,0.013,-1.265,0.206,-0.041 0.009
const,-2.727,0.17,-15.996,0.0,-3.061 -2.393


In [67]:
# drop headway
exog_cols = ['n_transit','wait','transit_time','uber_basic_time','transit_cost','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105652
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19634.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04385
Time:,11:18:12,Log-Likelihood:,-2075.1
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,2.18e-38

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5801,0.054,-10.754,0.0,-0.686 -0.474
wait,0.0006,9.89e-05,6.241,0.0,0.000 0.001
transit_time,0.0041,0.004,1.05,0.294,-0.004 0.012
uber_basic_time,0.009,0.009,1.003,0.316,-0.009 0.026
transit_cost,0.0606,0.029,2.088,0.037,0.004 0.118
uber_basic_cost,-0.0153,0.013,-1.217,0.224,-0.040 0.009
const,-2.6928,0.165,-16.331,0.0,-3.016 -2.370


In [68]:
# drop uber time
exog_cols = ['n_transit','wait','transit_time','transit_cost','uber_basic_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105678
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19641.0
Model:,MNLogit,Df Residuals:,19635.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04362
Time:,11:18:12,Log-Likelihood:,-2075.6
converged:,True,LL-Null:,-2170.3
,,LLR p-value:,5.468e-39

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5738,0.053,-10.73,0.0,-0.679 -0.469
wait,0.0006,9.9e-05,6.221,0.0,0.000 0.001
transit_time,0.0039,0.004,1.021,0.307,-0.004 0.011
transit_cost,0.0556,0.029,1.933,0.053,-0.001 0.112
uber_basic_cost,-0.0054,0.008,-0.709,0.478,-0.020 0.010
const,-2.6671,0.162,-16.467,0.0,-2.985 -2.350


In [69]:
# drop transit_time
exog_cols = ['n_transit','wait','transit_cost']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.105683
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19642.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04338
Time:,11:18:12,Log-Likelihood:,-2076.2
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,1.42e-40

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.569,0.053,-10.787,0.0,-0.672 -0.466
wait,0.0006,9.89e-05,6.234,0.0,0.000 0.001
transit_cost,0.061,0.02,3.053,0.002,0.022 0.100
const,-2.6575,0.158,-16.807,0.0,-2.967 -2.348


In [70]:
# drop uber cost
exog_cols = ['n_transit','walkwait','start_or_end_near_work','start_or_end_near_home']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106765
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16031.0
Model:,MNLogit,Df Residuals:,16026.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04976
Time:,11:18:12,Log-Likelihood:,-1711.6
converged:,True,LL-Null:,-1801.2
,,LLR p-value:,1.081e-37

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5659,0.058,-9.758,0.0,-0.680 -0.452
walkwait,0.0007,0.0,6.769,0.0,0.001 0.001
start_or_end_near_work,0.5787,0.105,5.518,0.0,0.373 0.784
start_or_end_near_home,0.2435,0.109,2.229,0.026,0.029 0.458
const,-2.8788,0.177,-16.262,0.0,-3.226 -2.532


In [71]:
exog_cols = ['start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.111069
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16259.0
Model:,MNLogit,Df Residuals:,16257.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.006561
Time:,11:18:12,Log-Likelihood:,-1805.9
converged:,True,LL-Null:,-1817.8
,,LLR p-value:,1.04e-06

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
start_or_end_near_work,0.5078,0.103,4.912,0.0,0.305 0.710
const,-3.9413,0.072,-54.37,0.0,-4.083 -3.799


In [72]:
# try everything except late_night
exog_cols = ['n_transit','n_arr','walk','wait','headway','transit_time','uber_basic_time','transit_cost','uber_basic_cost','start_or_end_near_home','start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106675
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16026.0
Model:,MNLogit,Df Residuals:,16014.0
Method:,MLE,Df Model:,11.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.05079
Time:,11:18:12,Log-Likelihood:,-1709.6
converged:,True,LL-Null:,-1801.1
,,LLR p-value:,2.514e-33

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5445,0.06,-9.142,0.0,-0.661 -0.428
n_arr,-0.0417,0.221,-0.188,0.851,-0.475 0.392
walk,-0.0004,0.015,-0.024,0.981,-0.029 0.028
wait,0.0006,0.0,1.884,0.06,-2.34e-05 0.001
headway,0.0002,0.005,0.032,0.974,-0.010 0.010
transit_time,-0.0029,0.005,-0.602,0.547,-0.013 0.007
uber_basic_time,-0.0124,0.011,-1.182,0.237,-0.033 0.008
transit_cost,0.0152,0.044,0.345,0.73,-0.071 0.102
uber_basic_cost,0.0208,0.015,1.358,0.174,-0.009 0.051
start_or_end_near_home,0.2658,0.111,2.387,0.017,0.048 0.484


In [73]:
# drop walk, headway, n_arr
exog_cols = ['n_transit','wait','transit_time','uber_basic_time','transit_cost','uber_basic_cost','start_or_end_near_home','start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106683
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16026.0
Model:,MNLogit,Df Residuals:,16017.0
Method:,MLE,Df Model:,8.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.05073
Time:,11:18:13,Log-Likelihood:,-1709.7
converged:,True,LL-Null:,-1801.1
,,LLR p-value:,2.7540000000000003e-35

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.544,0.059,-9.163,0.0,-0.660 -0.428
wait,0.0007,0.0,6.526,0.0,0.000 0.001
transit_time,-0.003,0.005,-0.643,0.52,-0.012 0.006
uber_basic_time,-0.0129,0.01,-1.234,0.217,-0.033 0.008
transit_cost,0.0165,0.044,0.373,0.709,-0.070 0.103
uber_basic_cost,0.0213,0.015,1.391,0.164,-0.009 0.051
start_or_end_near_home,0.2676,0.111,2.406,0.016,0.050 0.486
start_or_end_near_work,0.6074,0.108,5.631,0.0,0.396 0.819
const,-2.9769,0.204,-14.601,0.0,-3.377 -2.577


In [74]:
# drop transit_cost, transit_time
exog_cols = ['n_transit','wait','uber_basic_time','uber_basic_cost','start_or_end_near_home','start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106698
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16026.0
Model:,MNLogit,Df Residuals:,16019.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.05059
Time:,11:18:13,Log-Likelihood:,-1709.9
converged:,True,LL-Null:,-1801.1
,,LLR p-value:,1.1370000000000001e-36

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5459,0.059,-9.24,0.0,-0.662 -0.430
wait,0.0007,0.0,6.551,0.0,0.000 0.001
uber_basic_time,-0.0135,0.01,-1.321,0.186,-0.033 0.007
uber_basic_cost,0.0196,0.011,1.713,0.087,-0.003 0.042
start_or_end_near_home,0.2541,0.109,2.322,0.02,0.040 0.469
start_or_end_near_work,0.6009,0.107,5.596,0.0,0.390 0.811
const,-2.9656,0.197,-15.031,0.0,-3.352 -2.579


In [75]:
# wait -> walkwait
exog_cols = ['n_transit','walkwait','uber_basic_time','uber_basic_cost','start_or_end_near_home','start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106699
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16026.0
Model:,MNLogit,Df Residuals:,16019.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.05059
Time:,11:18:13,Log-Likelihood:,-1710.0
converged:,True,LL-Null:,-1801.1
,,LLR p-value:,1.1500000000000001e-36

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5462,0.059,-9.245,0.0,-0.662 -0.430
walkwait,0.0007,0.0,6.549,0.0,0.000 0.001
uber_basic_time,-0.0134,0.01,-1.319,0.187,-0.033 0.007
uber_basic_cost,0.0195,0.011,1.707,0.088,-0.003 0.042
start_or_end_near_home,0.2537,0.109,2.318,0.02,0.039 0.468
start_or_end_near_work,0.6007,0.107,5.594,0.0,0.390 0.811
const,-2.9659,0.197,-15.03,0.0,-3.353 -2.579


In [76]:
# drop uber_basic_time, uber_basic_cost
exog_cols = ['n_transit','walkwait','start_or_end_near_work','start_or_end_near_home','late_night']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106564
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16031.0
Model:,MNLogit,Df Residuals:,16025.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.05155
Time:,11:18:13,Log-Likelihood:,-1708.3
converged:,True,LL-Null:,-1801.2
,,LLR p-value:,3.235e-38

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5596,0.058,-9.654,0.0,-0.673 -0.446
walkwait,0.0007,0.0,6.733,0.0,0.001 0.001
start_or_end_near_work,0.5979,0.105,5.678,0.0,0.392 0.804
start_or_end_near_home,0.2262,0.11,2.065,0.039,0.011 0.441
late_night,0.5189,0.192,2.703,0.007,0.143 0.895
const,-2.9255,0.178,-16.419,0.0,-3.275 -2.576


In [77]:
# drop uber_basic_time, uber_basic_cost
exog_cols = ['n_transit','wait','start_or_end_near_work']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.106457
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,16259.0
Model:,MNLogit,Df Residuals:,16255.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.04781
Time:,11:18:13,Log-Likelihood:,-1730.9
converged:,True,LL-Null:,-1817.8
,,LLR p-value:,1.904e-37

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
n_transit,-0.5616,0.058,-9.735,0.0,-0.675 -0.449
wait,0.0007,0.0,6.799,0.0,0.001 0.001
start_or_end_near_work,0.5626,0.104,5.394,0.0,0.358 0.767
const,-2.7365,0.162,-16.896,0.0,-3.054 -2.419


In [78]:
exog_cols = ['walkwait']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=100)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.109119
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19644.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.01227
Time:,11:18:14,Log-Likelihood:,-2143.8
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,2.892e-13

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
walkwait,0.0007,9.73e-05,7.514,0.0,0.001 0.001
const,-3.9842,0.062,-64.002,0.0,-4.106 -3.862


In [79]:
exog_cols = ['female'] + age_dummy_cols[:-1] + inc_dummy_cols[:-1]
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=500)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.110271
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19635.0
Method:,MLE,Df Model:,10.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.001844
Time:,11:18:14,Log-Likelihood:,-2166.4
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,0.6285

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
female,0.0948,0.325,0.292,0.77,-0.541 0.731
age_cat_18-24,0.0114,0.499,0.023,0.982,-0.967 0.990
age_cat_25-34,0.7462,0.392,1.904,0.057,-0.022 1.514
age_cat_35-44,0.6662,0.5,1.332,0.183,-0.314 1.647
age_cat_45-54,0.0199,0.751,0.026,0.979,-1.451 1.491
age_cat_55-64,1.2127,0.629,1.927,0.054,-0.021 2.446
age_cat_65+,1.0979,1.071,1.025,0.305,-1.002 3.197
hhinc_cat_$25-$50,-0.4863,0.428,-1.136,0.256,-1.325 0.353
hhinc_cat_$50-$75,-0.5806,0.488,-1.19,0.234,-1.537 0.376
hhinc_cat_$75+,-0.6991,0.432,-1.617,0.106,-1.546 0.148


In [80]:
exog_cols = ['female','age_cat_25-34','age_cat_55-64','hhinc_cat_$75+']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=500)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.110354
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19641.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.001094
Time:,11:18:14,Log-Likelihood:,-2168.0
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,0.3142

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
female,0.0706,0.276,0.255,0.798,-0.471 0.612
age_cat_25-34,0.3871,0.277,1.397,0.162,-0.156 0.930
age_cat_55-64,0.8686,0.558,1.557,0.12,-0.225 1.962
hhinc_cat_$75+,-0.3055,0.342,-0.893,0.372,-0.976 0.365
const,-3.7535,0.049,-76.623,0.0,-3.850 -3.657


In [81]:
exog_cols = ['age_cat_25-34','age_cat_55-64','hhinc_cat_$75+']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=500)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.110356
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19642.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.001079
Time:,11:18:14,Log-Likelihood:,-2168.1
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,0.1965

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
age_cat_25-34,0.4194,0.246,1.704,0.088,-0.063 0.902
age_cat_55-64,0.9167,0.525,1.747,0.081,-0.112 1.945
hhinc_cat_$75+,-0.2939,0.339,-0.866,0.387,-0.959 0.371
const,-3.7524,0.049,-76.947,0.0,-3.848 -3.657


In [82]:
exog_cols = ['age_cat_25-34','age_cat_55-64']
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=500)
s2_fit.summary()

Optimization terminated successfully.
         Current function value: 0.110376
         Iterations 8


0,1,2,3
Dep. Variable:,chosen_mode3,No. Observations:,19646.0
Model:,MNLogit,Df Residuals:,19643.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 20 Jul 2016",Pseudo R-squ.:,0.0008952
Time:,11:18:15,Log-Likelihood:,-2168.5
converged:,True,LL-Null:,-2170.4
,,LLR p-value:,0.1433

chosen_mode3=uber,coef,std err,z,P>|z|,[95.0% Conf. Int.]
age_cat_25-34,0.3291,0.227,1.45,0.147,-0.116 0.774
age_cat_55-64,0.8242,0.515,1.599,0.11,-0.186 1.834
const,-3.7554,0.049,-77.157,0.0,-3.851 -3.660


In [83]:
exog_cols = ['n_transit','walkwait','transit_time'] + user_dummy_cols[1:]
sub3 = subset2[pd.notnull(subset2['master_user_id'])]
defn = sub3.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit_regularized(maxiter=50,)
s2_fit.summary()

Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.0296224941782
            Iterations: 51
            Function evaluations: 63
            Gradient evaluations: 51


LinAlgError: Singular matrix

In [None]:
exog_cols = user_dummy_cols[1:]
defn = subset2.dropna(subset=exog_cols)
y = defn['chosen_mode3']
x = defn.loc[:,exog_cols]
x['const'] = 1

s2_mdl = st.MNLogit(y, x)
s2_fit = s2_mdl.fit(maxiter=500)
s2_fit.summary()