In [4]:
import pandas as pd
import numpy as np

# Load CSV data
file_path = 'csv/trippub.csv'
data = pd.read_csv(file_path)

#REFRENCE CODES: https://nhts.ornl.gov/assets/2017/doc/codebook_v1.2.pdf


# Display basic information about the dataset
print("Number of rows and columns:", data.shape)

Number of rows and columns: (923572, 115)


In [5]:
# Display the first 50 columns
print("First 50 columns:", data.columns[:50])

# Display the next 50 columns
print("\nNext 50 columns:", data.columns[50:100])

# Display the remaining columns
print("\nRemaining columns:", data.columns[100:])

First 50 columns: Index(['HOUSEID', 'PERSONID', 'TDTRPNUM', 'STRTTIME', 'ENDTIME', 'TRVLCMIN',
       'TRPMILES', 'TRPTRANS', 'TRPACCMP', 'TRPHHACC', 'VEHID', 'TRWAITTM',
       'NUMTRANS', 'TRACCTM', 'DROP_PRK', 'TREGRTM', 'WHODROVE', 'WHYFROM',
       'LOOP_TRIP', 'TRPHHVEH', 'HHMEMDRV', 'HH_ONTD', 'NONHHCNT', 'NUMONTRP',
       'PSGR_FLG', 'PUBTRANS', 'TRIPPURP', 'DWELTIME', 'TDWKND', 'VMT_MILE',
       'DRVR_FLG', 'WHYTRP1S', 'ONTD_P1', 'ONTD_P2', 'ONTD_P3', 'ONTD_P4',
       'ONTD_P5', 'ONTD_P6', 'ONTD_P7', 'ONTD_P8', 'ONTD_P9', 'ONTD_P10',
       'ONTD_P11', 'ONTD_P12', 'ONTD_P13', 'TDCASEID', 'TRACC_WLK',
       'TRACC_POV', 'TRACC_BUS', 'TRACC_CRL'],
      dtype='object')

Next 50 columns: Index(['TRACC_SUB', 'TRACC_OTH', 'TREGR_WLK', 'TREGR_POV', 'TREGR_BUS',
       'TREGR_CRL', 'TREGR_SUB', 'TREGR_OTH', 'WHYTO', 'TRAVDAY', 'HOMEOWN',
       'HHSIZE', 'HHVEHCNT', 'HHFAMINC', 'DRVRCNT', 'HHSTATE', 'HHSTFIPS',
       'NUMADLT', 'WRKCOUNT', 'TDAYDATE', 'HHRESP', 'LIF_CYC', 'MSACA

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

columns_to_drop = ['WHYTO', 'WHYFROM', 'TRIPPURP', 'WHYTRP1S', 'WHYTRP90', 'HH_CBSA']  # over half the entries are XXXXX, need to drop this
data_dropped = data.drop(columns=columns_to_drop)
data_dropped = data_dropped.map(lambda x: np.nan if ((isinstance(x, (int, float)) and x < 0) or (isinstance(x, str) and x == '-9')) else x)

# Select columns with 'category' dtype
categorical_columns = data_dropped.select_dtypes(include='object').columns
print(len(categorical_columns))
numerical_columns = data_dropped.select_dtypes(include='number').columns
print(len(numerical_columns))

for c in categorical_columns:
    data_dropped[c] = data_dropped[c].fillna(data_dropped[c].mode())

for n in numerical_columns:
    data_dropped[n] = data_dropped[n].fillna(data_dropped[n].mean())

data_dropped = pd.get_dummies(data_dropped)
data_dropped = data_dropped.replace({True:1, False:0})
    

3
106


In [15]:
# Yes, let's use WHYTRP1S since WHYTO/WHYFROM split into too many categories I think it'll make classification harder.
# TRIPPURP categories are not that useful and WHYTRP90 is from an older study, not necessary any more
target_var = 'WHYTRP1S'

X = data_dropped
y = data[target_var]

print(X.columns)
print(y)

Index(['HOUSEID', 'PERSONID', 'TDTRPNUM', 'STRTTIME', 'ENDTIME', 'TRVLCMIN',
       'TRPMILES', 'TRPTRANS', 'TRPACCMP', 'TRPHHACC',
       ...
       'OBHUR_C', 'OBHUR_R', 'OBHUR_S', 'OBHUR_T', 'OBHUR_U', 'DBHUR_C',
       'DBHUR_R', 'DBHUR_S', 'DBHUR_T', 'DBHUR_U'],
      dtype='object', length=167)
0         20
1          1
2          1
3         10
4         20
          ..
923567    10
923568    50
923569    10
923570    40
923571     1
Name: WHYTRP1S, Length: 923572, dtype: int64


In [14]:
rand = np.random.choice(923572, size=20000, replace=False)
X_sel = X.loc[rand,:]
y_sel = y[rand]

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [12]:
X_train_scale

array([[ 1.08057967, -0.70531177, -0.10613368, ..., -0.59900838,
        -0.5482714 , -0.37893576],
       [ 1.03562384, -0.70531177, -0.10613368, ..., -0.59900838,
        -0.5482714 , -0.37893576],
       [ 1.03080503,  0.37119193, -0.96682246, ..., -0.59900838,
        -0.5482714 ,  2.63896974],
       ...,
       [ 1.10000055, -0.70531177,  2.90627704, ..., -0.59900838,
        -0.5482714 , -0.37893576],
       [-0.88092061,  0.37119193,  1.61524387, ...,  1.66942572,
        -0.5482714 , -0.37893576],
       [-0.91794613,  0.37119193, -0.96682246, ..., -0.59900838,
        -0.5482714 , -0.37893576]])

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

linreg = LogisticRegression(solver='newton-cholesky')
linreg.fit(X_train_scale, y_train)
y_pred = linreg.predict(X_test_scale)
print(accuracy_score(y_test, y_pred))

0.5743550875673334


In [22]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1]
}

svc = LinearSVC(penalty='l1', dual=False, verbose=1)

grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_scale, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [10]:
from sklearn.metrics import accuracy_score

y_pred = svc.predict(X_test_scale)
print(accuracy_score(y_test, y_pred))

0.5590666702758303


In [11]:
from joblib import dump, load

dump(svc, 'LinearSVC_l1_C1.joblib')

['LinearSVC_l1_C1.joblib']

In [12]:
newsvc = load('LinearSVC_l1_C1.joblib')

In [13]:
y_pred = newsvc.predict(X_test_scale)
print(accuracy_score(y_test, y_pred))

0.5590666702758303


In [3]:
np.linspace(0.001, 0.01, 10)

array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 ])