In [5]:
import pandas as pd
import numpy as np
import random
import sklearn
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn import metrics

In [7]:
# Columns that get one-hot encoded, in the order their dummy columns are emitted.
_CATEGORICAL_FEATURES = ['station', 'day_of_week', 'day', 'hour']

# Columns handed to the model as they are.
_PASSTHROUGH_FEATURES = [
    'seconds_from_hour',
    'Temp.x',
    'Rain.x',
    'avgBikeIn',
    'avgBikeOut',
    'avgwait',
    'Total.docks',
    'cluster1',
    'cluster2',
    'cluster3',
    'cluster4',
    'cluster5',
    'cluster6',
]


def _arrival_features_and_labels(df, wait_threshold):
    """Return (features, labels) for the usable rows of ``df`` without modifying ``df``."""
    # Keep rows where a bike arrived (increment == 1), the wait time is non-zero
    # and the dock count is known.
    # NOTE(review): a missing wait_time passes the `!= 0` filter and is then
    # labelled 0 -- confirm whether such rows should be dropped instead.
    keep = (
        (df['increment'] == 1)
        & (df['wait_time'] != 0)
        & df['Total.docks'].notnull()
    )
    rows = df.loc[keep]
    # assign() builds a new frame, so the caller's DataFrame gains no column.
    rows = rows.assign(seconds_from_hour=60 * rows['minute'] + rows['second'])
    labels = (rows['wait_time'] > wait_threshold).astype(int)
    return rows[_CATEGORICAL_FEATURES + _PASSTHROUGH_FEATURES], labels


def _one_hot(features, levels):
    """One-hot encode the columns named in ``levels`` using the given category levels."""
    typed = features.assign(**{
        column: pd.Categorical(features[column], categories=categories)
        for column, categories in levels.items()
    })
    return pd.get_dummies(typed, columns=list(levels), drop_first=True)


def getModel(df_train, df_test, wait_threshold=300):
    """Fit a cross-validated logistic regression that flags long wait times.

    Rows are used only when ``increment == 1``, ``wait_time != 0`` and
    ``Total.docks`` is not null. The target is 1 when ``wait_time`` is strictly
    greater than ``wait_threshold`` and 0 otherwise.

    The categorical columns (station, day_of_week, day, hour) are one-hot
    encoded with category levels taken from the training rows only, and the
    same levels are applied to the test rows. Both design matrices therefore
    share identical columns and the same dropped baseline level. A category
    that appears only in the test rows is encoded as all zeros, i.e. it is
    treated like the baseline.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain ``increment``, ``wait_time``, ``minute``, ``second`` and
        every column listed in ``_CATEGORICAL_FEATURES`` and
        ``_PASSTHROUGH_FEATURES`` except ``seconds_from_hour``, which is
        derived here. Neither frame is modified.
    wait_threshold : int or float, default 300
        Wait time, in the units of ``wait_time``, above which a row is labelled 1.

    Returns
    -------
    model : LogisticRegressionCV
        The fitted classifier.
    y_test_pred : numpy.ndarray
        Predicted class for each retained test row.
    auc_test : float
        ROC AUC of the predicted positive-class probability on the test rows.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    X_train, y_train = _arrival_features_and_labels(df_train, wait_threshold)
    X_test, y_test = _arrival_features_and_labels(df_test, wait_threshold)

    # Encoding each frame on its own with drop_first=True can drop a different
    # baseline level in each; fixing the levels from the training rows keeps
    # the two encodings aligned column for column.
    levels = {
        column: pd.Categorical(X_train[column]).categories
        for column in _CATEGORICAL_FEATURES
    }
    X_train = _one_hot(X_train, levels)
    X_test = _one_hot(X_test, levels)

    model = LogisticRegressionCV(random_state=1, max_iter=1000)
    model.fit(X_train, y_train)

    y_test_pred = model.predict(X_test)
    auc_test = metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    return model, y_test_pred, auc_test

In [3]:
# Load the training and test tables from the station-capacity data folder.
df_train = pd.read_csv(
    filepath_or_buffer='202109-stations-capacity/all_data_train_cluster.csv'
)
df_test = pd.read_csv(
    filepath_or_buffer='202109-stations-capacity/all_data_test_cluster.csv'
)

In [8]:
# Fit the classifier; keep the fitted model, the test-set predictions and the test AUC.
m, pred, auc = getModel(df_train=df_train, df_test=df_test)

In [17]:
# Dimensions of the fitted coefficient array.
np.shape(m.coef_)

(1, 477)