In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data
file_path = Path("Resources/allmay_cleaned.csv")
allmay_df = pd.read_csv(file_path)
allmay_df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,538,2016-05-01 00:00:03,2016-05-01 00:09:02,536,40.741444,-73.975361,497,40.73705,-73.990093,23097,1,1986.0,2
1,224,2016-05-01 00:00:04,2016-05-01 00:03:49,361,40.716059,-73.991908,340,40.71269,-73.987763,23631,1,1977.0,1
2,328,2016-05-01 00:00:14,2016-05-01 00:05:43,301,40.722174,-73.983688,311,40.717227,-73.988021,23049,1,1980.0,1
3,753,2016-05-01 00:00:26,2016-05-01 00:13:00,492,40.7502,-73.990931,228,40.754601,-73.971879,16437,1,1981.0,1
4,511,2016-05-01 00:00:33,2016-05-01 00:09:05,445,40.727408,-73.98142,537,40.740259,-73.984092,20592,1,1991.0,1


In [3]:
allmay_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station latitude     float64
start station longitude    float64
end station id               int64
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                     int64
birth year                 float64
gender                       int64
dtype: object

In [4]:
#  Convert date columns to datetime from object
allmay_df['starttime'] = pd.to_datetime(allmay_df['starttime'])
allmay_df['stoptime'] = pd.to_datetime(allmay_df['stoptime'])

In [5]:
allmay_df.dtypes

tripduration                        int64
starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start station id                    int64
start station latitude            float64
start station longitude           float64
end station id                      int64
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                            int64
birth year                        float64
gender                              int64
dtype: object

In [7]:
# Drop the datetime columns
allmay_df.drop(columns=["starttime", "stoptime"], inplace=True)
allmay_df.head()

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,538,536,40.741444,-73.975361,497,40.73705,-73.990093,23097,1,1986.0,2
1,224,361,40.716059,-73.991908,340,40.71269,-73.987763,23631,1,1977.0,1
2,328,301,40.722174,-73.983688,311,40.717227,-73.988021,23049,1,1980.0,1
3,753,492,40.7502,-73.990931,228,40.754601,-73.971879,16437,1,1981.0,1
4,511,445,40.727408,-73.98142,537,40.740259,-73.984092,20592,1,1991.0,1


In [8]:
# Remove usertype target from features data
y = allmay_df['usertype'].ravel()
X = allmay_df.drop(columns=["usertype"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create a random forest classifier.
rf_model = RandomForestRegressor(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

In [None]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)