In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

In [2]:
# Loading data
file_path = Path("Resources/allmay_cleaned.csv")
allmay_cleaned_df = pd.read_csv(file_path)
allmay_cleaned_df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,538,2016-05-01 00:00:03,2016-05-01 00:09:02,536,40.741444,-73.975361,497,40.73705,-73.990093,23097,1,1986.0,2
1,224,2016-05-01 00:00:04,2016-05-01 00:03:49,361,40.716059,-73.991908,340,40.71269,-73.987763,23631,1,1977.0,1
2,328,2016-05-01 00:00:14,2016-05-01 00:05:43,301,40.722174,-73.983688,311,40.717227,-73.988021,23049,1,1980.0,1
3,753,2016-05-01 00:00:26,2016-05-01 00:13:00,492,40.7502,-73.990931,228,40.754601,-73.971879,16437,1,1981.0,1
4,511,2016-05-01 00:00:33,2016-05-01 00:09:05,445,40.727408,-73.98142,537,40.740259,-73.984092,20592,1,1991.0,1


In [3]:
# Create a dataframe that consists only of rows where 'start station id' is 422 (W 59 St & 10 Ave)
station422_df = allmay_cleaned_df[allmay_cleaned_df['start station id'] == 422]
station422_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1152,1101,2016-05-01 07:23:39,2016-05-01 07:42:00,422,40.770513,-73.988038,334,40.742388,-73.997262,18764,1,1985.0,2
4558,177,2016-05-01 14:59:23,2016-05-01 15:02:20,422,40.770513,-73.988038,3159,40.774925,-73.982666,16841,1,1988.0,2
5209,550,2016-05-01 15:38:33,2016-05-01 15:47:44,422,40.770513,-73.988038,448,40.756604,-73.997901,24161,1,1996.0,1
5218,334,2016-05-01 15:38:51,2016-05-01 15:44:25,422,40.770513,-73.988038,3175,40.777480,-73.982886,20773,1,1983.0,1
5621,537,2016-05-01 16:04:26,2016-05-01 16:13:24,422,40.770513,-73.988038,529,40.757570,-73.990985,19309,1,1980.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6130980,345,2019-05-31 21:51:37.882,2019-05-31 21:57:22.999,422,40.770513,-73.988038,3699,40.763605,-73.989180,28521,1,1979.0,1
6131190,1723,2019-05-31 21:57:15.160,2019-05-31 22:25:58.269,422,40.770513,-73.988038,3244,40.731437,-73.994903,28850,1,1996.0,1
6133420,1393,2019-05-31 22:52:23.707,2019-05-31 23:15:37.242,422,40.770513,-73.988038,412,40.715815,-73.994224,33845,1,1984.0,1
6135069,241,2019-05-31 23:40:29.694,2019-05-31 23:44:31.144,422,40.770513,-73.988038,3175,40.777480,-73.982886,26925,1,1975.0,1


In [4]:
# Check column data types
station422_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station latitude     float64
start station longitude    float64
end station id               int64
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                     int64
birth year                 float64
gender                       int64
dtype: object

In [5]:
# Additional preprocessing

In [6]:
#  Convert 'starttime' column to type datetime from object
station422_df['starttime'] = pd.to_datetime(station422_df['starttime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
# Split 'starttime' column into 'year', 'month', 'day', hour' and 'minute' columns
station422_df['year'] = station422_df['starttime'].dt.year
station422_df['month'] = station422_df['starttime'].dt.month
station422_df['day'] = station422_df['starttime'].dt.day
station422_df['hour'] = station422_df['starttime'].dt.hour
station422_df['minute'] = station422_df['starttime'].dt.minute

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [8]:
# Drop the 'starttime' and 'stoptime' columns
station422_df.drop(columns=['starttime'], inplace=True)
station422_df.drop(columns=["stoptime"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
# convert 'birth year' column to int from float
station422_df['birth year'] = station422_df['birth year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
station422_df

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender,year,month,day,hour,minute
1152,1101,422,40.770513,-73.988038,334,40.742388,-73.997262,18764,1,1985,2,2016,5,1,7,23
4558,177,422,40.770513,-73.988038,3159,40.774925,-73.982666,16841,1,1988,2,2016,5,1,14,59
5209,550,422,40.770513,-73.988038,448,40.756604,-73.997901,24161,1,1996,1,2016,5,1,15,38
5218,334,422,40.770513,-73.988038,3175,40.777480,-73.982886,20773,1,1983,1,2016,5,1,15,38
5621,537,422,40.770513,-73.988038,529,40.757570,-73.990985,19309,1,1980,1,2016,5,1,16,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6130980,345,422,40.770513,-73.988038,3699,40.763605,-73.989180,28521,1,1979,1,2019,5,31,21,51
6131190,1723,422,40.770513,-73.988038,3244,40.731437,-73.994903,28850,1,1996,1,2019,5,31,21,57
6133420,1393,422,40.770513,-73.988038,412,40.715815,-73.994224,33845,1,1984,1,2019,5,31,22,52
6135069,241,422,40.770513,-73.988038,3175,40.777480,-73.982886,26925,1,1975,1,2019,5,31,23,40


In [11]:
station422_df.nunique()

tripduration               2384
start station id              1
start station latitude        1
start station longitude       1
end station id              461
end station latitude        467
end station longitude       467
bikeid                     7128
usertype                      2
birth year                   63
gender                        3
year                          4
month                         1
day                          31
hour                         24
minute                       60
dtype: int64

In [12]:
# Drop columns that contain only 1 unique value (start station columns and month column - all data is from May)
dropCols = ['start station id', 'start station latitude', 'start station longitude', 'month']
station422_df.drop(columns=dropCols, inplace=True)
station422_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,tripduration,end station id,end station latitude,end station longitude,bikeid,usertype,birth year,gender,year,day,hour,minute
1152,1101,334,40.742388,-73.997262,18764,1,1985,2,2016,1,7,23
4558,177,3159,40.774925,-73.982666,16841,1,1988,2,2016,1,14,59
5209,550,448,40.756604,-73.997901,24161,1,1996,1,2016,1,15,38
5218,334,3175,40.77748,-73.982886,20773,1,1983,1,2016,1,15,38
5621,537,529,40.75757,-73.990985,19309,1,1980,1,2016,1,16,4


In [13]:
###########################
# Machine learning
###########################

In [14]:
# Remove target 'end station id' from features data
y = station422_df['end station id']
X = station422_df.drop(columns=["end station id"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.905


In [19]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[11  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  9  0]
 [ 0  0  0 ...  0  0  0]]


In [20]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          72       1.00      1.00      1.00        11
          79       0.00      0.00      0.00         2
          82       0.00      0.00      0.00         1
         116       1.00      1.00      1.00         2
         127       0.71      1.00      0.83         5
         128       0.00      0.00      0.00         1
         146       0.00      0.00      0.00         1
         147       1.00      1.00      1.00         2
         150       0.00      0.00      0.00         1
         151       1.00      0.60      0.75         5
         152       0.00      0.00      0.00         1
         153       0.67      1.00      0.80         2
         157       0.00      0.00      0.00         1
         161       0.00      0.00      0.00         3
         164       1.00      0.80      0.89         5
         167       1.00      1.00      1.00         4
         168       0.62      1.00      0.77         5
         173       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
79/79 - 0s - loss: -2.6879e+09 - accuracy: 0.0000e+00
Loss: -2687856640.0, Accuracy: 0.0


In [22]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.0992776 , 0.28883551, 0.29035942, 0.05351839, 0.00407835,
       0.06744983, 0.01479716, 0.02757777, 0.04684624, 0.05522047,
       0.05203925])

In [23]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2903594177226808, 'end station longitude'),
 (0.2888355085434247, 'end station latitude'),
 (0.09927760238151812, 'tripduration'),
 (0.06744982876901728, 'birth year'),
 (0.05522046545303789, 'hour'),
 (0.05351839388416772, 'bikeid'),
 (0.05203925485500585, 'minute'),
 (0.04684624106461672, 'day'),
 (0.027577774487545177, 'year'),
 (0.014797160461789752, 'gender'),
 (0.004078352377195877, 'usertype')]

In [24]:
############
#  Logistic Regression
############

In [25]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.044


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
# Prepare confusion matrix for logistic regression
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
# Prepare classificaiton report for logistic regression
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          72       0.00      0.00      0.00        11
          79       0.00      0.00      0.00         2
          82       0.00      0.00      0.00         1
         116       0.00      0.00      0.00         2
         127       0.00      0.00      0.00         5
         128       0.00      0.00      0.00         1
         146       0.00      0.00      0.00         1
         147       0.00      0.00      0.00         2
         150       0.00      0.00      0.00         1
         151       0.00      0.00      0.00         5
         152       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         2
         157       0.00      0.00      0.00         1
         161       0.00      0.00      0.00         3
         164       0.00      0.00      0.00         5
         167       0.00      0.00      0.00         4
         168       0.00      0.00      0.00         5
         173       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
