In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [2]:
import os
from google.colab import drive

In [6]:
drive.mount('/content/drive')
pwd = "/content/drive/My Drive/IE434_ProjectGroup7/Data_Used"
os.chdir(pwd)
rides_data=pd.read_pickle('Merged_Data.pkl')
rides_data['started_at']=pd.to_datetime(rides_data['started_at'])
rides_data['ended_at']=pd.to_datetime(rides_data['ended_at'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
rides_data.sort_values(by=['started_at'],inplace=True)

In [8]:
rides_data['member_casual'] = np.where((rides_data['member_casual'] == 'casual') | (rides_data['member_casual'] == 'Customer'), 'casual', rides_data['member_casual'])
rides_data['member_casual'] = np.where((rides_data['member_casual'] == 'member') | (rides_data['member_casual'] == 'Subscriber'), 'member', rides_data['member_casual'])

In [9]:
rides_data=rides_data.drop(columns=['member_casual','distance','duration','DATE', 'TEMP', 'VISIB', 'WDSP', 'MAX', 'MIN', 'PRCP','start_date', 'start_time', 'end_date',
       'end_time'])

In [10]:
rides_data.columns

Index(['ride_id', 'started_at', 'ended_at', 'start_station_name',
       'start_station_id', 'end_station_name', 'end_station_id', 'start_lat',
       'start_lng', 'end_lat', 'end_lng'],
      dtype='object')

## Preparing features from the raw data.

In [11]:
rides_data.reset_index(drop=True)

Unnamed: 0,ride_id,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng
0,24575,2017-01-01 00:21:32,2017-01-01 00:24:01,Marin Light Rail,3276,City Hall,3185,40.714584,-74.042817,40.717732,-74.043845
1,24723,2017-01-01 00:24:35,2017-01-01 00:45:58,Exchange Place,3183,Heights Elevator,3198,40.716247,-74.033459,40.748716,-74.040443
2,24620,2017-01-01 00:38:19,2017-01-01 00:44:31,Exchange Place,3183,Newark Ave,3211,40.716247,-74.033459,40.721525,-74.046305
3,24668,2017-01-01 00:38:37,2017-01-01 01:03:50,McGinley Square,3194,Danforth Light Rail,3271,40.725340,-74.067622,40.692640,-74.088012
4,26167,2017-01-01 01:47:52,2017-01-01 01:58:31,Exchange Place,3183,Hamilton Park,3203,40.716247,-74.033459,40.727596,-74.044247
...,...,...,...,...,...,...,...,...,...,...,...
3677402,D0823AA2F27FC42D,2023-09-30 23:54:48,2023-10-01 00:04:52,Hilltop,JC019,Van Vorst Park,JC035,40.731169,-74.057574,40.718489,-74.047727
3677403,04A29B3CFE030CA9,2023-09-30 23:55:43,2023-10-01 00:01:29,Newport Pkwy,JC008,Harborside,JC104,40.728745,-74.032108,40.719252,-74.034234
3677404,79D537CBF0241E71,2023-09-30 23:57:19,2023-10-01 00:05:02,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.738790,-74.039300
3677405,ACD5D53702C00310,2023-09-30 23:58:01,2023-10-01 00:05:16,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hoboken Ave at Monmouth St,JC105,40.735938,-74.030305,40.735208,-74.046964


In [12]:
rides_data['end_station_name'].isna().value_counts()

False    3677407
Name: end_station_name, dtype: int64

In [None]:
import pandas as pd

rides_data['started_at'] = pd.to_datetime(rides_data['started_at'])
rides_data['ended_at'] = pd.to_datetime(rides_data['ended_at'])

rides_data['day_of_week'] = rides_data['started_at'].dt.dayofweek
rides_data['year'] = rides_data['started_at'].dt.year
rides_data['hour_of_day'] = rides_data['started_at'].dt.hour
rides_data['month'] = rides_data['started_at'].dt.month

rides_data['date'] = rides_data['started_at'].dt.date


aggregated_data = pd.DataFrame()


outgoing_rides = rides_data.groupby(['start_station_name', 'date','year', 'month','day_of_week', 'hour_of_day']).size().reset_index(name='total_rides_out')
incoming_rides = rides_data.groupby(['end_station_name', 'date','year',  'month','day_of_week', 'hour_of_day']).size().reset_index(name='total_rides_in')


outgoing_rides.rename(columns={'start_station_name':'Station'},inplace=True)
incoming_rides.rename(columns={'end_station_name':'Station'},inplace=True)

aggregated_data = pd.merge(outgoing_rides, incoming_rides, left_on=['Station', 'date','year', 'month','day_of_week', 'hour_of_day'],right_on=['Station', 'date','year', 'month', 'day_of_week', 'hour_of_day'], how='outer')

aggregated_data['total_rides_out'] = aggregated_data['total_rides_out'].fillna(0)
aggregated_data['total_rides_in'] = aggregated_data['total_rides_in'].fillna(0)


aggregated_data['Bike_demand'] = aggregated_data['total_rides_out'] - aggregated_data['total_rides_in']


aggregated_data = aggregated_data[['Station', 'date','year',  'month','day_of_week', 'hour_of_day', 'total_rides_out', 'total_rides_in', 'Bike_demand']]

aggregated_data


In [None]:
aggregated_data.sort_values(by=['date','hour_of_day'])

In [None]:
hot_encoded_stations = pd.get_dummies(aggregated_data['Station'])
aggregated_data = pd.concat([aggregated_data, hot_encoded_stations], axis=1)
aggregated_data

In [None]:
Y_out=aggregated_data[['date','year','total_rides_out']]
Y_in=aggregated_data[['date','year','total_rides_in']]
X=aggregated_data.drop(columns=['total_rides_out','total_rides_in','Station','Bike_demand'])

In [None]:
X_test=X[X['year']>2022]
X_train=X[(X['year']<=2022)&(X['year']>2021)]

In [None]:
y_test_out=Y_out[Y_out['year']>2022]
y_test_in=Y_in[Y_in['year']>2022]
y_train_out=Y_out[(Y_out['year']<=2022)&(Y_out['year']>2021)]
y_train_in=Y_in[(Y_in['year']<=2022)&(Y_in['year']>2021)]

In [None]:
X_train.drop(columns=['date'],inplace=True)
y_train_out.drop(columns=['date','year'],inplace=True)
y_train_in.drop(columns=['date','year'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['date'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_out.drop(columns=['date','year'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_in.drop(columns=['date','year'],inplace=True)


In [None]:
X_test.drop(columns=['date'],inplace=True)
y_test_out.drop(columns=['date','year'],inplace=True)
y_test_in.drop(columns=['date','year'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=['date'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test_out.drop(columns=['date','year'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test_in.drop(columns=['date','year'],inplace=True)


In [None]:
X_train

Unnamed: 0,year,month,day_of_week,hour_of_day,1 Ave & E 16 St,1 Ave & E 30 St,1 Ave & E 5 St,1 Ave & E 6 St,1 Ave & E 62 St,1 Ave & E 68 St,...,Whitehall St & Bridge St,William St & Pine St,Willoughby Ave & Hall St,Willoughby Ave & Tompkins Ave,Willow Ave & 12 St,Wilson Ave & Moffat St,Withers St & Kingsland Ave,Wythe Ave & Metropolitan Ave,York St,York St & Marin Blvd
3908,2022,1,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3909,2022,1,5,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3910,2022,1,5,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3911,2022,1,5,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3912,2022,1,5,15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1871487,2022,12,4,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1871488,2022,12,4,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1871489,2022,12,4,19,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1871490,2022,12,5,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_in_tensor = torch.tensor(y_train_in.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_in_tensor = torch.tensor(y_test_in.values, dtype=torch.float32)


In [None]:

train_dataset = TensorDataset(X_train_tensor, y_train_in_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


## Implementing a simple feed forward network as the baseline model.

### The objective is to find the count of outgoing and incoming rides at all stations for every 1 hour timeslot.

In [None]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [None]:

model = SimpleNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
import torch
import matplotlib.pyplot as plt


train_losses = []
val_losses = []


epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())


    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_in_tensor)
        val_losses.append(val_loss.item())

y_pred_in = val_outputs.detach().numpy()



In [None]:
y_train_out_tensor = torch.tensor(y_train_out.values, dtype=torch.float32)

y_test_out_tensor = torch.tensor(y_test_out.values, dtype=torch.float32)


In [None]:
train_dataset = TensorDataset(X_train_tensor, y_train_out_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [None]:
model = SimpleNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
import torch
import matplotlib.pyplot as plt


train_losses_out = []
val_losses_out = []


epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_losses_out.append(loss.item())


    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_in_tensor)
        val_losses_out.append(val_loss.item())

y_pred_out = val_outputs.detach().numpy()

## Calculation of Loss and Metric of the Baseline Model

In [None]:
mse_in = mean_squared_error(y_test_in_tensor.numpy(), y_pred_in, squared=False)
mae_in = mean_absolute_error(y_test_in_tensor.numpy(), y_pred_in)
mse_out = mean_squared_error(y_test_out_tensor.numpy(), y_pred_out, squared=False)
mae_out = mean_absolute_error(y_test_out_tensor.numpy(), y_pred_out)
print(f"RMSE (Incoming Rides): {mse_in:.2f} rides")
print(f"MAE (Incoming Rides): {mae_in:.2f} rides\n")
print(f"RMSE (Outgoming Rides): {mse_out:.2f} rides")
print(f"MAE (Outgoming Rides): {mae_out:.2f} rides")