## Import libraries and read dataset

In [25]:
import pandas as pd
import numpy as np
import pprint

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt
import requests
import pickle

In [26]:
dataset = pd.read_csv('Dublin Bikes Training Data.csv', parse_dates=['TIME'])
# dataset.drop(['STATION ID.1', 'LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
# dataset.to_csv('Dublin Bikes Training Data.csv')
dataset

Unnamed: 0.1,Unnamed: 0,STATION ID,TIME,BIKE STANDS,AVAILABLE BIKE STANDS,AVAILABLE BIKES,rain,temp
0,0,1,2022-04-27 13:00:00,31.0,27.0,0.0,0.0,11.5
1,1,1,2022-04-27 13:30:00,31.0,27.0,0.0,0.0,11.5
2,2,1,2022-04-27 14:00:00,31.0,27.0,0.0,0.0,12.1
3,3,1,2022-04-27 14:30:00,31.0,0.0,0.0,0.0,12.1
4,4,1,2022-04-27 15:00:00,31.0,0.0,0.0,0.0,10.8
...,...,...,...,...,...,...,...,...
2845137,2845137,507,2023-05-12 21:30:00,1.0,0.0,1.0,0.0,9.9
2845138,2845138,507,2023-05-12 22:00:00,1.0,0.0,1.0,0.0,8.9
2845139,2845139,507,2023-05-12 22:30:00,1.0,0.0,1.0,0.0,8.9
2845140,2845140,507,2023-05-12 23:00:00,1.0,0.0,1.0,0.0,8.2


In [42]:
# df = df[df['TIME']>pd.to_datetime('2022-11-01 00:00:00')]
# df = dataset[dataset['TIME']<pd.to_datetime('2023-11-01 00:00:00')]

df = df[df['TIME']>pd.to_datetime('2023-03-01 00:00:00')]
df = dataset[dataset['TIME']<pd.to_datetime('2023-05-01 00:00:00')]

## Data preparation

In [43]:
dataset.min()

Unnamed: 0                                 0
STATION ID                                 1
TIME                     2022-03-02 05:00:00
BIKE STANDS                              1.0
AVAILABLE BIKE STANDS                    0.0
AVAILABLE BIKES                          0.0
rain                                     0.0
temp                                    -3.8
dtype: object

In [44]:
df.max()

Unnamed: 0                           2844808
STATION ID                               507
TIME                     2023-04-30 23:30:00
BIKE STANDS                             40.0
AVAILABLE BIKE STANDS                   40.0
AVAILABLE BIKES                         40.0
rain                                     9.8
temp                                    32.5
dtype: object

In [45]:
df.min()

Unnamed: 0                                 0
STATION ID                                 1
TIME                     2022-03-02 05:00:00
BIKE STANDS                              1.0
AVAILABLE BIKE STANDS                    0.0
AVAILABLE BIKES                          0.0
rain                                     0.0
temp                                    -3.8
dtype: object

In [46]:
df = dataset.iloc[:]
df['hour'] = df['TIME'].dt.hour
df['minute'] = df['TIME'].dt.minute
# df['month'] = df['TIME'].dt.month
df['day'] = df['TIME'].dt.day
df['dayofweek'] = df['TIME'].dt.dayofweek

df['occupancy ratio'] = df['AVAILABLE BIKES'] / df['BIKE STANDS'] * 100
df['stand availability ratio'] = df['AVAILABLE BIKE STANDS'] / df['BIKE STANDS'] * 100
df['bike usage'] = df['AVAILABLE BIKE STANDS'] - df['AVAILABLE BIKE STANDS'].shift(1)

df.dropna(inplace=True)
df = df[df['STATION ID']!=507]

In [47]:
df

Unnamed: 0.1,Unnamed: 0,STATION ID,TIME,BIKE STANDS,AVAILABLE BIKE STANDS,AVAILABLE BIKES,rain,temp,hour,minute,day,dayofweek,occupancy ratio,stand availability ratio,bike usage
1,1,1,2022-04-27 13:30:00,31.0,27.0,0.0,0.0,11.5,13,30,27,2,0.0,87.096774,0.0
2,2,1,2022-04-27 14:00:00,31.0,27.0,0.0,0.0,12.1,14,0,27,2,0.0,87.096774,0.0
3,3,1,2022-04-27 14:30:00,31.0,0.0,0.0,0.0,12.1,14,30,27,2,0.0,0.000000,-27.0
4,4,1,2022-04-27 15:00:00,31.0,0.0,0.0,0.0,10.8,15,0,27,2,0.0,0.000000,0.0
5,5,1,2022-04-27 15:30:00,31.0,0.0,0.0,0.0,10.8,15,30,27,2,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843047,2843047,117,2023-12-25 21:30:00,40.0,17.0,23.0,0.0,-0.4,21,30,25,0,57.5,42.500000,0.0
2843048,2843048,117,2023-12-25 22:00:00,40.0,17.0,23.0,0.0,-0.4,22,0,25,0,57.5,42.500000,0.0
2843049,2843049,117,2023-12-25 22:30:00,40.0,17.0,23.0,0.0,-0.4,22,30,25,0,57.5,42.500000,0.0
2843050,2843050,117,2023-12-25 23:00:00,40.0,17.0,23.0,0.0,-0.4,23,0,25,0,57.5,42.500000,0.0


In [48]:
X_cols = ['STATION ID', 'BIKE STANDS', 'rain', 'temp', 'hour', 'minute', 'day', 'dayofweek']
y_target = ['AVAILABLE BIKES']

train = df[X_cols]
target = df[y_target]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(train, target, test_size=0.3)
Xtrain

Unnamed: 0,STATION ID,BIKE STANDS,rain,temp,hour,minute,day,dayofweek
2003913,84,30.0,0.5,6.8,10,30,24,0
2429500,101,30.0,0.0,7.5,13,0,5,6
1733776,72,31.0,0.0,16.8,8,0,27,5
1568496,65,40.0,0.0,15.7,12,0,11,2
2359570,98,40.0,0.0,19.2,17,0,28,4
...,...,...,...,...,...,...,...,...
899657,37,30.0,0.0,9.9,21,0,2,3
1543246,64,40.0,0.0,18.8,17,0,8,6
2564696,106,40.0,0.0,6.2,22,0,9,3
2154467,90,40.0,1.6,8.4,20,30,15,5


In [49]:
Ytrain.columns

Index(['AVAILABLE BIKES'], dtype='object')

## STATION ID - DUBLIN BIKES and Station ID-Position Dataset preparation

In [35]:
df_stations = pd.DataFrame(df['STATION ID'].unique(), columns=['STATION ID'])
df_query = df_stations.merge(df[['STATION ID', 'BIKE STANDS']], on='STATION ID', how='left')
df_query.drop_duplicates(inplace=True)
df_query.index = df_query['STATION ID']
df_query = df_query[['BIKE STANDS']]
df_query.to_csv('STATION ID - BIKE STANDS.csv')

In [41]:
df = pd.read_csv('postpandemic.csv', parse_dates=['TIME'])
df_stations = pd.DataFrame(df['STATION ID'].unique(), columns=['STATION ID'])

df_query = df_stations.merge(df[['STATION ID', 'LATITUDE', 'LONGITUDE']], on='STATION ID', how='left')
df_query.drop_duplicates(inplace=True)
df_query.index = df_query['STATION ID']
df_query.to_csv('STATION ID - POSITION.csv', index=False)
df_query

Unnamed: 0_level_0,STATION ID,LATITUDE,LONGITUDE
STATION ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,53.3409,-6.26250
2,2,53.3568,-6.26814
3,3,53.3512,-6.26986
4,4,53.3469,-6.27298
5,5,53.3307,-6.26018
...,...,...,...
114,114,53.3337,-6.24834
115,115,53.3548,-6.24758
116,116,53.3547,-6.27231
117,117,53.3437,-6.23175


# MACHINE LEARNING

In [50]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [51]:
lr = LinearRegression()
lr = lr.fit(Xtrain,Ytrain)
# score = cross_val_score(clf,Xtest,Ytest,cv=10).mean()
score = lr.score(Xtest,Ytest)
score

0.043130718205979135

In [53]:
test_prediction = lr.predict(Xtest)
printMetrics(Ytest, test_prediction)


MAE:  7.7006949559997055
RMSE:  9.306200678416017
R2:  0.043130718205979135


In [54]:
rfc = RandomForestRegressor(random_state=0)
rfc = rfc.fit(Xtrain,Ytrain['AVAILABLE BIKES'])
score = rfc.score(Xtest,Ytest['AVAILABLE BIKES'])
# score = cross_val_score(rfc,Xtest,Ytest,cv=10)
score

0.8036887011289641

### Save the Model

In [55]:
import pickle

with open("RFmodel_v2.pkl", "wb") as f:
    pickle.dump(rfc, f)

# USING THE MODEL

In [56]:
APIKEY_OPENWEATHER = 'eef810c9a22776cce17d0de14d316137'
APIKEY_METEOSOURCE = 'wkz9f0gm7xust1d45patrd9uqugwm2qjrtctorxx'

In [57]:
def get_weather_forecast():
    parameters = {'key': APIKEY_METEOSOURCE,
                  'place_id': 'dublin'}
    url = "https://www.meteosource.com/api/v1/free/point"
    data = requests.get(url, parameters).json()
    return data

weather_forecast = get_weather_forecast()

In [None]:
def prepare_query_for_model(weather_forecast=None):
    if(weather_forecast == None):
        weather_forecast = get_weather_forecast()
        
    res = pd.DataFrame.from_dict(pd.json_normalize(weather_forecast['hourly']['data']), orient='columns')
    res['date'] = pd.to_datetime(res['date'])
    res = res[['date', 'precipitation.total', 'temperature']]
    res.rename(columns={'precipitation.total': 'rain', 'temperature': 'temp', 'date': 'TIME'}, inplace=True)
    
    res['hour'] = res['TIME'].dt.hour
    res['minute'] = res['TIME'].dt.minute
    # res['month'] = res['TIME'].dt.month
    res['day'] = res['TIME'].dt.day
    res['dayofweek'] = res['TIME'].dt.dayofweek

    res.drop('TIME', axis=1, inplace=True)

    df_stations = pd.read_csv('STATION ID - BIKE STANDS.csv')
    res['dummy'] = 1
    df_stations['dummy'] = 1
    res = pd.merge(df_stations, res, on='dummy')
    res = res.drop(columns='dummy')
    res = res.reset_index(drop=True)
    
    return res

df_query = prepare_query_for_model(weather_forecast)
df_query

In [58]:
with open("RFmodel_v2.pkl", "rb") as f:
    model = pickle.load(f)

In [59]:
def get_predictions(df_query):
    predictions = model.predict(df_query)
    return predictions

predictions = get_predictions(df_query)
predictions

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- LATITUDE
- LONGITUDE
Feature names seen at fit time, yet now missing:
- BIKE STANDS
- day
- dayofweek
- hour
- minute
- ...
