## Import libraries and read dataset

In [9]:
import pandas as pd
import numpy as np
import pprint

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt
import requests
import pickle

In [2]:
dataset = pd.read_csv('postpandemic.csv', parse_dates=['TIME'])
dataset

Unnamed: 0,STATION ID,TIME,STATION ID.1,BIKE STANDS,AVAILABLE BIKE STANDS,AVAILABLE BIKES,LATITUDE,LONGITUDE,rain,temp
0,1,2022-04-27 13:00:00,1.0,31.0,27.0,0.0,53.3409,-6.26250,0.0,11.5
1,1,2022-04-27 13:30:00,1.0,31.0,27.0,0.0,53.3409,-6.26250,0.0,11.5
2,1,2022-04-27 14:00:00,1.0,31.0,27.0,0.0,53.3409,-6.26250,0.0,12.1
3,1,2022-04-27 14:30:00,1.0,31.0,0.0,0.0,53.3409,-6.26250,0.0,12.1
4,1,2022-04-27 15:00:00,1.0,31.0,0.0,0.0,53.3409,-6.26250,0.0,10.8
...,...,...,...,...,...,...,...,...,...,...
2845137,507,2023-05-12 21:30:00,507.0,1.0,0.0,1.0,53.3546,-6.24262,0.0,9.9
2845138,507,2023-05-12 22:00:00,507.0,1.0,0.0,1.0,53.3546,-6.24262,0.0,8.9
2845139,507,2023-05-12 22:30:00,507.0,1.0,0.0,1.0,53.3546,-6.24262,0.0,8.9
2845140,507,2023-05-12 23:00:00,507.0,1.0,0.0,1.0,53.3546,-6.24262,0.0,8.2


In [15]:
df = dataset[dataset['TIME']<pd.to_datetime('2023-11-01 00:00:00')]
df = df[df['TIME']>pd.to_datetime('2022-11-01 00:00:00')]

## Data preparation

In [3]:
dataset.min()

STATION ID                                 1
TIME                     2022-03-02 05:00:00
STATION ID.1                             1.0
BIKE STANDS                              1.0
AVAILABLE BIKE STANDS                    0.0
AVAILABLE BIKES                          0.0
LATITUDE                             53.3301
LONGITUDE                           -6.31002
rain                                     0.0
temp                                    -3.8
dtype: object

In [5]:
df.max()

STATION ID                               507
TIME                     2023-10-31 23:30:00
STATION ID.1                           507.0
BIKE STANDS                             40.0
AVAILABLE BIKE STANDS                   40.0
AVAILABLE BIKES                         40.0
LATITUDE                               53.36
LONGITUDE                           -6.23085
rain                                    12.0
temp                                    26.5
dtype: object

In [6]:
df.min()

STATION ID                                 1
TIME                     2022-11-01 05:00:00
STATION ID.1                             1.0
BIKE STANDS                              1.0
AVAILABLE BIKE STANDS                    0.0
AVAILABLE BIKES                          0.0
LATITUDE                             53.3301
LONGITUDE                           -6.31002
rain                                     0.0
temp                                    -3.8
dtype: object

In [7]:
df = dataset.iloc[:]
df['hour'] = df['TIME'].dt.hour
df['minute'] = df['TIME'].dt.minute
df['month'] = df['TIME'].dt.month
df['day'] = df['TIME'].dt.day
df['dayofweek'] = df['TIME'].dt.dayofweek

df['occupancy ratio'] = df['AVAILABLE BIKES'] / df['BIKE STANDS'] * 100
df['stand availability ratio'] = df['AVAILABLE BIKE STANDS'] / df['BIKE STANDS'] * 100
df['bike usage'] = df['AVAILABLE BIKE STANDS'] - df['AVAILABLE BIKE STANDS'].shift(1)

df.drop([ 'STATION ID.1', 'LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
df.dropna(inplace=True)
df = df[df['STATION ID']!=507]

In [8]:
df

Unnamed: 0,STATION ID,TIME,BIKE STANDS,AVAILABLE BIKE STANDS,AVAILABLE BIKES,rain,temp,hour,minute,month,day,dayofweek,occupancy ratio,stand availability ratio,bike usage
1,1,2022-04-27 13:30:00,31.0,27.0,0.0,0.0,11.5,13,30,4,27,2,0.0,87.096774,0.0
2,1,2022-04-27 14:00:00,31.0,27.0,0.0,0.0,12.1,14,0,4,27,2,0.0,87.096774,0.0
3,1,2022-04-27 14:30:00,31.0,0.0,0.0,0.0,12.1,14,30,4,27,2,0.0,0.000000,-27.0
4,1,2022-04-27 15:00:00,31.0,0.0,0.0,0.0,10.8,15,0,4,27,2,0.0,0.000000,0.0
5,1,2022-04-27 15:30:00,31.0,0.0,0.0,0.0,10.8,15,30,4,27,2,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843047,117,2023-12-25 21:30:00,40.0,17.0,23.0,0.0,-0.4,21,30,12,25,0,57.5,42.500000,0.0
2843048,117,2023-12-25 22:00:00,40.0,17.0,23.0,0.0,-0.4,22,0,12,25,0,57.5,42.500000,0.0
2843049,117,2023-12-25 22:30:00,40.0,17.0,23.0,0.0,-0.4,22,30,12,25,0,57.5,42.500000,0.0
2843050,117,2023-12-25 23:00:00,40.0,17.0,23.0,0.0,-0.4,23,0,12,25,0,57.5,42.500000,0.0


In [9]:
X_cols = ['STATION ID', 'BIKE STANDS', 'rain', 'temp', 'hour', 'minute', 'month', 'day', 'dayofweek']
y_target = ['AVAILABLE BIKES']

train = df[X_cols]
target = df[y_target]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(train, target, test_size=0.3)
Xtrain

Unnamed: 0,STATION ID,BIKE STANDS,rain,temp,hour,minute,month,day,dayofweek
244164,10,16.0,0.0,16.7,6,0,8,12,5
2516128,104,40.0,0.0,-0.4,11,0,12,25,0
2559062,106,40.0,0.0,21.2,15,0,6,14,2
1158491,48,40.0,0.0,15.1,8,30,8,16,2
550141,23,30.0,0.0,12.0,6,30,8,28,6
...,...,...,...,...,...,...,...,...,...
1407807,58,40.0,0.0,17.7,19,0,6,20,1
1694773,71,40.0,0.0,14.2,19,30,5,25,2
1337258,55,36.0,0.0,14.1,14,30,10,27,4
1940444,82,22.0,0.0,11.3,22,30,5,10,1


In [10]:
Ytrain.columns

Index(['AVAILABLE BIKES'], dtype='object')

## STATION ID _ DUBLIN BIKES Dataset preparation

In [None]:
df_stations = pd.DataFrame(df['STATION ID'].unique(), columns=['STATION ID'])
df_query = df_stations.merge(df[['STATION ID', 'BIKE STANDS']], on='STATION ID', how='left')
df_query.drop_duplicates(inplace=True)
df_query.index = df_query['STATION ID']
df_query = df_query[['BIKE STANDS']]
df_query.drop(507, inplace=True)
df_query.to_csv('STATION ID - BIKE STANDS.csv')

# MACHINE LEARNING

In [11]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('\n==============================================================================')
    print("MAE: ", metrics.mean_absolute_error(testActualVal, predictions))
    #print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE: ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [12]:
clf = LinearRegression()
clf = clf.fit(Xtrain,Ytrain)
# score = cross_val_score(clf,Xtest,Ytest,cv=10).mean()
score = clf.score(Xtest,Ytest)
score

0.04348107665507339

In [13]:
test_prediction = clf.predict(Xtest)
printMetrics(Ytest, test_prediction)


MAE:  7.691593688399934
RMSE:  9.295670796320383
R2:  0.04348107665507339


In [14]:
rfc = RandomForestRegressor(random_state=0)
rfc = rfc.fit(Xtrain,Ytrain['AVAILABLE BIKES'])
score = rfc.score(Xtest,Ytest['AVAILABLE BIKES'])
# score = cross_val_score(rfc,Xtest,Ytest,cv=10)
score

0.8731824161065153

### Save the Model

In [16]:
import pickle

with open("RFmodel.pkl", "wb") as f:
    pickle.dump(rfc, f)

# USING THE MODEL

In [16]:
APIKEY_OPENWEATHER = 'eef810c9a22776cce17d0de14d316137'
APIKEY_METEOSOURCE = 'wkz9f0gm7xust1d45patrd9uqugwm2qjrtctorxx'

In [17]:
def get_weather_forecast():
    parameters = {'key': APIKEY_METEOSOURCE,
                  'place_id': 'dublin'}
    url = "https://www.meteosource.com/api/v1/free/point"
    data = requests.get(url, parameters).json()
    return data

weather_forecast = get_weather_forecast()

In [26]:
def prepare_query_for_model(weather_forecast=None):
    if(weather_forecast == None):
        weather_forecast = get_weather_forecast()
        
    res = pd.DataFrame.from_dict(pd.json_normalize(weather_forecast['hourly']['data']), orient='columns')
    res['date'] = pd.to_datetime(res['date'])
    res = res[['date', 'precipitation.total', 'temperature']]
    res.rename(columns={'precipitation.total': 'rain', 'temperature': 'temp', 'date': 'TIME'}, inplace=True)
    
    res['hour'] = res['TIME'].dt.hour
    res['minute'] = res['TIME'].dt.minute
    res['month'] = res['TIME'].dt.month
    res['day'] = res['TIME'].dt.day
    res['dayofweek'] = res['TIME'].dt.dayofweek

    res.drop('TIME', axis=1, inplace=True)

    df_stations = pd.read_csv('STATION ID - BIKE STANDS.csv')
    res['dummy'] = 1
    df_stations['dummy'] = 1
    res = pd.merge(df_stations, res, on='dummy')
    res = res.drop(columns='dummy')
    res = res.reset_index(drop=True)
    
    return res

df_query = prepare_query_for_model(weather_forecast)
df_query

Unnamed: 0,STATION ID,BIKE STANDS,rain,temp,hour,minute,month,day,dayofweek
0,1,31.0,0.0,5.5,14,0,2,8,3
1,1,31.0,0.0,5.8,15,0,2,8,3
2,1,31.0,0.0,6.0,16,0,2,8,3
3,1,31.0,0.0,6.2,17,0,2,8,3
4,1,31.0,0.0,6.8,18,0,2,8,3
...,...,...,...,...,...,...,...,...,...
2731,117,40.0,0.0,5.8,9,0,2,9,4
2732,117,40.0,0.0,5.8,10,0,2,9,4
2733,117,40.0,0.0,5.8,11,0,2,9,4
2734,117,40.0,0.0,6.0,12,0,2,9,4


In [23]:
with open("RFmodel.pkl", "rb") as f:
    model = pickle.load(f)

In [33]:
def get_predictions(df_query):
    predictions = model.predict(df_query)
    return predictions

predictions = get_predictions(df_query)
predictions

array([12.69, 10.63, 11.61, ..., 12.62, 12.28, 11.37])