# Machine Learning Model for Dublin Bikes application

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression ## maybe logisitic regression??????

# Merge the two CSV files on the closest date and time for convenience

In [85]:
# Input snapshots for the bike stations and the weather. Replace these two
# file names with freshly collected CSV exports when new data is available.
df_bike = pd.read_csv(filepath_or_buffer='dynamic_4_4_23 copy.csv')
df_weather = pd.read_csv(filepath_or_buffer='weather_4_4_23 2.csv')

In [86]:
# Keep only the last space-separated token of each time value, then join it
# with the matching date column and parse the result into one timestamp per row.
df_bike['s_time'] = df_bike['s_time'].astype(str).str.split(' ').str[-1]
df_weather['w_time'] = df_weather['w_time'].astype(str).str.split(' ').str[-1]
df_bike['datetime'] = pd.to_datetime(df_bike['s_date'].astype(str) + ' ' + df_bike['s_time'])
df_weather['datetime'] = pd.to_datetime(df_weather['w_date'].astype(str) + ' ' + df_weather['w_time'])

In [87]:
# merge_asof needs both frames ordered by the join key.
df_bike = df_bike.sort_values(by='datetime')
df_weather = df_weather.sort_values(by='datetime')

# Attach to every bike row the weather row whose timestamp is closest to it,
# whether that weather reading is earlier or later.
df_main = pd.merge_asof(
    left=df_bike,
    right=df_weather,
    on='datetime',
    direction='nearest',
)

# Cleaning the dataframe

In [88]:
# List every column of the merged frame.
df_main.columns

Index(['number', 'name', 'bike_stands', 'available_bike_stands',
       'available_bikes', 'status', 's_date', 's_time', 'datetime', 'latitude',
       'longitude', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon', 'temperature', 'feels_like', 'pressure', 'humidity',
       'visibility', 'wind_speed', 'wind_direction', 'rain', 'snow', 'clouds',
       'sunrise', 'sunset', 'w_date', 'w_time'],
      dtype='object')

In [89]:
# Discard the station metadata, raw date/time parts and weather fields
# that are not kept for the rest of the notebook.
df_main = df_main.drop(
    columns=[
        'name', 'status', 's_date', 's_time', 'latitude', 'longitude',
        'weather_id', 'weather_description', 'weather_icon', 'feels_like',
        'pressure', 'visibility', 'wind_direction', 'rain', 'snow',
        'sunrise', 'sunset', 'w_date', 'w_time',
    ]
)

In [90]:
# Remove every row belonging to station 507.
# NOTE(review): the reason for excluding this station is not recorded here - confirm.
df_main = df_main.drop(index=df_main.index[df_main["number"] == 507])


In [91]:
# Show the distinct timestamps to see which date range the data covers.
df_main["datetime"].unique()

array(['2023-03-03T10:43:51.000000000', '2023-03-03T10:44:01.000000000',
       '2023-03-03T10:44:04.000000000', ...,
       '2023-04-04T08:45:33.000000000', '2023-04-04T08:45:40.000000000',
       '2023-04-04T08:45:47.000000000'], dtype='datetime64[ns]')

Keep only the rows inside the chosen date window (the filter in the next cell does this).

In [92]:
# Keep timestamps from 2023-03-04 00:00:00 up to and including 2023-04-03 00:00:00.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_main = df_main.loc[
    (df_main['datetime'] >= '2023-03-04') & (df_main['datetime'] <= '2023-04-03')
].copy()

In [93]:
# Re-check the distinct timestamps after the date-window filter.
df_main["datetime"].unique()

array(['2023-03-04T00:00:01.000000000', '2023-03-04T00:00:15.000000000',
       '2023-03-04T00:00:22.000000000', ...,
       '2023-04-02T23:59:21.000000000', '2023-04-02T23:59:38.000000000',
       '2023-04-02T23:59:55.000000000'], dtype='datetime64[ns]')

In [94]:
# Target column: available bikes divided by the station's total stands
# (stored as a ratio, e.g. 0.25, not as 25).
df_main["availability_percentage"] = df_main["available_bikes"].div(df_main["bike_stands"])
# Preview only: the rounded values are displayed, not written back to the column.
df_main["availability_percentage"].round(decimals=2)

20127      0.20
20128      0.20
20129      1.00
20130      1.00
20131      0.25
           ... 
1001188    0.21
1001189    0.21
1001190    0.21
1001191    0.43
1001192    0.43
Name: availability_percentage, Length: 981066, dtype: float64

In [95]:
# List the columns that remain after cleaning.
df_main.columns

Index(['number', 'bike_stands', 'available_bike_stands', 'available_bikes',
       'datetime', 'weather_main', 'temperature', 'humidity', 'wind_speed',
       'clouds', 'availability_percentage'],
      dtype='object')

In [96]:
# Shift the temperature down by 273.15 (presumably Kelvin -> Celsius; confirm the source unit).
# Not idempotent: re-running this cell subtracts the offset a second time.
df_main["temperature"] = df_main["temperature"].sub(273.15)

In [97]:
# Show the distinct weather categories present in the data.
df_main["weather_main"].unique()

array(['Clouds', 'Drizzle', 'Rain', 'Clear', 'Snow', 'Mist', 'Fog'],
      dtype=object)

In [98]:
# Make sure the column is a real datetime before using the .dt accessor.
df_main = df_main.assign(datetime=pd.to_datetime(df_main["datetime"]))

# Derive the calendar features from the timestamp.
# "hour" is rounded to the NEAREST hour (10:45 -> 11, 23:45 -> 0), while
# "day_of_week" is the weekday name of the unrounded timestamp.
df_main = df_main.assign(
    year=df_main["datetime"].dt.year,
    month=df_main["datetime"].dt.month,
    day=df_main["datetime"].dt.day,
    time=df_main["datetime"].dt.time,
    hour=df_main["datetime"].dt.round("H").dt.hour.astype(int),
    day_of_week=df_main["datetime"].dt.strftime('%A'),
)

# One-hot encoding for categorical columns

In [99]:
# One indicator column per distinct weather_main value, appended to the frame.
one_hot = pd.get_dummies(data=df_main["weather_main"])
df_main = pd.concat(objs=[df_main, one_hot], axis="columns")

In [100]:
# One indicator column per weekday name, appended to the frame.
one_hot = pd.get_dummies(data=df_main["day_of_week"])
df_main = pd.concat(objs=[df_main, one_hot], axis="columns")

In [101]:
# Remove the raw and intermediate columns: the already-encoded categoricals,
# the unused date parts, and the counts behind availability_percentage.
df_main = df_main.drop(
    columns=[
        'datetime', 'bike_stands', 'weather_main', 'year', 'month', 'day',
        'time', 'day_of_week', 'available_bike_stands', 'available_bikes',
    ]
)

In [102]:
# Cast the one-hot indicator columns to int64.
# get_dummies produces uint8 columns on pandas < 2.0 and bool columns on
# pandas >= 2.0, so both dtypes are handled; checking only "uint8" would
# silently skip the conversion on newer pandas.
for column in df_main.columns:
    if df_main[column].dtype in ("uint8", "bool"):
        df_main[column] = df_main[column].astype("int64")

In [103]:
# Check the dtype of every column after the cast.
df_main.dtypes

number                       int64
temperature                float64
humidity                     int64
wind_speed                 float64
clouds                       int64
availability_percentage    float64
hour                         int64
Clear                        int64
Clouds                       int64
Drizzle                      int64
Fog                          int64
Mist                         int64
Rain                         int64
Snow                         int64
Friday                       int64
Monday                       int64
Saturday                     int64
Sunday                       int64
Thursday                     int64
Tuesday                      int64
Wednesday                    int64
dtype: object

In [104]:
# Count the missing values in each column.
print(df_main.isna().sum())

number                     0
temperature                0
humidity                   0
wind_speed                 0
clouds                     0
availability_percentage    0
hour                       0
Clear                      0
Clouds                     0
Drizzle                    0
Fog                        0
Mist                       0
Rain                       0
Snow                       0
Friday                     0
Monday                     0
Saturday                   0
Sunday                     0
Thursday                   0
Tuesday                    0
Wednesday                  0
dtype: int64


### Divide data into training/validation and testing
- Drop all data for the 3rd March
- Train- first 3 weeks, Validation next 1 week. Test 1 week
- Starting at Saturday 00:00AM
- Dates Training 4/3/23 to end of 24/3/23
- Dates validation 25/3/23 to end of 31st
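- Dates testing 1/4/23 to end of 7/4/23
- Note: the split functions below currently use a random split per station rather than these date ranges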


In [105]:
from sklearn.model_selection import train_test_split

def divide_data(station_number, dataframe):
    """Return one station's rows plus a random 80/20 train/test split of them.

    Parameters
    ----------
    station_number : value compared against the ``number`` column.
    dataframe : frame holding ``number``, ``availability_percentage`` and
        the feature columns.

    Returns
    -------
    tuple
        ``(station_frame, x_train, y_train, x_test, y_test)``. The ``x_*``
        frames exclude ``availability_percentage`` and ``number``; the
        ``y_*`` series are the ``availability_percentage`` column.
    """
    target = "availability_percentage"
    non_features = [target, "number"]

    station_rows = dataframe.loc[dataframe["number"] == station_number].copy()

    # Fixed seed, so the same rows land in each part on every run.
    train_part, test_part = train_test_split(station_rows, test_size=0.2, random_state=42)

    return (
        station_rows,
        train_part.drop(columns=non_features),
        train_part[target],
        test_part.drop(columns=non_features),
        test_part[target],
    )


In [116]:
from sklearn.model_selection import train_test_split

def divide_data_intothree(station_number, dataframe):
    """Return one station's rows plus a random train/validation/test split.

    The rows are first split 80/20 into a remainder and a test part; the
    remainder is then split 75/25, giving 60% train, 20% validation and
    20% test overall. Both splits use the fixed seed 42.

    Parameters
    ----------
    station_number : value compared against the ``number`` column.
    dataframe : frame holding ``number``, ``availability_percentage`` and
        the feature columns.

    Returns
    -------
    tuple
        ``(station_frame, x_train, y_train, x_validation, y_validation,
        x_test, y_test)``. The ``x_*`` frames exclude
        ``availability_percentage`` and ``number``.
    """
    target = "availability_percentage"
    non_features = [target, "number"]

    station_rows = dataframe.loc[dataframe["number"] == station_number].copy()
    remainder, test_part = train_test_split(station_rows, test_size=0.2, random_state=42)
    train_part, validation_part = train_test_split(remainder, test_size=0.25, random_state=42)

    # Emit (features, target) for each part, in train / validation / test order.
    outputs = [station_rows]
    for part in (train_part, validation_part, test_part):
        outputs.append(part.drop(columns=non_features))
        outputs.append(part[target])
    return tuple(outputs)


# Check the relation between x columns and y column

If you find input columns that look unrelated to the target in the scatter plots below, you can drop them.

In [106]:
def check_xy(x, y):
    """Show one scatter plot per column of ``x`` against ``y``.

    Each figure is titled and x-labelled with the column name; the y axis
    is labelled "Availability".
    """
    for feature_name in x.columns:
        plt.scatter(x[feature_name], y)
        plt.xlabel(feature_name)
        plt.ylabel("Availability")
        plt.title(feature_name)
        plt.show()

# Training the model

In [107]:
def training_model(train_x, train_y, test_x, test_y):
    """Fit a LinearRegression on the training split and print its scores.

    Prints ``reg.score`` for the training and the test split (four decimal
    places each) and returns the fitted estimator.
    """
    reg = LinearRegression().fit(train_x, train_y)
    evaluations = (("train", train_x, train_y), ("test", test_x, test_y))
    for split_name, features, target in evaluations:
        print(f"{split_name} score : {reg.score(features, target):.4f}")
    return reg

# Save the model into a folder

In [108]:
import pickle


def save_model(model, stationnumber,
               model_dir='/Users/ikeoshuya/Documents/GitHub/dublinbikes/datamodel/model_RFR'):
    """Pickle ``model`` to ``<model_dir>/model_<stationnumber>.pkl``.

    Parameters
    ----------
    model : any picklable object (here, a fitted estimator).
    stationnumber : value interpolated into the file name.
    model_dir : folder that receives the file. The default is the original
        machine-specific absolute path, kept for backward compatibility;
        pass a different folder when running anywhere else.

    The folder is created if it does not exist yet.
    """
    import os  # local import so this cell does not depend on another cell's imports

    os.makedirs(model_dir, exist_ok=True)
    filename = os.path.join(model_dir, f'model_{stationnumber}.pkl')
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Execute the models with for loop

In [109]:
# Disabled: the per-station linear-regression loop is wrapped in a string
# literal, so nothing is trained or saved; the cell only echoes the text.
"""for station_number in df_main["number"].unique():
    df_station, x_train, y_train, x_test, y_test = divide_data(station_number, df_main)
    model = training_model(x_train, y_train, x_test, y_test)
    #save_model(model, station_number)"""

'for station_number in df_main["number"].unique():\n    df_station, x_train, y_train, x_test, y_test = divide_data(station_number, df_main)\n    model = training_model(x_train, y_train, x_test, y_test)\n    #save_model(model, station_number)'

# Random Forest Regressor

In [110]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [111]:

def training_model_randomForest(train_x, train_y, test_x, test_y, random_state=42):
    """Fit a RandomForestRegressor and print its train/test R-squared and test MAE.

    Parameters
    ----------
    train_x, train_y : features and target used for fitting.
    test_x, test_y : features and target used for the test metrics.
    random_state : seed for the forest. It defaults to 42 (the seed the
        split helpers use) so repeated runs give the same model and the
        same metrics; pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    rfr = RandomForestRegressor(random_state=random_state)
    rfr.fit(train_x, train_y)
    train_r2 = rfr.score(train_x, train_y)
    test_r2 = rfr.score(test_x, test_y)
    y_pred = rfr.predict(test_x)
    mae = mean_absolute_error(test_y, y_pred)
    print(f"Train R-squared: {train_r2:.4f}")
    print(f"Test R-squared: {test_r2:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    return rfr


In [112]:
# Disabled: the per-station random-forest loop is wrapped in a string
# literal, so nothing is trained or saved; the cell only echoes the text.
"""for station_number in df_main["number"].unique():
    df_station, x_train, y_train, x_test, y_test = divide_data(station_number, df_main)
    model = training_model_randomForest(x_train, y_train, x_test, y_test)
    save_model(model, station_number)"""

'for station_number in df_main["number"].unique():\n    df_station, x_train, y_train, x_test, y_test = divide_data(station_number, df_main)\n    model = training_model_randomForest(x_train, y_train, x_test, y_test)\n    save_model(model, station_number)'

# Check which columns are important to the model

In [81]:
# Rank the feature columns by the fitted model's feature_importances_.
# NOTE(review): this cell relies on `x_train` and `model` left in the kernel by a
# training loop. The loops above are disabled (string literals), so on a fresh
# kernel these names only exist after the tuning loop further down has run.
pd.DataFrame({"columns" : x_train.columns, "importances": model.feature_importances_}).sort_values("importances", ascending = False)

Unnamed: 0,columns,importances
0,temperature,0.1729427
2,wind_speed,0.1726033
1,humidity,0.1628933
4,hour,0.1614562
15,Sunday,0.1009734
13,Monday,0.07470717
3,clouds,0.0403932
6,Clouds,0.02202971
14,Saturday,0.02185722
16,Thursday,0.02009092


# Tuning the hyperparameters

In [117]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

def tuning_randomForest(train_x, train_y, val_x, val_y, test_x, test_y,
                        random_state=42, n_jobs=None):
    """Grid-search RandomForestRegressor hyperparameters and return the best model.

    The grid holds 3 * 3 * 3 * 3 * 5 = 405 combinations. With 5-fold
    cross-validation that is 2025 fits on the training split, plus one
    final refit.

    Parameters
    ----------
    train_x, train_y : data used for the cross-validated search and the refit.
    val_x, val_y : data used for the printed validation R-squared.
    test_x, test_y : data used for the printed mean absolute error.
    random_state : seed for the forests. It defaults to 42 (the seed the
        split helpers use) so the search is reproducible; pass ``None`` for
        the previous unseeded behaviour.
    n_jobs : forwarded to GridSearchCV. ``None`` keeps the previous
        single-process behaviour; ``-1`` uses all cores.

    Returns
    -------
    RandomForestRegressor
        The best estimator, fitted on the full training split.
    """
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', 5, 10, None]
    }

    rfr = RandomForestRegressor(random_state=random_state)

    grid_search = GridSearchCV(rfr, param_grid, cv=5, n_jobs=n_jobs)
    grid_search.fit(train_x, train_y)

    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # GridSearchCV refits the best configuration on the whole training split
    # by default (refit=True), so reuse that estimator instead of fitting an
    # identical forest a second time.
    rfr_best = grid_search.best_estimator_

    train_r2 = rfr_best.score(train_x, train_y)
    val_r2 = rfr_best.score(val_x, val_y)
    # The R-squared above is measured on the validation split; this MAE is
    # measured on the test split.
    mae = mean_absolute_error(test_y, rfr_best.predict(test_x))
    print(f"Train R-squared: {train_r2:.4f}")
    print(f"Validation R-squared: {val_r2:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")

    return rfr_best


In [119]:
# Tune, fit and save one random-forest model per station.
# A full hyperparameter grid search runs for every station.
for station_number in df_main["number"].unique():
    df_station, x_train, y_train, x_val, y_val, x_test, y_test = divide_data_intothree(
        station_number=station_number,
        dataframe=df_main,
    )
    model = tuning_randomForest(
        train_x=x_train,
        train_y=y_train,
        val_x=x_val,
        val_y=y_val,
        test_x=x_test,
        test_y=y_test,
    )
    save_model(model=model, stationnumber=station_number)

Best Hyperparameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Train R-squared: 0.9853
Validation R-squared: 0.9788
Mean Absolute Error: 0.0158
