# Machine Learning Model for Dublin Bikes application

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

2023-04-04 18:27:02.229574: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Merge the two CSV files on the closest date and time for convenience

In [3]:
# Load the raw bike-station and weather exports. Both CSV files are read through
# relative paths, so they must be placed in the notebook's working directory.
df_bike = pd.read_csv(filepath_or_buffer="dynamic_4_4_23.csv")
df_weather = pd.read_csv(filepath_or_buffer="weather_30_3_23.csv")

In [4]:
# Keep only the last space-separated token of each time column, then combine it
# with the matching date column into a single `datetime` column per frame.
# The time columns are cleaned once with the vectorised .str accessor; the
# cleaned values are reused below instead of being split a second time.
df_bike['s_time'] = df_bike['s_time'].astype(str).str.split(' ').str[-1]
df_weather['w_time'] = df_weather['w_time'].astype(str).str.split(' ').str[-1]
df_bike['datetime'] = pd.to_datetime(df_bike['s_date'].astype(str) + ' ' + df_bike['s_time'])
df_weather['datetime'] = pd.to_datetime(df_weather['w_date'].astype(str) + ' ' + df_weather['w_time'])

In [5]:
# Both frames are ordered by timestamp before the as-of join below.
df_bike = df_bike.sort_values(by='datetime')
df_weather = df_weather.sort_values(by='datetime')

# Attach to every bike row the weather row whose timestamp is closest to it
# (earlier or later, because direction='nearest').
df_main = pd.merge_asof(
    left=df_bike,
    right=df_weather,
    on='datetime',
    direction='nearest',
)

# Cleaning the dataframe

In [6]:
# Inspect the merged column names before deciding which ones to drop.
df_main.columns

Index(['number', 'name', 'bike_stands', 'available_bike_stands',
       'available_bikes', 'status', 's_date', 's_time', 'datetime', 'latitude',
       'longitude', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon', 'temperature', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'humidity', 'visibility', 'wind_speed', 'wind_direction',
       'rain_1', 'rain_3', 'snow_1', 'snow_3', 'clouds', 'sunrise', 'sunset',
       'w_date', 'w_time'],
      dtype='object')

In [10]:
# Remove the station metadata, the raw date/time strings and the weather fields
# that are not carried forward into the rest of the notebook.
df_main = df_main.drop(
    columns=[
        'name', 'status', 's_date', 's_time',
        'latitude', 'longitude',
        'weather_id', 'weather_description', 'weather_icon',
        'feels_like', 'temp_min', 'temp_max',
        'pressure', 'visibility', 'wind_direction',
        'rain_1', 'rain_3', 'snow_1', 'snow_3',
        'clouds', 'sunrise', 'sunset',
        'w_date', 'w_time',
    ]
)

In [11]:
# Exclude every row recorded for station number 507.
# NOTE(review): the reason for excluding this station is not stated here — confirm.
df_main = df_main.drop(index=df_main.loc[df_main["number"] == 507].index)


In [13]:
# Target variable: available bikes divided by the station's number of stands.
# Despite the column name this is a ratio (e.g. 0.50), not a value out of 100.
df_main["availability_percentage"] = df_main["available_bikes"].div(df_main["bike_stands"])
# Display only: the rounded values are shown but not written back to the frame.
df_main["availability_percentage"].round(2)

2082       0.50
2083       0.30
2084       0.10
2085       0.00
2086       0.57
           ... 
1060225    0.02
1060226    0.47
1060227    0.05
1060228    0.35
1060229    0.92
Name: availability_percentage, Length: 1058148, dtype: float64

In [14]:
# Check which columns remain after the clean-up steps above.
df_main.columns

Index(['number', 'bike_stands', 'available_bike_stands', 'available_bikes',
       'datetime', 'weather_main', 'temperature', 'humidity', 'wind_speed',
       'availability_percentage'],
      dtype='object')

In [17]:
# Convert the temperature column from Kelvin to degrees Celsius. The
# Kelvin/Celsius offset is 273.15; the previous constant (275.15) shifted every
# value by two degrees.
# NOTE(review): assumes the weather feed reports Kelvin — confirm, and make sure
# any prediction-time code applies the same conversion as the training data.
# This cell is not idempotent: running it twice subtracts the offset twice.
df_main["temperature"] = df_main["temperature"] - 273.15

In [18]:
# List the distinct weather categories; each one becomes a one-hot column later.
df_main["weather_main"].unique()

array(['Drizzle', 'Clouds', 'Rain', 'Clear'], dtype=object)

In [20]:
# Derive calendar features from the timestamp column.
df_main["datetime"] = pd.to_datetime(df_main["datetime"])
df_main = df_main.assign(
    year=df_main["datetime"].dt.year,
    month=df_main["datetime"].dt.month,
    day=df_main["datetime"].dt.day,
    time=df_main["datetime"].dt.time,
    # The timestamp is rounded to the nearest hour first, so e.g. 23:40 maps to
    # hour 0 while day_of_week below still comes from the unrounded timestamp.
    hour=df_main["datetime"].dt.round("H").dt.hour.astype(int),
    # Full weekday name, e.g. "Monday".
    day_of_week=df_main["datetime"].dt.strftime('%A'),
)

# One-hot encoding for categorical features

In [19]:
# One indicator column per weather category, appended to the right of df_main.
one_hot = pd.get_dummies(data=df_main["weather_main"])
df_main = pd.concat(objs=[df_main, one_hot], axis="columns")

In [21]:
# One indicator column per weekday name, appended to the right of df_main.
one_hot = pd.get_dummies(data=df_main["day_of_week"])
df_main = pd.concat(objs=[df_main, one_hot], axis="columns")

In [24]:
# Drop the columns that have been one-hot encoded or replaced by `hour`, and the
# two raw counts from which availability_percentage was computed.
df_main = df_main.drop(
    columns=[
        'datetime', 'weather_main',
        'year', 'month', 'day', 'time', 'day_of_week',
        'available_bike_stands', 'available_bikes',
    ]
)

In [26]:
# Store the one-hot indicator columns as int64.
# pd.get_dummies produces uint8 columns on pandas < 2.0 and bool columns on
# pandas >= 2.0, so both dtypes are converted; checking only "uint8" would
# silently leave the indicators untouched on newer pandas.
for column in df_main.columns:
    if df_main[column].dtype in ("uint8", "bool"):
        df_main[column] = df_main[column].astype("int64")

In [27]:
# Confirm the dtype of every remaining column.
# NOTE(review): the saved output of this cell still lists available_bike_stands
# and available_bikes, which the drop cell above removes — the output looks
# stale; re-run the notebook top to bottom to refresh it.
df_main.dtypes

number                       int64
bike_stands                  int64
available_bike_stands        int64
available_bikes              int64
temperature                float64
humidity                     int64
wind_speed                 float64
availability_percentage    float64
Clear                        int64
Clouds                       int64
Drizzle                      int64
Rain                         int64
hour                         int64
Friday                       int64
Monday                       int64
Saturday                     int64
Sunday                       int64
Thursday                     int64
Tuesday                      int64
Wednesday                    int64
dtype: object

### Divide data into training/validation and testing
- Drop all data for 3 March
- Train on the first 3 weeks, validate on the next week, test on the final week
- Each period starts on a Saturday at 00:00
- Training dates: 4/3/23 to end of 24/3/23
- Validation dates: 25/3/23 to end of 31/3/23
- Testing dates: 1/4/23 to end of 7/4/23


In [None]:
from sklearn.model_selection import train_test_split

def divide_data(station_number, dataframe, test_size=0.2, random_state=42):
    """Select one station's rows and split them into train and test sets.

    Parameters
    ----------
    station_number
        Value matched against the ``"number"`` column of ``dataframe``.
    dataframe : pandas.DataFrame
        Must contain the ``"number"`` and ``"availability_percentage"`` columns;
        every other column is used as an input feature.
    test_size : float, optional
        Passed to ``train_test_split``. Defaults to 0.2, the value that was
        previously hard-coded.
    random_state : int, optional
        Seed passed to ``train_test_split``. Defaults to 42, as before.

    Returns
    -------
    tuple
        ``(df_station, x_train, y_train, x_test, y_test)`` where ``df_station``
        is a copy of the station's rows, the ``x_*`` frames hold the features
        and the ``y_*`` series hold ``availability_percentage``.

    Raises
    ------
    ValueError
        If ``dataframe`` has no rows for ``station_number``.
    """
    # Local name differs from the notebook-level df_main to avoid shadowing it.
    df_station = dataframe[dataframe["number"] == station_number].copy()
    if df_station.empty:
        raise ValueError(f"no rows found for station number {station_number}")

    # NOTE(review): this is a shuffled random split, whereas the notes in this
    # notebook describe a chronological train/validation/test split — confirm
    # which one is intended.
    train, test = train_test_split(
        df_station, test_size=test_size, random_state=random_state
    )

    # The station number is constant within one station, so it is removed from
    # the features together with the target column.
    non_feature_columns = ["availability_percentage", "number"]
    x_train = train.drop(non_feature_columns, axis=1)
    y_train = train["availability_percentage"]
    x_test = test.drop(non_feature_columns, axis=1)
    y_test = test["availability_percentage"]

    return df_station, x_train, y_train, x_test, y_test


# Check the relation between x columns and y column

If the scatter plots below show input columns that are unrelated to availability, you can drop those columns.

In [None]:
def check_xy(x, y):
    """Show one scatter plot per column of ``x`` against ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Input features; each column is plotted on the horizontal axis in turn.
    y : array-like
        Values plotted on the vertical axis, labelled "Availability".
    """
    for feature_name in x.columns:
        feature_values = x[feature_name]
        plt.scatter(feature_values, y)
        plt.xlabel(feature_name)
        plt.ylabel("Availability")
        plt.title(feature_name)
        # Render this figure before moving on to the next column.
        plt.show()

# Training the model

In [None]:
def training_model(train_x, train_y, test_x, test_y):
    """Fit a linear regression and print its train and test scores.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Held-out features and target, used only for scoring.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    model = LinearRegression().fit(train_x, train_y)

    # Scores come from the estimator's own .score() method.
    train_score = model.score(train_x, train_y)
    print(f"train score : {train_score}")
    test_score = model.score(test_x, test_y)
    print(f"test score : {test_score}")

    return model

# Save the model into a folder

In [28]:
import pickle


def save_model(model, stationnumber, model_dir=None):
    """Pickle ``model`` to ``<model_dir>/model_<stationnumber>.pkl``.

    Parameters
    ----------
    model
        Any picklable object (here: a fitted estimator).
    stationnumber
        Identifier interpolated into the file name.
    model_dir : str or path-like, optional
        Directory to write into; it is created if it does not exist. When
        omitted, the original hard-coded location is used so existing callers
        keep working.
        NOTE(review): that default is an absolute path on one developer's
        machine — pass ``model_dir`` explicitly everywhere else.
    """
    import os  # local import: keeps this cell self-contained

    if model_dir is None:
        model_dir = '/Users/ikeoshuya/Documents/GitHub/dublinbikes/datamodel/models'

    # Create the target directory up front instead of failing inside open().
    os.makedirs(model_dir, exist_ok=True)

    filename = os.path.join(model_dir, f'model_{stationnumber}.pkl')
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Train and save a model for each station with a for loop

In [None]:
"""for station_number in df_main["number"].unique():
    df_station, x_train, y_train, x_test, y_test = divide_data(station_number, df_main)
    model = training_model(x_train, y_train, x_test, y_test)
    save_model(model, station_number)"""