In [None]:
# Build a single 'Date' column by parsing the "<Year> <Month>" text of each row
year_as_text = ire_dub_bus_months_passengers['Year'].astype(str)
year_month_text = year_as_text + ' ' + ire_dub_bus_months_passengers['Month']
ire_dub_bus_months_passengers['Date'] = pd.to_datetime(year_month_text)

# Preview the dataframe with the new column
ire_dub_bus_months_passengers.head()

In [None]:
# Draw the passenger series on an explicit 12x5 inch figure/axes pair
trend_fig, trend_ax = plt.subplots(figsize=(12, 5))
trend_ax.plot(
    ire_dub_bus_months_passengers['Date'],
    ire_dub_bus_months_passengers['Passengers (Mn.)'],
    linewidth=1.5,
)

# Title and axis labels
trend_ax.set_title('Yearly Dublin Bus Passengers (Mn.) by Season Trend: 2014 to 2022', size=14)
trend_ax.set_xlabel("Year", size=12)
trend_ax.set_ylabel("Passengers (Mn.)", size=12)
trend_fig.tight_layout()

# Light-grey horizontal grid lines only; no vertical grid
trend_ax.yaxis.grid(True, color='lightgrey', linestyle='-', linewidth=0.8)
trend_ax.xaxis.grid(False)
plt.show()

In [None]:
# Drop the years 2020, 2021 and 2022 so that the COVID period does not skew the trend.
# NOTE(review): the original comment only mentioned 2020 and 2021, but the query has
# always excluded 2022 as well -- confirm that dropping 2022 is intentional.
# .copy() turns the filtered slice into an independent DataFrame, so columns can be
# added to it later without pandas raising SettingWithCopyWarning.
ire_dub_bus_months_passengers_filtered = (
    ire_dub_bus_months_passengers
    .query('Year != 2020 and Year != 2021 and Year != 2022')
    .copy()
)

# Show the last rows to verify which years remain
ire_dub_bus_months_passengers_filtered.tail()

In [None]:
# Derive the year range from the filtered data so the title always matches what is
# actually plotted (the previous hard-coded "2014 to 2022" was copied from the
# unfiltered plot even though 2020-2022 have been removed).
filtered_first_year = int(ire_dub_bus_months_passengers_filtered['Year'].min())
filtered_last_year = int(ire_dub_bus_months_passengers_filtered['Year'].max())

# Draw the filtered passenger series on an explicit 12x5 inch figure/axes pair
filtered_fig, filtered_ax = plt.subplots(figsize=(12, 5))
filtered_ax.plot(
    ire_dub_bus_months_passengers_filtered['Date'],
    ire_dub_bus_months_passengers_filtered['Passengers (Mn.)'],
    linewidth=1.5,
)

# Title and axis labels
filtered_ax.set_title(
    'Yearly Dublin Bus Passengers (Mn.) by Season Trend: {} to {}'.format(
        filtered_first_year, filtered_last_year
    ),
    size=14,
)
filtered_ax.set_xlabel("Year", size=12)
filtered_ax.set_ylabel("Passengers (Mn.)", size=12)
filtered_fig.tight_layout()

# Light-grey horizontal grid lines only; no vertical grid
filtered_ax.yaxis.grid(True, color='lightgrey', linestyle='-', linewidth=0.8)
filtered_ax.xaxis.grid(False)
plt.show()

In [None]:
# Re-parse 'Date' so the column is guaranteed to hold datetime values
parsed_dates = pd.to_datetime(ire_dub_bus_months_passengers_filtered['Date'])
ire_dub_bus_months_passengers_filtered['Date'] = parsed_dates

# Store each date's ordinal (the integer returned by toordinal()) as a numeric column
ire_dub_bus_months_passengers_filtered['Date (Int)'] = parsed_dates.map(lambda stamp: stamp.toordinal())

# Preview the frame to check the new column
ire_dub_bus_months_passengers_filtered.head()

In [None]:
# Feature matrix: the ordinal dates reshaped into a single column (n_samples, 1)
date_ordinals = ire_dub_bus_months_passengers_filtered['Date (Int)'].to_numpy()
X = date_ordinals.reshape(-1, 1)

# Target vector: passenger counts in millions
y = ire_dub_bus_months_passengers_filtered['Passengers (Mn.)'].to_numpy()

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
import numpy as np
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so prefer the new name and fall back to the old one on older releases.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

# Fit a regressor on the first n_train samples, score it on the rest and plot the outcome
def eval_on_features(features, target, regressor, n_train):
    """Fit ``regressor`` on a leading slice of the data and plot actual vs. predicted.

    The first ``n_train`` rows of ``features``/``target`` are used for fitting and the
    remaining rows for evaluation. The value returned by ``regressor.score`` on the
    held-out rows is printed, and a figure showing the train/test targets together
    with the predictions for both splits is displayed.

    Parameters
    ----------
    features : numpy.ndarray
        Date ordinals accepted by ``datetime.fromordinal``; the array is flattened
        to build the x-axis dates.
    target : sequence
        Values to predict, sliced at the same position as ``features``.
    regressor : object
        Estimator exposing ``fit``, ``score`` and ``predict``.
    n_train : int
        Number of leading samples used for training.

    Returns
    -------
    None
    """
    # Positional split: everything before n_train is training data
    train_features, test_features = features[:n_train], features[n_train:]
    train_target, test_target = target[:n_train], target[n_train:]

    regressor.fit(train_features, train_target)
    test_score = regressor.score(test_features, test_target)
    print('Test-set R^2: {:.2f}'.format(test_score))

    test_predictions = regressor.predict(test_features)
    train_predictions = regressor.predict(train_features)

    # Turn the ordinal integers back into datetime objects for the x-axis
    ordinals_to_dates = np.vectorize(datetime.fromordinal)
    plot_dates = ordinals_to_dates(features.flatten())
    train_dates, test_dates = plot_dates[:n_train], plot_dates[n_train:]

    # Actual values as solid lines, predictions as dashed lines
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(train_dates, train_target, label='train')
    ax.plot(test_dates, test_target, '-', label='test')
    ax.plot(train_dates, train_predictions, '--', label='prediction train')
    ax.plot(test_dates, test_predictions, '--', label='prediction test')
    ax.yaxis.grid(True, color='lightgrey', linestyle='-', linewidth=0.8)
    ax.xaxis.grid(False)
    ax.legend(loc=(1.01, 0))  # just outside the right edge of the axes
    ax.set_xlabel('Date')
    ax.set_ylabel('Passengers (Mn.)')

    # Rotate the date labels so they stay readable
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    fig.tight_layout()
    plt.show()

In [None]:
# Imported here so this cell does not depend on an import made elsewhere in the notebook
from sklearn.ensemble import RandomForestRegressor

# Use the first 70% of the samples for training; the remainder is held out for testing
n_train = int(len(X) * 0.7)

# Random forest with 100 trees; random_state fixes the seed so repeated runs agree
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model, print the test-set R^2 and plot actual vs. predicted passengers
eval_on_features(X, y, regressor, n_train)