# Task for Today  

***

## Bike Share Usage Prediction  

Given *data about a bike share service in London*, let's try to predict the **number of bikes** used in a given hour.

We will use an XGBoost regression model (`XGBRegressor`) to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [None]:
# Load the merged London bike-sharing CSV into a DataFrame
data = pd.read_csv('../input/london-bike-sharing-dataset/london_merged.csv')

In [None]:
# Show the loaded DataFrame as the cell output
data

In [None]:
# Print the column dtypes and non-null counts to check for missing values
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Turn the raw bike-share table into scaled train/test splits.

    The input is copied, so the caller's DataFrame is left untouched.

    Args:
        df: DataFrame containing at least the columns 'timestamp',
            'weather_code' and the target column 'cnt'.
        train_size: Fraction of rows assigned to the training split
            (forwarded to ``train_test_split``).
        random_state: Seed for the shuffled split, so results are
            reproducible.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The feature frames
        are standardized and keep their original index and column labels;
        the targets are the unscaled 'cnt' values.
    """
    df = df.copy()

    # Extract month, day, and hour features from the timestamp column.
    # The vectorized .dt accessor avoids a Python-level call per row.
    timestamps = pd.to_datetime(df['timestamp'])
    df['month'] = timestamps.dt.month
    df['day'] = timestamps.dt.day
    df['hour'] = timestamps.dt.hour
    df = df.drop('timestamp', axis=1)

    # One-hot encode weather_code column
    weather_dummies = pd.get_dummies(df['weather_code'], prefix='weather')
    df = pd.concat([df, weather_dummies], axis=1)
    df = df.drop('weather_code', axis=1)

    # Split df into X and y
    y = df['cnt']
    X = df.drop('cnt', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X. The scaler is fit on the training rows only, so no statistics
    # from the test rows leak into the transformation.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled feature splits and the matching target splits from the raw data
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Show the scaled training features as the cell output
X_train

In [None]:
# Show the training targets (the 'cnt' column) as the cell output
y_train

# Training

In [None]:
# Fit an XGBoost regressor with its default hyperparameters on the training split
model = XGBRegressor()
model.fit(X_train, y_train)

# Results

In [None]:
# Predict the bike counts for the held-out rows
y_pred = model.predict(X_test)

# Error terms shared by the two metrics below
residuals = y_test - y_pred
ss_res = np.sum(residuals**2)
ss_tot = np.sum((y_test - y_test.mean())**2)

# Root-mean-squared error and coefficient of determination
rmse = np.sqrt(np.mean(residuals**2))
r2 = 1 - (ss_res / ss_tot)

print("RMSE: {:.2f}".format(rmse))
print(" R^2: {:.4f}".format(r2))

# Scatter of predictions against ground truth; points near the diagonal are good fits
fig = px.scatter(
    x=y_pred, y=y_test,
    labels={'x': "Predicted", 'y': "Actual"},
    title="Actual vs. Predicted Values",
    width=700, height=700
)

fig.show()

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/-46XflOZewg