# Welcome to My Ted Talk

In [50]:
import urllib.request, os, sys, json, pickle, re
from os.path import join
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

In [3]:
# Root of the project's data directory, expressed relative to the
# directory the notebook is run from.
path = join("..", "data")
path

'../data'

## 1. Get Data

In [4]:
# Location of the SeoulBikeData CSV on the UCI machine-learning archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'

In [6]:
# Local destination of the unmodified download: <path>/raw/SeoulBikeData.csv.
raw_data = join(path, 'raw', "SeoulBikeData.csv")

In [7]:
# urlretrieve does not create missing parent directories, so make sure the
# raw-data folder exists before downloading into it.
os.makedirs(os.path.dirname(raw_data), exist_ok=True)
# Download the CSV; returns a (local filename, response headers) tuple.
urllib.request.urlretrieve(url, raw_data)

('../data/raw/SeoulBikeData.csv', <http.client.HTTPMessage at 0x7f4d24b8a290>)

In [111]:
%%writefile ../src/get_data.py

"""Download the raw SeoulBikeData CSV into ../data/raw.

Paths are relative, so the script is expected to be run from a directory
that sits next to ``data`` (for example ``src``).
"""
import os
import urllib.request

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join("..", 'data', 'raw')
filename = 'SeoulBikeData.csv'

# exist_ok=True creates the folder only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

# Fetch the file and store it under its original name.
urllib.request.urlretrieve(url, os.path.join(path, filename))

Writing ../src/get_data.py


## 2. Prepare Data

In [10]:
# Shell out to print the first five lines (header plus four rows) of the raw CSV.
!head -n 5 {raw_data}

Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,173,2,-6,39,1,2000,-17.7,0,0,0,Winter,No Holiday,Yes
01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0,0,0,Winter,No Holiday,Yes


In [82]:
# Decode as ISO-8859-1 rather than pandas' default UTF-8: the header holds a
# non-ASCII character (presumably the degree sign in the temperature columns).
data = pd.read_csv(raw_data, encoding='iso-8859-1')
# Display the first rows of the loaded frame.
data.head()

In [82]:
# Show the raw column labels as loaded from the CSV header.
data.columns

In [83]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
data.info()

In [84]:
def clean_col_names(list_of_cols):
    """Normalise column labels into lowercase, underscore-separated names.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the remainder is lower-cased and each space character is
    replaced by an underscore.

    Args:
        list_of_cols: iterable of column labels (strings).

    Returns:
        A list with the cleaned labels, in the same order as the input.
    """
    cleaned = []
    for label in list_of_cols:
        # Strip punctuation/symbols first so units such as "(m/s)" collapse
        # into plain alphanumerics before the spaces are converted.
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', label)
        cleaned.append(alnum_only.lower().replace(r" ", "_"))
    return cleaned

In [85]:
# Replace the raw header labels with their cleaned, lowercase/underscore form.
data.columns = clean_col_names(data.columns)

In [86]:
def extract_dates(data):
    """Expand the ``date`` column into calendar features, modifying ``data`` in place.

    The ``date`` column is parsed as day/month/year, the frame is sorted by
    ``date`` then ``hour``, one column per calendar attribute is appended,
    and finally the ``date`` column itself is dropped.

    Args:
        data: DataFrame with a ``date`` column holding ``dd/mm/YYYY`` strings
            and an ``hour`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    data['date'] = pd.to_datetime(data['date'], format="%d/%m/%Y")
    data.sort_values(by=['date', 'hour'], inplace=True)

    stamp = data['date'].dt
    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        "year": stamp.year,
        "month": stamp.month,
        "week": stamp.isocalendar().week,
        "day": stamp.day,
        "day_of_week": stamp.dayofweek,
        "day_of_year": stamp.dayofyear,
        "is_month_end": stamp.is_month_end,
        "is_month_start": stamp.is_month_start,
        "is_quarter_end": stamp.is_quarter_end,
        "is_quarter_start": stamp.is_quarter_start,
        "is_year_end": stamp.is_year_end,
        "is_year_start": stamp.is_year_start,
    }
    for column, values in derived.items():
        data[column] = values

    # The raw timestamp is fully represented by the derived columns now.
    data.drop(columns='date', inplace=True)
    return data

In [None]:
# Sort by date/hour and swap the date column for derived calendar features.
data = extract_dates(data)

In [87]:
# Destination of the cleaned dataset: <path>/interim/clean_data.parquet.
data_interim = join(path, "interim", "clean_data.parquet")

In [90]:
# One-hot encode the three categorical columns; each is replaced by
# indicator columns (one per distinct value).
data = pd.get_dummies(data=data, columns=['holiday', 'seasons', 'functioning_day'])

In [88]:
# Persist the cleaned frame as Snappy-compressed Parquet.
data.to_parquet(data_interim, compression="snappy")

In [115]:
%%writefile ../src/prepare.py

import pandas as pd, os, sys, re

# Input (raw download) and output (cleaned Parquet) locations, relative to
# the directory the script is run from.
raw_data = os.path.join("..", "data", 'raw', "SeoulBikeData.csv")
data_interim = os.path.join("..", "data", "interim", "clean_data.parquet")

# Decode as ISO-8859-1 rather than pandas' default UTF-8.
data = pd.read_csv(raw_data, encoding='iso-8859-1')

def clean_col_names(list_of_cols):
    """Normalise column labels into lowercase, underscore-separated names.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the remainder is lower-cased and each space character is
    replaced by an underscore.

    Args:
        list_of_cols: iterable of column labels (strings).

    Returns:
        A list with the cleaned labels, in the same order as the input.
    """
    cleaned = []
    for label in list_of_cols:
        # Strip punctuation/symbols first so units such as "(m/s)" collapse
        # into plain alphanumerics before the spaces are converted.
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', label)
        cleaned.append(alnum_only.lower().replace(r" ", "_"))
    return cleaned

def extract_dates(data):
    """Expand the ``date`` column into calendar features, modifying ``data`` in place.

    The ``date`` column is parsed as day/month/year, the frame is sorted by
    ``date`` then ``hour``, one column per calendar attribute is appended,
    and finally the ``date`` column itself is dropped.

    Args:
        data: DataFrame with a ``date`` column holding ``dd/mm/YYYY`` strings
            and an ``hour`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    data['date'] = pd.to_datetime(data['date'], format="%d/%m/%Y")
    data.sort_values(by=['date', 'hour'], inplace=True)

    stamp = data['date'].dt
    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        "year": stamp.year,
        "month": stamp.month,
        "week": stamp.isocalendar().week,
        "day": stamp.day,
        "day_of_week": stamp.dayofweek,
        "day_of_year": stamp.dayofyear,
        "is_month_end": stamp.is_month_end,
        "is_month_start": stamp.is_month_start,
        "is_quarter_end": stamp.is_quarter_end,
        "is_quarter_start": stamp.is_quarter_start,
        "is_year_end": stamp.is_year_end,
        "is_year_start": stamp.is_year_start,
    }
    for column, values in derived.items():
        data[column] = values

    # The raw timestamp is fully represented by the derived columns now.
    data.drop(columns='date', inplace=True)
    return data

# Clean the header labels, derive calendar features from the date column and
# one-hot encode the categorical columns.
data.columns = clean_col_names(data.columns)
data = extract_dates(data)
data = pd.get_dummies(data=data, columns=['holiday', 'seasons', 'functioning_day'])

# to_parquet does not create missing parent directories, so make sure the
# interim folder exists before writing into it.
os.makedirs(os.path.dirname(data_interim), exist_ok=True)
data.to_parquet(data_interim, compression="snappy")

Overwriting ../src/prepare.py


## 3. Split Data

In [91]:
# split is the fraction of rows held out for testing; n_train is the number
# of leading rows kept for training (truncated to an int).
split = 0.30
n_train = int(len(data) - len(data) * split)

In [94]:
# Output locations of the two splits under <path>/processed.
train_path = join(path, 'processed', 'train.parquet')
test_path = join(path, 'processed', 'test.parquet')

In [95]:
# Positional split: the first n_train rows become the training set, the rest
# the test set. Each part gets a fresh 0-based index before being saved.
data[:n_train].reset_index(drop=True).to_parquet(train_path, compression="snappy")
data[n_train:].reset_index(drop=True).to_parquet(test_path, compression="snappy")

In [123]:
%%writefile ../src/split_data.py

import pandas as pd, os

# Input (cleaned data) and output (train/test splits) locations, relative to
# the directory the script is run from.
data_interim = os.path.join("..", "data", "interim", "clean_data.parquet")
train_path = os.path.join("..", "data", 'processed', 'train.parquet')
test_path = os.path.join("..", "data", 'processed', 'test.parquet')

data = pd.read_parquet(data_interim)

# split is the fraction of rows held out for testing; n_train is the number
# of leading rows kept for training (truncated to an int).
split = 0.30
n_train = int(len(data) - len(data) * split)

# to_parquet does not create missing parent directories; both outputs live
# in the same folder, so creating it once is enough.
os.makedirs(os.path.dirname(train_path), exist_ok=True)

# Positional split: first n_train rows for training, the remainder for
# testing, each saved with a fresh 0-based index.
data[:n_train].reset_index(drop=True).to_parquet(train_path, compression="snappy")
data[n_train:].reset_index(drop=True).to_parquet(test_path, compression="snappy")

Overwriting ../src/split_data.py


## 4. Train Model

In [96]:
# Load the training split; pop() removes the target column from X_train and
# returns it, leaving only the features behind.
X_train = pd.read_parquet(train_path)
y_train = X_train.pop('rented_bike_count')

In [97]:
# Model settings: random seed for reproducible fits and number of trees.
seed = 42
n_est = 100

In [98]:
%%time

# Fit a random-forest regressor on the NumPy arrays of features and target.
rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

CPU times: user 3.67 s, sys: 25.9 ms, total: 3.7 s
Wall time: 3.81 s


In [100]:
# Peek at the model's predictions for the first ten training rows.
rf.predict(X_train.values)[:10]

array([236.39, 216.06, 159.82, 102.32,  80.01,  91.96, 166.18, 422.35,
       843.38, 447.64])

In [101]:
# Serialise the fitted model to ../models/rf_model.pkl.
with open('../models/rf_model.pkl', "wb") as fd:
    pickle.dump(rf, fd)

In [121]:
%%writefile ../src/train_model.py

"""Fit a random-forest regressor on the training split and pickle it.

Reads ../data/processed/train.parquet and writes ../models/rf_model.pkl;
paths are relative to the directory the script is run from.
"""
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train_path = os.path.join("..", "data", 'processed', 'train.parquet')
model_path = os.path.join('..', 'models', 'rf_model.pkl')

# pop() removes the target column from X_train and returns it.
X_train = pd.read_parquet(train_path)
y_train = X_train.pop('rented_bike_count')

# Model settings: random seed for reproducible fits and number of trees.
seed = 42
n_est = 100

rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

# open() does not create missing parent directories, so make sure the
# models folder exists before writing the pickle.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as fd:
    pickle.dump(rf, fd)

Writing ../src/train_model.py


## 5. Evaluate

In [102]:
# Reload the pickled model from disk into `model`.
with open('../models/rf_model.pkl', "rb") as fd:
    model = pickle.load(fd)

In [105]:
# Load the test split; pop() removes the target column from X_test and
# returns it, leaving only the features behind.
X_test = pd.read_parquet(test_path)
y_test = X_test.pop('rented_bike_count')

In [107]:
# Predict the target for every test row and display the first ten values.
predictions = model.predict(X_test.values)
predictions[:10]

array([ 701.82,  711.5 ,  674.75,  722.74,  879.71, 1303.26, 2228.5 ,
       1901.35, 1787.42, 1761.24])

In [108]:
# Error metrics on the test set: mean absolute error, root of the mean
# squared error, and the model's own score (reported below as R^2).
mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
r2_score = model.score(X_test.values, y_test.values)

In [109]:
# Report the metrics: errors with two decimals, R^2 with three.
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Square Error: {rmse:.2f}")
print(f"R^2: {r2_score:.3f}")

Mean Absolute Error: 267.47
Root Mean Square Error: 394.82
R^2: 0.629


In [110]:
# Store the three metrics as indented JSON in ../reports/metrics.json.
with open(join('..', "reports", 'metrics.json'), "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

In [124]:
%%writefile ../src/evaluate.py

"""Score the pickled model on the test split and save the metrics as JSON.

Reads ../models/rf_model.pkl and ../data/processed/test.parquet, writes
../reports/metrics.json; paths are relative to the directory the script is
run from.
"""
import json
import os
import pickle

import numpy as np
import pandas as pd
import sklearn.metrics as metrics

model_path = os.path.join('..', 'models', 'rf_model.pkl')
metrics_path = os.path.join('..', "reports", 'metrics.json')
test_path = os.path.join("..", "data", 'processed', 'test.parquet')

# NOTE: unpickling can execute arbitrary code; only load model files that
# were produced by this project's own training step.
with open(model_path, "rb") as fd:
    model = pickle.load(fd)

# pop() removes the target column from X_test and returns it.
X_test = pd.read_parquet(test_path)
y_test = X_test.pop('rented_bike_count')

# Predictions must be computed here; the metrics below depend on them.
predictions = model.predict(X_test.values)

mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
r2_score = model.score(X_test.values, y_test.values)

# open() does not create missing parent directories, so make sure the
# reports folder exists before writing the metrics file.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

Writing ../src/evaluate.py
