# Welcome to My Ted Talk

In [1]:
import urllib.request, os, sys, json, pickle, re
from os.path import join
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

In [2]:
path = join("..", "data")
path

'../data'

## 1. Get Data

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'

In [4]:
raw_data = join(path, 'raw', "SeoulBikeData.csv")

In [5]:
# Make sure the destination folder exists before downloading, as the generated
# src/get_data.py script does; urlretrieve does not create missing directories.
os.makedirs(os.path.dirname(raw_data), exist_ok=True)
urllib.request.urlretrieve(url, raw_data)

('../data/raw/SeoulBikeData.csv', <http.client.HTTPMessage at 0x7f6e986d6d10>)

In [6]:
!head -n 5 ../data/raw/SeoulBikeData.csv

Date,Rented Bike Count,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,173,2,-6,39,1,2000,-17.7,0,0,0,Winter,No Holiday,Yes
01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0,0,0,Winter,No Holiday,Yes


In [7]:
%%writefile ../src/get_data.py

import urllib.request, os

# Download the raw Seoul bike-sharing CSV into data/raw.
# All paths are relative to the current working directory.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join('data', 'raw')
filename = 'SeoulBikeData.csv'

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(path, exist_ok=True)

urllib.request.urlretrieve(url, os.path.join(path, filename))
print("File Downloaded Successfully!")

Writing ../src/get_data.py


## 2. Prepare Data

In [9]:
!head -n 10 {raw_data}

Date,Rented Bike Count,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,173,2,-6,39,1,2000,-17.7,0,0,0,Winter,No Holiday,Yes
01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,78,4,-6,36,2.3,2000,-18.6,0,0,0,Winter,No Holiday,Yes
01/12/2017,100,5,-6.4,37,1.5,2000,-18.7,0,0,0,Winter,No Holiday,Yes
01/12/2017,181,6,-6.6,35,1.3,2000,-19.5,0,0,0,Winter,No Holiday,Yes
01/12/2017,460,7,-7.4,38,0.9,2000,-19.3,0,0,0,Winter,No Holiday,Yes
01/12/2017,930,8,-7.6,37,1.1,2000,-19.8,0.01,0,0,Winter,No Holiday,Yes


In [10]:
# The degree sign in the column headers is printed as a replacement character
# by `head` above, so the file is decoded as ISO-8859-1 instead of the default.
data = pd.read_csv(raw_data, encoding='iso-8859-1')
data.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [11]:
data.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       8760 non-null   object 
 1   Rented Bike Count          8760 non-null   int64  
 2   Hour                       8760 non-null   int64  
 3   Temperature(°C)            8760 non-null   float64
 4   Humidity(%)                8760 non-null   int64  
 5   Wind speed (m/s)           8760 non-null   float64
 6   Visibility (10m)           8760 non-null   int64  
 7   Dew point temperature(°C)  8760 non-null   float64
 8   Solar Radiation (MJ/m2)    8760 non-null   float64
 9   Rainfall(mm)               8760 non-null   float64
 10  Snowfall (cm)              8760 non-null   float64
 11  Seasons                    8760 non-null   object 
 12  Holiday                    8760 non-null   object 
 13  Functioning Day            8760 non-null   objec

In [13]:
def clean_col_names(list_of_cols):
    """Return normalised versions of the given column names.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the remainder is lower-cased, and each space is replaced by an
    underscore. The result is a new list in the same order as the input.
    """
    cleaned = []
    for col in list_of_cols:
        stripped = re.sub(r'[^a-zA-Z0-9\s]', '', col)
        cleaned.append(stripped.lower().replace(r" ", "_"))
    return cleaned

In [14]:
data.columns = clean_col_names(data.columns)
data.columns

Index(['date', 'rented_bike_count', 'hour', 'temperaturec', 'humidity',
       'wind_speed_ms', 'visibility_10m', 'dew_point_temperaturec',
       'solar_radiation_mjm2', 'rainfallmm', 'snowfall_cm', 'seasons',
       'holiday', 'functioning_day'],
      dtype='object')

In [15]:
data.tail()

Unnamed: 0,date,rented_bike_count,hour,temperaturec,humidity,wind_speed_ms,visibility_10m,dew_point_temperaturec,solar_radiation_mjm2,rainfallmm,snowfall_cm,seasons,holiday,functioning_day
8755,30/11/2018,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,30/11/2018,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,30/11/2018,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,30/11/2018,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes
8759,30/11/2018,584,23,1.9,43,1.3,1909,-9.3,0.0,0.0,0.0,Autumn,No Holiday,Yes


In [16]:
def extract_dates(data):
    """Expand the ``date`` column into calendar feature columns.

    The rows are sorted by ``date`` and then ``hour``, one column per calendar
    attribute is appended, and the ``date`` column itself is removed.

    The input frame is not modified: a new frame is returned, so the function
    can safely be called again on the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column of ``"%d/%m/%Y"`` strings and an
        ``hour`` column.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``data`` without ``date`` and with the added columns
        ``year``, ``month``, ``week``, ``day``, ``day_of_week``,
        ``day_of_year`` and the ``is_*`` month/quarter/year boundary flags.

    Raises
    ------
    KeyError
        If ``date`` or ``hour`` is missing.
    ValueError
        If a date string does not match ``"%d/%m/%Y"``.
    """
    parsed = pd.to_datetime(data['date'], format="%d/%m/%Y")
    # assign() and sort_values() both return new frames, so the caller's
    # object is never mutated.
    result = data.assign(date=parsed).sort_values(['date', 'hour'])
    dates = result['date'].dt
    result["year"] = dates.year
    result["month"] = dates.month
    result["week"] = dates.isocalendar().week
    result["day"] = dates.day
    result["day_of_week"] = dates.dayofweek
    result["day_of_year"] = dates.dayofyear
    result["is_month_end"] = dates.is_month_end
    result["is_month_start"] = dates.is_month_start
    result["is_quarter_end"] = dates.is_quarter_end
    result["is_quarter_start"] = dates.is_quarter_start
    result["is_year_end"] = dates.is_year_end
    result["is_year_start"] = dates.is_year_start
    return result.drop(columns='date')

In [17]:
data = extract_dates(data)

In [18]:
data.head()

Unnamed: 0,rented_bike_count,hour,temperaturec,humidity,wind_speed_ms,visibility_10m,dew_point_temperaturec,solar_radiation_mjm2,rainfallmm,snowfall_cm,...,week,day,day_of_week,day_of_year,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,48,1,4,335,False,True,False,False,False,False
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,48,1,4,335,False,True,False,False,False,False
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,48,1,4,335,False,True,False,False,False,False
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,48,1,4,335,False,True,False,False,False,False
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,48,1,4,335,False,True,False,False,False,False


In [19]:
data_interim = join(path, "interim", "clean_data.parquet")

In [20]:
data[['holiday', 'seasons', 'functioning_day']].head()

Unnamed: 0,holiday,seasons,functioning_day
0,No Holiday,Winter,Yes
1,No Holiday,Winter,Yes
2,No Holiday,Winter,Yes
3,No Holiday,Winter,Yes
4,No Holiday,Winter,Yes


In [21]:
data = pd.get_dummies(data=data, columns=['holiday', 'seasons', 'functioning_day'])

In [22]:
data.head()

Unnamed: 0,rented_bike_count,hour,temperaturec,humidity,wind_speed_ms,visibility_10m,dew_point_temperaturec,solar_radiation_mjm2,rainfallmm,snowfall_cm,...,is_year_end,is_year_start,holiday_Holiday,holiday_No Holiday,seasons_Autumn,seasons_Spring,seasons_Summer,seasons_Winter,functioning_day_No,functioning_day_Yes
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,False,False,0,1,0,0,0,1,0,1
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,False,False,0,1,0,0,0,1,0,1
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,False,False,0,1,0,0,0,1,0,1
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,False,False,0,1,0,0,0,1,0,1
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,False,False,0,1,0,0,0,1,0,1


In [23]:
data.to_parquet(data_interim, compression="snappy")

### Uploading Data to S3 with the AWS Python SDK (boto3)

In [25]:
import boto3
from pprint import pprint

In [26]:
s3_client = boto3.client('s3')

In [27]:
pprint(s3_client.list_buckets())

{'Buckets': [{'CreationDate': datetime.datetime(2022, 7, 17, 21, 50, 3, tzinfo=tzlocal()),
              'Name': 'sagemaker-studio-007364072835-20kgweknjbe'},
             {'CreationDate': datetime.datetime(2022, 7, 17, 23, 29, 51, tzinfo=tzlocal()),
              'Name': 'sagemaker-studio-007364072835-hwxvwzewhpq'},
             {'CreationDate': datetime.datetime(2022, 9, 12, 14, 4, 31, tzinfo=tzlocal()),
              'Name': 'september22-workshop'},
             {'CreationDate': datetime.datetime(2022, 5, 12, 21, 20, 8, tzinfo=tzlocal()),
              'Name': 'testing-012345'},
             {'CreationDate': datetime.datetime(2022, 9, 11, 21, 51, 12, tzinfo=tzlocal()),
              'Name': 'testing-aws-4-decd'}],
 'Owner': {'ID': '7c268a684733d1b8292a935004a1ccd6336186efd4d9c1a6d3e94f7718484579'},
 'ResponseMetadata': {'HTTPHeaders': {'content-type': 'application/xml',
                                      'date': 'Mon, 12 Sep 2022 14:39:06 GMT',
                                   

In [30]:
pprint(s3_client.list_buckets()["Buckets"][2]["Name"])

'september22-workshop'


In [31]:
data_interim

'../data/interim/clean_data.parquet'

In [32]:
s3_client.upload_file(data_interim, "september22-workshop", "interim/clean_data.parquet")

In [33]:
%%writefile ../src/prepare.py

import pandas as pd, os, sys, re

# Input CSV and output parquet locations, relative to the current working directory.
raw_data = os.path.join("data", 'raw', "SeoulBikeData.csv")
data_interim = os.path.join("data", "interim", "clean_data.parquet")

# The file is decoded as ISO-8859-1 rather than with pandas' default encoding.
data = pd.read_csv(raw_data, encoding='iso-8859-1')

def clean_col_names(list_of_cols):
    """Return normalised versions of the given column names.

    Characters other than ASCII letters, digits and whitespace are dropped,
    the remainder is lower-cased, and spaces are turned into underscores.
    """
    def _normalise(name):
        kept = re.sub(r'[^a-zA-Z0-9\s]', '', name)
        return kept.lower().replace(r" ", "_")

    return list(map(_normalise, list_of_cols))

def extract_dates(data):
    """Expand the ``date`` column into calendar feature columns.

    The rows are sorted by ``date`` and then ``hour``, one column per calendar
    attribute is appended, and the ``date`` column itself is removed.

    The input frame is not modified: a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column of ``"%d/%m/%Y"`` strings and an
        ``hour`` column.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``data`` without ``date`` and with the added columns
        ``year``, ``month``, ``week``, ``day``, ``day_of_week``,
        ``day_of_year`` and the ``is_*`` month/quarter/year boundary flags.

    Raises
    ------
    KeyError
        If ``date`` or ``hour`` is missing.
    ValueError
        If a date string does not match ``"%d/%m/%Y"``.
    """
    parsed = pd.to_datetime(data['date'], format="%d/%m/%Y")
    # assign() and sort_values() both return new frames, so the caller's
    # object is never mutated.
    result = data.assign(date=parsed).sort_values(['date', 'hour'])
    dates = result['date'].dt
    result["year"] = dates.year
    result["month"] = dates.month
    result["week"] = dates.isocalendar().week
    result["day"] = dates.day
    result["day_of_week"] = dates.dayofweek
    result["day_of_year"] = dates.dayofyear
    result["is_month_end"] = dates.is_month_end
    result["is_month_start"] = dates.is_month_start
    result["is_quarter_end"] = dates.is_quarter_end
    result["is_quarter_start"] = dates.is_quarter_start
    result["is_year_end"] = dates.is_year_end
    result["is_year_start"] = dates.is_year_start
    return result.drop(columns='date')

# Cleaning pipeline: normalise the headers, derive calendar features from the
# date, then one-hot encode the three categorical columns.
data.columns = clean_col_names(data.columns)
data = extract_dates(data)
data = pd.get_dummies(data=data, columns=['holiday', 'seasons', 'functioning_day'])

# Create the output folder when it is missing; to_parquet does not create
# parent directories.
os.makedirs(os.path.dirname(data_interim), exist_ok=True)
data.to_parquet(data_interim, compression="snappy")
print("File Cleaned Successfully!")

Writing ../src/prepare.py


## 3. Split Data

In [34]:
# Fraction of rows held out for testing; the first `n_train` rows form the
# training set. Because extract_dates sorted the rows by date and hour, the
# held-out rows are the latest ones.
split = 0.30
n_train = int(len(data) - len(data) * split)

In [35]:
train_path = join(path, 'processed', 'train.parquet')
test_path = join(path, 'processed', 'test.parquet')

In [36]:
# Positional split: the first n_train rows go to train, the remainder to test,
# each re-indexed from 0 before being written.
train_part = data.iloc[:n_train].reset_index(drop=True)
test_part = data.iloc[n_train:].reset_index(drop=True)
train_part.to_parquet(train_path, compression="snappy")
test_part.to_parquet(test_path, compression="snappy")

In [37]:
%%writefile ../src/split_data.py

import pandas as pd, os

# Split the cleaned data into train and test parquet files.
# All paths are relative to the current working directory.
data_interim = os.path.join("data", "interim", "clean_data.parquet")
processed_dir = os.path.join("data", 'processed')
train_path = os.path.join(processed_dir, 'train.parquet')
test_path = os.path.join(processed_dir, 'test.parquet')

data = pd.read_parquet(data_interim)

# Fraction of rows held out for testing; the first n_train rows are the
# training set and the remaining rows the test set (no shuffling).
split = 0.30
n_train = int(len(data) - len(data) * split)

# Create the output folder when it is missing; to_parquet does not create
# parent directories.
os.makedirs(processed_dir, exist_ok=True)

# .iloc makes the positional row slicing explicit.
data.iloc[:n_train].reset_index(drop=True).to_parquet(train_path, compression="snappy")
data.iloc[n_train:].reset_index(drop=True).to_parquet(test_path, compression="snappy")

print("File Partitioned Successfully!")

Writing ../src/split_data.py


## 4. Train Model

In [38]:
X_train = pd.read_parquet(train_path)
y_train = X_train.pop('rented_bike_count')

In [39]:
seed = 42
n_est = 100

In [40]:
%%time

rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

CPU times: user 3.66 s, sys: 5.22 ms, total: 3.67 s
Wall time: 3.69 s


In [41]:
rf.predict(X_train.values)[:10]

array([236.39, 216.06, 159.82, 102.32,  80.01,  91.96, 166.18, 422.35,
       843.38, 447.64])

In [42]:
with open('../models/rf_model.pkl', "wb") as fd:
    pickle.dump(rf, fd)

In [43]:
s3_client.upload_file('../models/rf_model.pkl', "september22-workshop", "models/rf_model.pkl")

In [50]:
%%writefile ../src/train_model.py

import pandas as pd, os, numpy as np, pickle
from sklearn.ensemble import RandomForestRegressor

# Train a random forest on the processed training set and pickle it.
# All paths are relative to the current working directory.
train_path = os.path.join("data", 'processed', 'train.parquet')
model_path = os.path.join('models', 'rf_model.pkl')

# pop() removes the target column from the frame, leaving only the features.
X_train = pd.read_parquet(train_path)
y_train = X_train.pop('rented_bike_count')

seed = 42
n_est = 100

rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

# Create the models folder when it is missing; open() does not create
# parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as fd:
    pickle.dump(rf, fd)

print("File Trained Successfully!")

Overwriting ../src/train_model.py


## 5. Evaluate

In [45]:
with open('../models/rf_model.pkl', "rb") as fd:
    model = pickle.load(fd)

In [46]:
X_test = pd.read_parquet(test_path)
y_test = X_test.pop('rented_bike_count')

In [47]:
predictions = model.predict(X_test.values)
predictions[:10]

array([ 701.82,  711.5 ,  674.75,  722.74,  879.71, 1303.26, 2228.5 ,
       1901.35, 1787.42, 1761.24])

In [48]:
mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
# R^2 from the predictions already computed; model.score() would call
# predict() on the whole test set a second time to produce the same value.
r2_score = metrics.r2_score(y_test.values, predictions)

In [49]:
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Square Error: {rmse:.2f}")
print(f"R^2: {r2_score:.3f}")

Mean Absolute Error: 267.47
Root Mean Square Error: 394.82
R^2: 0.629


In [51]:
with open(join('..', "reports", 'metrics.json'), "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

In [52]:
%%writefile ../src/evaluate.py

import pandas as pd, os, numpy as np, pickle, json, pprint
import sklearn.metrics as metrics

# Evaluate the pickled model on the held-out test set and write the metrics
# to a JSON report. All paths are relative to the current working directory.
model_path = os.path.join('models', 'rf_model.pkl')
metrics_path = os.path.join("reports", 'metrics.json')
test_path = os.path.join("data", 'processed', 'test.parquet')

# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open(model_path, "rb") as fd:
    model = pickle.load(fd)

# pop() removes the target column from the frame, leaving only the features.
X_test = pd.read_parquet(test_path)
y_test = X_test.pop('rented_bike_count')

predictions = model.predict(X_test.values)

mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
# R^2 from the predictions already computed; model.score() would call
# predict() on the whole test set a second time to produce the same value.
r2_score = metrics.r2_score(y_test.values, predictions)

# Build the report once so the file and the console output cannot diverge.
results = {"MAE": mae, "RMSE": rmse, "R^2": r2_score}

# Create the reports folder when it is missing; open() does not create
# parent directories.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as fd:
    json.dump(results, fd, indent=4)

print("File Evaluated Successfully!")
pprint.pprint(results)

Writing ../src/evaluate.py
