# Exploration

## 1. Get The Data

In [None]:
import urllib.request, os

In [None]:
os.chdir('..')

In [None]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join('data', 'raw')
filename = 'SeoulBikeData.csv'

In [None]:
urllib.request.urlretrieve(url, os.path.join(path, filename))

('../data/raw/SeoulBikeData.csv', <http.client.HTTPMessage at 0x7f3cac3540a0>)

In [58]:
%%writefile src/get_data.py

import urllib.request, os

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join('data', 'raw')
filename = 'SeoulBikeData.csv'

if not os.path.exists(path):
        os.makedirs(path)
        
urllib.request.urlretrieve(url, os.path.join(path, filename))

Overwriting src/get_data.py


In [5]:
os.chdir('..')

In [6]:
!pwd

/home/ramonperez/Tresors/datascience/projects/bikes_ml


In [8]:
%%bash

git add data/raw/.gitignore data/raw/SeoulBikeData.csv.dvc
git commit -m "Start Tracking Data"
git remote add origin https://github.com/ramonpzg/bikes_ml.git
git push -u origin master

Branch 'master' set up to track remote branch 'master' from 'origin'.


To https://github.com/ramonpzg/bikes_ml.git
 * [new branch]      master -> master


## Preparation

In [16]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [12]:
data = pd.read_csv('data/raw/SeoulBikeData.csv', encoding='iso-8859-1')

In [13]:
data.head().T

Unnamed: 0,0,1,2,3,4
Date,01/12/2017,01/12/2017,01/12/2017,01/12/2017,01/12/2017
Rented Bike Count,254,204,173,107,78
Hour,0,1,2,3,4
Temperature(°C),-5.2,-5.5,-6.0,-6.2,-6.0
Humidity(%),37,38,39,40,36
Wind speed (m/s),2.2,0.8,1.0,0.9,2.3
Visibility (10m),2000,2000,2000,2000,2000
Dew point temperature(°C),-17.6,-17.6,-17.7,-17.6,-18.6
Solar Radiation (MJ/m2),0.0,0.0,0.0,0.0,0.0
Rainfall(mm),0.0,0.0,0.0,0.0,0.0


In [14]:
data['Date'] = pd.to_datetime(data['Date'])

In [15]:
data["Year"] = data['Date'].dt.year
data["Month"] = data['Date'].dt.month
data["Week"] = data['Date'].dt.isocalendar().week
data["Day"] = data['Date'].dt.day
data["Dayofweek"] = data['Date'].dt.dayofweek
data["Dayofyear"] = data['Date'].dt.dayofyear
data["Is_month_end"] = data['Date'].dt.is_month_end
data["Is_month_start"] = data['Date'].dt.is_month_start
data["Is_quarter_end"] = data['Date'].dt.is_quarter_end
data["Is_quarter_start"] = data['Date'].dt.is_quarter_start
data["Is_year_end"] = data['Date'].dt.is_year_end
data["Is_year_start"] = data['Date'].dt.is_year_start
data.drop('Date', axis=1, inplace=True)

In [19]:
data = pd.get_dummies(data=data, columns=['Holiday', 'Seasons', 'Functioning Day'])

In [20]:
data.columns = ['rented_bike_count', 'hour', 'temperature', 'humidity', 'wind_speed', 'visibility', 
                'dew_point_temperature', 'solar_radiation', 'rainfall', 'snowfall', 'year', 
                'month', 'week', 'day', 'dayofweek', 'dayofyear', 'is_month_end', 'is_month_start',
                'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start',
                'seasons_autumn', 'seasons_winter', 'seasons_summer', 'seasons_spring',
                'holiday_yes', 'holiday_no', 'functioning_day_no', 'functioning_day_yes']

In [24]:
split = 0.30

train_path = os.path.join('data', 'processed', 'train.csv')
test_path = os.path.join('data', 'processed', 'test.csv')

In [25]:
n_train = int(len(data) - len(data) * split)

df_train = data[:n_train].reset_index(drop=True)
df_test = data[n_train:].reset_index(drop=True)

df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

In [70]:
%%bash

dvc add data/processed/train.csv data/processed/test.csv
dvc push


To track the changes with git, run:

	git add data/processed/.gitignore data/processed/train.csv.dvc data/processed/test.csv.dvc
2 files pushed


In [26]:
%%writefile src/prepare.py

import pandas as pd
import os, sys

split = 0.30

raw_data_path = sys.argv[1]
train_path = os.path.join('data', 'processed', 'train.csv')
test_path = os.path.join('data', 'processed', 'test.csv')

# read the data
data = pd.read_csv(raw_data_path, encoding='iso-8859-1')

# add date vars
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(['Date', 'Hour'], inplace=True)
data["Year"] = data['Date'].dt.year
data["Month"] = data['Date'].dt.month
data["Week"] = data['Date'].dt.isocalendar().week
data["Day"] = data['Date'].dt.day
data["Dayofweek"] = data['Date'].dt.dayofweek
data["Dayofyear"] = data['Date'].dt.dayofyear
data["Is_month_end"] = data['Date'].dt.is_month_end
data["Is_month_start"] = data['Date'].dt.is_month_start
data["Is_quarter_end"] = data['Date'].dt.is_quarter_end
data["Is_quarter_start"] = data['Date'].dt.is_quarter_start
data["Is_year_end"] = data['Date'].dt.is_year_end
data["Is_year_start"] = data['Date'].dt.is_year_start
data.drop('Date', axis=1, inplace=True)

# add dummies
data = pd.get_dummies(data=data, columns=['Holiday', 'Seasons', 'Functioning Day'])

# Normalize columns
data.columns = ['rented_bike_count', 'hour', 'temperature', 'humidity', 'wind_speed', 'visibility', 
                'dew_point_temperature', 'solar_radiation', 'rainfall', 'snowfall', 'year', 
                'month', 'week', 'day', 'dayofweek', 'dayofyear', 'is_month_end', 'is_month_start',
                'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start',
                'seasons_autumn', 'seasons_winter', 'seasons_summer', 'seasons_spring',
                'holiday_yes', 'holiday_no', 'functioning_day_no', 'functioning_day_yes']

n_train = int(len(data) - len(data) * split)

data[:n_train].reset_index(drop=True).to_csv(train_path, index=False)
data[n_train:].reset_index(drop=True).to_csv(test_path, index=False)

Writing prepare.py


In [27]:
%%writefile .gitignore

.ipynb_checkpoints
new_user_credentials.csv

Writing .gitignore


In [71]:
%%bash

git add .
git commit -m "Preparation stage completed"
git push

[master da046c3] Preparation stage completed
 8 files changed, 428 insertions(+), 9 deletions(-)
 create mode 100644 data/processed/.gitignore
 create mode 100644 data/processed/test.csv.dvc
 create mode 100644 data/processed/train.csv.dvc
 create mode 100644 dvc.lock
 create mode 100644 dvc.yaml
 create mode 100644 metrics/metrics.json


To https://github.com/ramonpzg/bikes_ml.git
   5b5ba3f..da046c3  master -> master


## Training

In [29]:
from sklearn.ensemble import RandomForestRegressor
import pickle

In [30]:
X_train = pd.read_csv('data/processed/train.csv')
y_train = X_train.pop('rented_bike_count')

In [31]:
seed = 42
n_est = 100

In [32]:
rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

RandomForestRegressor(random_state=42)

In [33]:
with open('models/rf_model.pkl', "wb") as fd:
    pickle.dump(rf, fd)

In [38]:
rf.predict(X_train.values)[:10]

array([226.38, 211.76, 165.48,  95.93,  80.49,  90.37, 150.53, 476.47,
       795.83, 447.04])

In [35]:
%%writefile src/train.py

import os, pickle, sys
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

Overwriting src/train.py


In [40]:
%%bash

dvc add models/rf_model.pkl
dvc push


To track the changes with git, run:

	git add models/.gitignore models/rf_model.pkl.dvc
1 file pushed
[master eb2504f] Training stage completed
 4 files changed, 1171 insertions(+), 2826 deletions(-)
 create mode 100644 models/.gitignore
 create mode 100644 models/rf_model.pkl.dvc
 rewrite notebooks/exploration.ipynb (72%)
 create mode 100644 src/train.py


To https://github.com/ramonpzg/bikes_ml.git
   6950930..eb2504f  master -> master


In [None]:
%%bash

git add .
git commit -m "Training stage completed"
git push

## Evaluate

In [63]:
import sklearn.metrics as metrics
import numpy as np, json

In [42]:
with open('models/rf_model.pkl', "rb") as fd:
    model = pickle.load(fd)

In [43]:
X_test = pd.read_csv('data/processed/test.csv')
y_test = X_test.pop('rented_bike_count')

In [45]:
predictions = model.predict(X_test.values)
predictions[:10]

array([ 678.43,  690.88,  655.57,  721.8 ,  940.68, 1380.16, 2060.02,
       1961.72, 1769.  , 1772.99])

In [52]:
mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
r2_score = model.score(X_test.values, y_test.values)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Square Error: {rmse:.2f}")
print(f"R^2: {r2_score:.3f}")

Mean Absolute Error: 249.12
Root Mean Square Error: 373.88
R^2: 0.667


In [64]:
with open(os.path.join('metrics', 'metrics.json'), "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

In [53]:
%%writefile src/evaluate.py

import json, os, pickle, sys, pandas as pd, numpy as np
import sklearn.metrics as metrics

model_file = sys.argv[1]
test_file = os.path.join(sys.argv[2], "test.csv")
scores_file = os.path.join('metrics', 'metrics.json')

with open(model_file, "rb") as fd:
    model = pickle.load(fd)

X_test = pd.read_csv(test_file)
y_test = X_test.pop('rented_bike_count')

predictions = model.predict(X_test.values)

mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
r2_score = model.score(X_test.values, y_test.values)

with open(scores_file, "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

Writing src/evaluate.py


In [54]:
%%bash

git add .
git commit -m "Evaluation stage completed"
git push

[master 179bf62] Evaluation stage completed
 2 files changed, 201 insertions(+), 6 deletions(-)
 create mode 100644 src/evaluate.py


To https://github.com/ramonpzg/bikes_ml.git
   eb2504f..179bf62  master -> master


## Pipelines

In [72]:
%%bash

dvc remove data/raw/SeoulBikeData.csv.dvc \
           data/processed/train.csv.dvc \
           data/processed/test.csv.dvc \
           models/rf_model.pkl.dvc

In [74]:
%%bash

dvc run -n get_data \
-d src/get_data.py \
-o data/raw/SeoulBikeData.csv \
python src/get_data.py

Running stage 'get_data':
> python src/get_data.py
Creating 'dvc.yaml'
Adding stage 'get_data' in 'dvc.yaml'
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add data/raw/.gitignore dvc.yaml dvc.lock


In [75]:
%%bash

dvc run -n prepare \
-d src/prepare.py -d data/raw/SeoulBikeData.csv \
-o data/processed/train.csv -o data/processed/test.csv \
python src/prepare.py data/raw/SeoulBikeData.csv

Running stage 'prepare':
> python src/prepare.py data/raw/SeoulBikeData.csv
Adding stage 'prepare' in 'dvc.yaml'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.yaml data/processed/.gitignore dvc.lock


In [77]:
%%bash

dvc run -n train \
-d src/train.py -d data/processed/train.csv \
-o models/rf_model.pkl \
python src/train.py data/processed/train.csv

Running stage 'train':
> python src/train.py data/processed/train.csv
Adding stage 'train' in 'dvc.yaml'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.yaml models/.gitignore dvc.lock


In [79]:
%%bash

dvc run -n evaluate \
-d src/evaluate.py -d models/rf_model.pkl -d data/processed \
-M metrics/metrics.json \
python src/evaluate.py models/rf_model.pkl data/processed

Running stage 'evaluate':
> python src/evaluate.py models/rf_model.pkl data/processed
Adding stage 'evaluate' in 'dvc.yaml'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.lock dvc.yaml


In [80]:
!dvc dag

        +----------+      
        | get_data |      
        +----------+      
              *           
              *           
              *           
         +---------+      
         | prepare |      
         +---------+      
         **        **     
       **            *    
      *               **  
+-------+               * 
| train |             **  
+-------+            *    
         **        **     
           **    **       
             *  *         
        +----------+      
        | evaluate |      
        +----------+      [0m

In [84]:
!rm dvc.lock data/raw/SeoulBikeData.csv data/processed/train.csv data/processed/test.csv

In [85]:
!dvc repro

Stage 'get_data' is cached - skipping run, checking out outputs       core[39m>
Generating lock file 'dvc.lock'                                                 
Updating lock file 'dvc.lock'

Stage 'prepare' is cached - skipping run, checking out outputs
Updating lock file 'dvc.lock'                                                   

Stage 'train' is cached - skipping run, checking out outputs
Updating lock file 'dvc.lock'                                                   

Stage 'evaluate' is cached - skipping run, checking out outputs                 
Updating lock file 'dvc.lock'                                                   

To track the changes with git, run:

	git add dvc.lock
Use `dvc push` to send your updates to remote storage.
[0m

In [86]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    data/processed/test.csv.dvc[m
	[31mdeleted:    data/processed/train.csv.dvc[m
	[31mdeleted:    data/raw/SeoulBikeData.csv.dvc[m
	[31mmodified:   dvc.lock[m
	[31mmodified:   dvc.yaml[m
	[31mmodified:   metrics/metrics.json[m
	[31mdeleted:    models/rf_model.pkl.dvc[m
	[31mmodified:   notebooks/exploration.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [87]:
%%bash

git add .
git commit -m "Pipeline Finished"
git push

[master 352fba5] Pipeline Finished
 8 files changed, 237 insertions(+), 129 deletions(-)
 delete mode 100644 data/processed/test.csv.dvc
 delete mode 100644 data/processed/train.csv.dvc
 delete mode 100644 data/raw/SeoulBikeData.csv.dvc
 delete mode 100644 models/rf_model.pkl.dvc


To https://github.com/ramonpzg/bikes_ml.git
   da046c3..352fba5  master -> master


## CI/CD Pipelines with CML



Create a `requirements.txt` file

In [89]:
%%writefile requirements.txt

pandas
scikit-learn
numpy
xgboost
lightgbm
catboost

Writing requirements.txt


First add your secrets to github

1. Go to Settings > Secrets and click on **New repository secret**
2. On the **Name** box write **AWS_ACCESS_KEY_ID**
3. On the **Value** box write the access key you created earlier
4. Do the same as above for your **AWS_SECRET_ACCESS_KEY**



In [88]:
%%writefile .github/workflows/cml.yaml

name: bikes-pipeline-test
on: push
jobs:
  run:
    runs-on: [ubuntu-latest]
    container: docker://dvcorg/cml:0-dvc2-base1
    steps:
      - uses: actions/checkout@v2
      - name: cml_run 
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |

          pip install -r requirements.txt
          
          dvc repro
          dvc push
          git fetch --prune


          echo "# CML Report" > report.md
          dvc metrics diff --show-md master >> report.md
          cml-send-comment report.md

Writing .github/workflows/cml.yaml


In [90]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   notebooks/exploration.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.github/[m
	[31mREADME.md[m
	[31mrequirements.txt[m

no changes added to commit (use "git add" and/or "git commit -a")


In [91]:
%%bash

git add .
git commit -m "Adding CML CI/CD Pipeline"
git push

[master 22da554] Adding CML CI/CD Pipeline
 4 files changed, 189 insertions(+), 16 deletions(-)
 create mode 100644 .github/workflows/cml.yaml
 create mode 100644 README.md
 create mode 100644 requirements.txt


To https://github.com/ramonpzg/bikes_ml.git
   352fba5..22da554  master -> master


# Experiments

In [92]:
!git checkout -b "exp1-xgb"

Switched to a new branch 'exp1-xgb'


In [93]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from xgboost import XGBRFRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = XGBRFRegressor(n_estimators=n_est, seed=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

Overwriting src/train.py


In [94]:
%%bash

git add .
git commit -m "Testing XGBoost"
git push --set-upstream origin exp1-xgb
git push

[exp1-xgb a5ef829] Testing XGBoost
 2 files changed, 122 insertions(+), 20 deletions(-)
Branch 'exp1-xgb' set up to track remote branch 'exp1-xgb' from 'origin'.


remote: 
remote: Create a pull request for 'exp1-xgb' on GitHub by visiting:        
remote:      https://github.com/ramonpzg/bikes_ml/pull/new/exp1-xgb        
remote: 
To https://github.com/ramonpzg/bikes_ml.git
 * [new branch]      exp1-xgb -> exp1-xgb
Everything up-to-date


In [95]:
!git checkout -b "exp2-lgbm"

Switched to a new branch 'exp2-lgbm'


In [96]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from lightgbm import LGBMRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = LGBMRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

Overwriting src/train.py


In [97]:
%%bash

git add .
git commit -m "Testing LightGBM"
git push --set-upstream origin exp2-lgbm
git push

[exp2-lgbm 7b688c6] Testing LightGBM
 2 files changed, 80 insertions(+), 12 deletions(-)
Branch 'exp2-lgbm' set up to track remote branch 'exp2-lgbm' from 'origin'.


remote: 
remote: Create a pull request for 'exp2-lgbm' on GitHub by visiting:        
remote:      https://github.com/ramonpzg/bikes_ml/pull/new/exp2-lgbm        
remote: 
To https://github.com/ramonpzg/bikes_ml.git
 * [new branch]      exp2-lgbm -> exp2-lgbm
Everything up-to-date


In [99]:
!git checkout -b "exp3-cat"

Switched to a new branch 'exp3-cat'


In [100]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from catboost import CatBoostRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = CatBoostRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

Overwriting src/train.py


In [101]:
%%bash

git add .
git commit -m "Testing CatBoost"
git push --set-upstream origin exp3-cat
git push

[exp3-cat 59742a0] Testing CatBoost
 2 files changed, 38 insertions(+), 14 deletions(-)
Branch 'exp3-cat' set up to track remote branch 'exp3-cat' from 'origin'.


remote: 
remote: Create a pull request for 'exp3-cat' on GitHub by visiting:        
remote:      https://github.com/ramonpzg/bikes_ml/pull/new/exp3-cat        
remote: 
To https://github.com/ramonpzg/bikes_ml.git
 * [new branch]      exp3-cat -> exp3-cat
Everything up-to-date
