# Exploration

## 1. Get The Data

In [1]:
!pwd

/home/ramonperez/Tresors/datascience/projects/bikes_ml/notebooks


In [2]:
import urllib.request, os

In [3]:
# Move from notebooks/ up to the project root so the relative paths used below
# ('data/...', 'src/...', 'models/...') resolve. The guard keeps the cell
# idempotent: re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [5]:
!pwd

/home/ramonperez/Tresors/datascience/projects/bikes_ml


In [6]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join('data', 'raw')
filename = 'SeoulBikeData.csv'

In [7]:
# urlretrieve does not create missing directories, so make sure data/raw exists
# (src/get_data.py performs the same step before downloading).
os.makedirs(path, exist_ok=True)
urllib.request.urlretrieve(url, os.path.join(path, filename))

('data/raw/SeoulBikeData.csv', <http.client.HTTPMessage at 0x7f85b42258b0>)

In [8]:
%%writefile src/get_data.py

import urllib.request, os

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv'
path = os.path.join('data', 'raw')
filename = 'SeoulBikeData.csv'

if not os.path.exists(path): os.makedirs(path)
        
urllib.request.urlretrieve(url, os.path.join(path, filename))

Writing src/get_data.py


In [None]:
%%bash

# Commit the DVC pointer file and the .gitignore next to it rather than the CSV itself.
# NOTE(review): both files are normally produced by `dvc add data/raw/SeoulBikeData.csv`,
# which is not shown in this notebook -- confirm it was run before this cell.
git add data/raw/.gitignore data/raw/SeoulBikeData.csv.dvc
git commit -m "Start Tracking Data"
# NOTE(review): `git remote add` reports an error when a remote named "origin"
# already exists, so this line only succeeds the first time the cell is run.
git remote add origin https://github.com/ramonpzg/bikes_ml.git
git push -u origin master

## Preparation

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
data = pd.read_csv('data/raw/SeoulBikeData.csv', encoding='iso-8859-1')

In [None]:
data.head().T

In [None]:
# The raw file stores dates as DD/MM/YYYY. Without an explicit format pandas reads
# ambiguous values such as 01/12/2017 month-first (12 January instead of
# 1 December), and recent pandas versions raise on the first day greater than 12.
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

In [None]:
# Expand the parsed date into calendar features, then drop the raw column.
dates = data['Date'].dt

data["Year"] = dates.year
data["Month"] = dates.month
data["Week"] = dates.isocalendar().week

# The remaining features map one-to-one onto .dt attributes; each new column is
# named after the attribute with its first letter capitalised. The tuple order
# is the column order, which the positional rename further down relies on.
for attr in ('day', 'dayofweek', 'dayofyear',
             'is_month_end', 'is_month_start',
             'is_quarter_end', 'is_quarter_start',
             'is_year_end', 'is_year_start'):
    data[attr.capitalize()] = getattr(dates, attr)

data.drop(columns='Date', inplace=True)

In [None]:
data = pd.get_dummies(data=data, columns=['Holiday', 'Seasons', 'Functioning Day'])

In [None]:
# Positional rename to snake_case. pd.get_dummies appends the dummy columns in the
# order of its `columns` argument (Holiday, Seasons, Functioning Day), with the
# categories of each column in sorted order. The last eight labels therefore have
# to read: Holiday / No Holiday, Autumn / Spring / Summer / Winter, No / Yes.
# The previous list put the season labels on the holiday columns and vice versa.
data.columns = ['rented_bike_count', 'hour', 'temperature', 'humidity', 'wind_speed', 'visibility',
                'dew_point_temperature', 'solar_radiation', 'rainfall', 'snowfall', 'year',
                'month', 'week', 'day', 'dayofweek', 'dayofyear', 'is_month_end', 'is_month_start',
                'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start',
                'holiday_yes', 'holiday_no',
                'seasons_autumn', 'seasons_spring', 'seasons_summer', 'seasons_winter',
                'functioning_day_no', 'functioning_day_yes']

In [None]:
split = 0.30

train_path = os.path.join('data', 'processed', 'train.csv')
test_path = os.path.join('data', 'processed', 'test.csv')

In [None]:
# Hold out the last `split` fraction of rows (no shuffling) as the test set.
n_train = int(len(data) - len(data) * split)

df_train = data[:n_train].reset_index(drop=True)
df_test = data[n_train:].reset_index(drop=True)

# to_csv does not create missing directories, so make sure the targets exist.
os.makedirs(os.path.dirname(train_path), exist_ok=True)
os.makedirs(os.path.dirname(test_path), exist_ok=True)

df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

In [None]:
%%bash

dvc add data/processed/train.csv data/processed/test.csv
dvc push

In [None]:
%%writefile src/prepare.py

import pandas as pd
import os, sys

split = 0.30

raw_data_path = sys.argv[1]
train_path = os.path.join('data', 'processed', 'train.csv')
test_path = os.path.join('data', 'processed', 'test.csv')

# read the data
data = pd.read_csv(raw_data_path, encoding='iso-8859-1')

# add date vars
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(['Date', 'Hour'], inplace=True)
data["Year"] = data['Date'].dt.year
data["Month"] = data['Date'].dt.month
data["Week"] = data['Date'].dt.isocalendar().week
data["Day"] = data['Date'].dt.day
data["Dayofweek"] = data['Date'].dt.dayofweek
data["Dayofyear"] = data['Date'].dt.dayofyear
data["Is_month_end"] = data['Date'].dt.is_month_end
data["Is_month_start"] = data['Date'].dt.is_month_start
data["Is_quarter_end"] = data['Date'].dt.is_quarter_end
data["Is_quarter_start"] = data['Date'].dt.is_quarter_start
data["Is_year_end"] = data['Date'].dt.is_year_end
data["Is_year_start"] = data['Date'].dt.is_year_start
data.drop('Date', axis=1, inplace=True)

# add dummies
data = pd.get_dummies(data=data, columns=['Holiday', 'Seasons', 'Functioning Day'])

# Normalize columns
data.columns = ['rented_bike_count', 'hour', 'temperature', 'humidity', 'wind_speed', 'visibility', 
                'dew_point_temperature', 'solar_radiation', 'rainfall', 'snowfall', 'year', 
                'month', 'week', 'day', 'dayofweek', 'dayofyear', 'is_month_end', 'is_month_start',
                'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start',
                'seasons_autumn', 'seasons_winter', 'seasons_summer', 'seasons_spring',
                'holiday_yes', 'holiday_no', 'functioning_day_no', 'functioning_day_yes']

n_train = int(len(data) - len(data) * split)

data[:n_train].reset_index(drop=True).to_csv(train_path, index=False)
data[n_train:].reset_index(drop=True).to_csv(test_path, index=False)

In [None]:
%%writefile .gitignore

.ipynb_checkpoints
new_user_credentials.csv

In [None]:
%%bash

git add .
git commit -m "Preparation stage completed"
git push

## Training

In [None]:
from sklearn.ensemble import RandomForestRegressor
import pickle

In [None]:
X_train = pd.read_csv('data/processed/train.csv')
y_train = X_train.pop('rented_bike_count')

In [None]:
seed = 42
n_est = 100

In [None]:
rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

In [None]:
# open(..., "wb") fails if models/ is missing, so create it first.
os.makedirs('models', exist_ok=True)
with open('models/rf_model.pkl', "wb") as fd:
    pickle.dump(rf, fd)

In [None]:
rf.predict(X_train.values)[:10]

In [None]:
%%writefile src/train.py

import os, pickle, sys
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = RandomForestRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

In [None]:
%%bash

dvc add models/rf_model.pkl
dvc push

In [None]:
%%bash

git add .
git commit -m "Training stage completed"
git push

## Evaluate

In [None]:
import sklearn.metrics as metrics
import numpy as np, json

In [None]:
with open('models/rf_model.pkl', "rb") as fd:
    model = pickle.load(fd)

In [None]:
X_test = pd.read_csv('data/processed/test.csv')
y_test = X_test.pop('rented_bike_count')

In [None]:
predictions = model.predict(X_test.values)
predictions[:10]

In [None]:
mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
# Compute R^2 from the predictions already in hand; model.score() would run
# predict() on X_test a second time to obtain the same coefficient.
r2_score = metrics.r2_score(y_test.values, predictions)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Square Error: {rmse:.2f}")
print(f"R^2: {r2_score:.3f}")

In [None]:
# open(..., "w") fails if metrics/ is missing, so create it first.
os.makedirs('metrics', exist_ok=True)
with open(os.path.join('metrics', 'metrics.json'), "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

In [None]:
%%writefile src/evaluate.py

import json, os, pickle, sys, pandas as pd, numpy as np
import sklearn.metrics as metrics

model_file = sys.argv[1]
test_file = os.path.join(sys.argv[2], "test.csv")
scores_file = os.path.join('metrics', 'metrics.json')

with open(model_file, "rb") as fd:
    model = pickle.load(fd)

X_test = pd.read_csv(test_file)
y_test = X_test.pop('rented_bike_count')

predictions = model.predict(X_test.values)

mae = metrics.mean_absolute_error(y_test.values, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test.values, predictions))
r2_score = model.score(X_test.values, y_test.values)

with open(scores_file, "w") as fd:
    json.dump({"MAE": mae, "RMSE": rmse, "R^2":r2_score}, fd, indent=4)

In [None]:
%%bash

git add .
git commit -m "Evaluation stage completed"
git push

## Pipelines

In [None]:
%%bash

# Remove the individual .dvc tracking files; the pipeline stages created in the
# following cells declare the same paths as stage outputs instead.
dvc remove data/raw/SeoulBikeData.csv.dvc \
           data/processed/train.csv.dvc \
           data/processed/test.csv.dvc \
           models/rf_model.pkl.dvc

In [None]:
%%bash

# Stage "get_data": depends on the download script and declares the raw CSV as its output.
dvc run -n get_data \
-d src/get_data.py \
-o data/raw/SeoulBikeData.csv \
python src/get_data.py

In [None]:
%%bash

# Stage "prepare": depends on the prepare script and the raw CSV; outputs the
# train and test CSVs. The raw CSV path is also passed to the script as its argument.
dvc run -n prepare \
-d src/prepare.py -d data/raw/SeoulBikeData.csv \
-o data/processed/train.csv -o data/processed/test.csv \
python src/prepare.py data/raw/SeoulBikeData.csv

In [None]:
%%bash

# Stage "train": depends on the training script and train.csv; outputs the pickled model.
dvc run -n train \
-d src/train.py -d data/processed/train.csv \
-o models/rf_model.pkl \
python src/train.py data/processed/train.csv

In [None]:
%%bash

# Stage "evaluate": depends on the evaluate script, the model and the whole
# data/processed directory (the script itself only reads test.csv from it).
# -M registers metrics/metrics.json as a metrics file kept out of the DVC cache.
dvc run -n evaluate \
-d src/evaluate.py -d models/rf_model.pkl -d data/processed \
-M metrics/metrics.json \
python src/evaluate.py models/rf_model.pkl data/processed

In [None]:
!dvc dag

In [None]:
!rm dvc.lock data/raw/SeoulBikeData.csv data/processed/train.csv data/processed/test.csv

In [None]:
!dvc repro

In [None]:
!git status

In [None]:
%%bash

git add .
git commit -m "Pipeline Finished"
git push

## CI/CD Pipelines with CML



Create a `requirements.txt` file

In [None]:
%%writefile requirements.txt

pandas
scikit-learn
numpy
xgboost
lightgbm
catboost

First, add your AWS credentials to GitHub as repository secrets:

1. Go to Settings > Secrets and click on **New repository secret**
2. On the **Name** box write **AWS_ACCESS_KEY_ID**
3. On the **Value** box write the access key you created earlier
4. Repeat steps 1–3 for **AWS_SECRET_ACCESS_KEY**, using your secret access key as the value



In [None]:
%%writefile .github/workflows/cml.yaml

# GitHub Actions workflow: on every push, re-run the DVC pipeline inside the CML
# container and build a metrics report compared against the master branch.
name: bikes-pipeline-test
on: push
jobs:
  run:
    runs-on: [ubuntu-latest]
    container: docker://dvcorg/cml:0-dvc2-base1
    steps:
      - uses: actions/checkout@v2
      - name: cml_run 
        env:
          # The token and the AWS credentials are read from the repository's secrets.
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |

          pip install -r requirements.txt
          
          # Reproduce the pipeline and push the resulting artifacts to the DVC remote.
          dvc repro
          dvc push
          git fetch --prune


          # Build a markdown report of the metric changes relative to master
          # and hand it to cml-send-comment.
          echo "# CML Report" > report.md
          dvc metrics diff --show-md master >> report.md
          cml-send-comment report.md

In [None]:
!git status

In [None]:
%%bash

git add .
git commit -m "Adding CML CI/CD Pipeline"
git push

# Experiments

In [None]:
!git checkout -b "exp1-xgb"

In [None]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from xgboost import XGBRFRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = XGBRFRegressor(n_estimators=n_est, seed=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

In [None]:
%%bash

git add .
git commit -m "Testing XGBoost"
git push --set-upstream origin exp1-xgb
git push

In [None]:
!git checkout -b "exp2-lgbm"

In [None]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from lightgbm import LGBMRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = LGBMRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

In [None]:
%%bash

git add .
git commit -m "Testing LightGBM"
git push --set-upstream origin exp2-lgbm
git push

In [None]:
!git checkout -b "exp3-cat"

In [None]:
%%writefile src/train.py

import os, pickle, sys, pandas as pd
from catboost import CatBoostRegressor

input_data = sys.argv[1]
output = os.path.join('models', 'rf_model.pkl')
seed = 42
n_est = 100

X_train = pd.read_csv(input_data)
y_train = X_train.pop('rented_bike_count')

rf = CatBoostRegressor(n_estimators=n_est, random_state=seed)
rf.fit(X_train.values, y_train.values)

with open(output, "wb") as fd:
    pickle.dump(rf, fd)

In [None]:
%%bash

git add .
git commit -m "Testing CatBoost"
git push --set-upstream origin exp3-cat
git push

In [None]:
# Sanity check: show the smallest and largest values of the `year` column in the
# in-memory frame. Relies on `data` still being defined from the Preparation section.
data.year.min(), data.year.max()