In [1]:
import os
import pickle
import pandas as pd
import subprocess

# MAKE_PACKAGE gates every notebook-only step (answers, packaging, docker).
# get_ipython() exists only inside an IPython/Jupyter session, so a NameError
# means this file is being executed as a plain script from the console.
try:
    get_ipython()
except NameError:
    MAKE_PACKAGE = False
else:
    MAKE_PACKAGE = True

# Notebook-only environment preparation; skipped when run as a plain script.
if MAKE_PACKAGE:
    # Report the currently installed scikit-learn and Python versions.
    !pip freeze | grep scikit-learn
    !python -V
    # Pin scikit-learn so it matches the version that was used to pickle model.bin.
    !pip install -U scikit-learn==1.5.0
    # The Docker CLI is needed to build the image for the Q6 answer.
    # The host's /var/run/docker.sock is mounted into the Jupyter environment,
    # so the CLI installed here talks to the host's Docker service.
    !apt-get update
    !apt-get install -y docker.io

[0mscikit-learn==1.5.0
Python 3.11.6
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor bridge-utils containerd dmsetup dns-root-data dnsmasq-base iproute2
  iptables libatm1 libbpf0 libcap2-bin libdevmapper1.02.1 libip4tc2 libip6tc2
  libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libpam-cap
  libxtables12 netbase netcat netcat-openbsd pigz runc ubuntu-fan
Suggested packages:
  apparmor-profiles-extra apparmor-utils ifupdown aufs-tools btrfs-progs
  cgroupfs-mount | cgroup-lite debootstrap docker-doc rinse zfs-fuse
  | zfsutils iproute2-doc firewalld kmod nftables
The following NEW packages wi

## Q5.1: Parametrize the script (Output is after Q4 Answer)

In [2]:
# The scoring period comes from the YEAR and MONTH environment variables.
# If either one is absent, fall back to March 2023 and say so.
try:
    year, month = (int(os.environ[key]) for key in ("YEAR", "MONTH"))
except KeyError:
    year, month = 2023, 3
    print(f"Missing year and month input arguments. Using default values (year, month): {year, month}")

Missing year and month input arguments. Using default values (year, month): (2023, 3)


## Predict values

In [3]:
# Restore the pickled (vectorizer, model) pair used for scoring below.
# NOTE: unpickling executes arbitrary code, so model.bin must be a trusted file.
with open('model.bin', mode='rb') as model_file:
    dv, model = pickle.load(model_file)

In [4]:
# Columns fed to the vectorizer; they are stringified so they are treated as categories.
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load a yellow-taxi trip parquet file and prepare it for scoring.

    Args:
        filename: Path or URL understood by ``pandas.read_parquet``.

    Returns:
        A DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with an added ``duration`` column in minutes and the
        ``categorical`` columns converted to strings (missing values become
        ``'-1'``). The original row labels are kept, so the index has gaps
        wherever a trip was filtered out.
    """
    # Log the argument actually being read rather than a notebook global, so the
    # function works on its own and reports the right file for any input.
    print(f"Loading parquet file: {filename}")
    df = pd.read_parquet(filename)

    # Trip length in minutes, derived from the pickup/dropoff timestamps.
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    # Keep only plausible trips; .copy() detaches the slice before mutating it.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Missing ids -> -1, then int -> str so values look like '161', not '161.0'.
    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

    return df

In [5]:
# Fetch the monthly yellow-taxi file for the selected period and clean it.
DATA_URL = (
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
)
df = read_data(DATA_URL)

# One dict per trip holding only the categorical features, then vectorize and score.
dicts = df.loc[:, categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

Loading parquet file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet


## Q1: What's the standard deviation of the predicted duration for this dataset?

In [6]:
# Q1 is answered only in notebook mode: report the spread of the predictions.
if MAKE_PACKAGE:
    print(
        f"Q1: What's the standard deviation of the predicted duration for this dataset?\n  Answer: {y_pred.std()}"
    )  # 6.247

Q1: What's the standard deviation of the predicted duration for this dataset?
  Answer: 6.247488852238703


## Q2: Preparing the output

In [7]:
# Q2 (notebook mode only): write predictions with their ride ids to parquet
# and report the resulting file size.
if MAKE_PACKAGE:
    output_file = f"scoring_data_{year:04d}-{month:02d}.parquet"

    # Tag every remaining trip with an id built from the period and its row label.
    df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

    # y_pred holds one value per row of df, in row order, while df still carries
    # the gappy index left by the duration filter in read_data. Assigning the
    # ride_id Series into a frame with a fresh 0..n-1 index would align on
    # labels, attaching ids to the wrong predictions and producing NaN ids.
    # Passing the ids as a plain array keeps the pairing positional.
    df_result = pd.DataFrame({
        'prediction': y_pred,
        'ride_id': df['ride_id'].to_numpy(),
    })
    df_result.to_parquet(
        output_file,
        engine='pyarrow',
        compression=None,
        index=False
    )
    # NOTE(review): the previously recorded size (64MB) was measured with the
    # misaligned ride ids; re-run this cell to refresh the answer.
    print(f"Q2: What's the size of the output file?\n  Answer: Size of {output_file} is {os.stat(output_file).st_size/(1024*1024):.0f}MB")

Q2: What's the size of the output file?
  Answer: Size of scoring_data_2023-03.parquet is 64MB


## Q3: Creating the scoring script

In [8]:
# Q3 (notebook mode only): state the answer, then actually run the conversion
# so that starter.py exists for the later cells that execute or package it.
if MAKE_PACKAGE:
    print("Q3: Which command you need to execute to turn the notebook into a script? Answer: 'jupyter nbconvert --to script starter.ipynb'")
    # getoutput returns the command's captured output as a string; echo it here.
    print(subprocess.getoutput("jupyter nbconvert --to script starter.ipynb"))

Q3: Which command you need to execute to turn the notebook into a script? Answer: 'jupyter nbconvert --to script starter.ipynb'
[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 4348 bytes to starter.py


## Q4: Virtual environment

In [9]:
# Q4 (notebook mode only): install pipenv, then use it to install the pinned
# dependencies, which produces the Pipfile/Pipfile.lock read in the next cell.
if MAKE_PACKAGE:
    print(subprocess.getoutput("pip install pipenv"))
    # scikit-learn is pinned to 1.5.0 and the environment targets Python 3.10.
    # NOTE(review): 3.10 presumably matches the Docker base image used for Q6 - confirm.
    print(subprocess.getoutput("python3 -m pipenv install pandas scikit-learn==1.5.0 pyarrow --python=3.10"))

[0mCollecting pipenv
  Downloading pipenv-2024.1.0-py3-none-any.whl.metadata (19 kB)
Collecting virtualenv>=20.24.2 (from pipenv)
  Downloading virtualenv-20.26.6-py3-none-any.whl.metadata (4.5 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv>=20.24.2->pipenv)
  Downloading distlib-0.3.8-py2.py3-none-any.whl.metadata (5.1 kB)
Collecting filelock<4,>=3.12.2 (from virtualenv>=20.24.2->pipenv)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Downloading pipenv-2024.1.0-py3-none-any.whl (3.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.0 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/3.0 MB[0m [31m17.8 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.0 MB[0m [31m26.1 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m2.7/3.0 MB[0m [31m26.2

In [10]:
# Q4 (notebook mode only): report the first recorded hash of scikit-learn
# from the lock file generated by pipenv.
if MAKE_PACKAGE:
    import json

    # Pipfile.lock is JSON; json.load accepts a binary file object.
    with open("Pipfile.lock", "rb") as f:
        # Named sklearn_hash so the builtin hash() is not shadowed for later cells.
        sklearn_hash = json.load(f)["default"]["scikit-learn"]["hashes"][0]
    print(f"Q4: What's the first hash for the Scikit-Learn dependency?\n  Answer: {sklearn_hash}")  # sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c

Q4: What's the first hash for the Scikit-Learn dependency?
  Answer: sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c


## Q5.2: Parametrize the script (Answer)

In [11]:
# Notebook mode: re-run the exported script for April 2023 with the Q5 flag set
# and echo whatever it prints.
if MAKE_PACKAGE:
    print(subprocess.getoutput('YEAR=2023 MONTH=04 Q5=somevalue python3 starter.py'))
# Script mode: a non-empty Q5 environment variable asks for the mean prediction.
if os.getenv("Q5"):
    print(f"Q5: What's the mean predicted duration?\n  Answer: {y_pred.mean()}")  # 14.29

Loading parquet file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet
Q5: What's the mean predicted duration?
  Answer: 14.292282936862449


## Q6: Docker container

In [12]:
# Q6 (notebook mode only): rebuild the homework4 image from the Dockerfile in
# the current directory.
if MAKE_PACKAGE:
    # Remove any previous image first; getoutput does not raise on failure, so a
    # missing image only prints the daemon's error message.
    print(subprocess.getoutput("docker rmi homework4"))
    print(subprocess.getoutput("docker build -t homework4 ."))

Error response from daemon: No such image: homework4:latest
DEPRECATED: The legacy builder is deprecated and will be removed in a future release.
            Install the buildx component to build images with BuildKit:
            https://docs.docker.com/go/buildx/

Sending build context to Docker daemon  67.34MB

Step 1/5 : FROM agrigorev/zoomcamp-model:mlops-2024-3.10.13-slim
 ---> 13e5353db264
Step 2/5 : COPY [ "starter.py", "Pipfile", "Pipfile.lock", "./" ]
 ---> 2bd124fb7196
Step 3/5 : RUN pip install pipenv
 ---> Running in f4def3f10c1d
Collecting pipenv
  Downloading pipenv-2024.1.0-py3-none-any.whl (3.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 25.0 MB/s eta 0:00:00
Collecting packaging>=22
  Downloading packaging-24.1-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.0/54.0 kB 4.9 MB/s eta 0:00:00
Collecting virtualenv>=20.24.2
  Downloading virtualenv-20.26.6-py3-none-any.whl (6.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/

In [13]:
# Notebook mode: run the freshly built image for May 2023 with the Q6 flag set
# and echo the container's output.
if MAKE_PACKAGE:
    print(subprocess.getoutput('docker run -i --rm -e YEAR=2023 -e MONTH=05 -e Q6=somevalue homework4'))
# Script mode (inside the container): a non-empty Q6 variable asks for the mean prediction.
if os.getenv("Q6"):
    print(f"Q6: What's the mean predicted duration?\n  Answer: {y_pred.mean()}")  # 0.1917

Loading parquet file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-05.parquet
Q6: What's the mean predicted duration?
  Answer: 0.19174419265916945
