# Run inference on time to merge model trained previously


## What we did previously

In the previous [notebook](./03_model_training.ipynb) we trained machine learning models to classify a PR's `time_to_merge` into one of the 10 bins (or "classes"). We then deployed the model with the highest f1-score as a service using the model saved in s3.

## In this step

In this notebook, we are going to fetch the model that we previously trained, saved and stored in s3. We will send a payload to this model and see how it performs on the test data.
# Time to Merge Prediction Inference Service

In the previous notebook, we explored some basic machine learning models for predicting time to merge of a PR.

In [1]:
import os
import sys
import gzip
import json
import boto3
import datetime
import requests
from dotenv import load_dotenv, find_dotenv
from io import BytesIO

import joblib

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

load_dotenv(find_dotenv(), override=True)

import warnings
warnings.filterwarnings('ignore') 

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
class CephCommunication:
    """
    Class to establish communication with a ceph s3 bucket.
    It connects with the bucket and provides methods to read and write data in the parquet format.
    """

    def __init__(
        self, s3_endpoint_url, aws_access_key_id, aws_secret_access_key, s3_bucket
    ):
        self.s3_endpoint_url = s3_endpoint_url
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.s3_resource = boto3.resource(
            "s3",
            endpoint_url=self.s3_endpoint_url,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
        )
        self.bucket = s3_bucket
        ## Todo: Add try catch

    def upload_to_ceph(self, dataframe, s3_path, filename):
        """
        This helper function takes as input the data frame to be uploaded, and the output filename.
        It then saves the data frame in the defined ceph bucket.
        """
        parquet_buffer = BytesIO()
        dataframe.to_parquet(parquet_buffer)
        s3_obj = self.s3_resource.Object(self.bucket, f"{s3_path}/{filename}")
        status = s3_obj.put(Body=parquet_buffer.getvalue())
        return status

    def read_from_ceph(self, s3_path, filename):
        """
        Helper function to read from ceph and see if the saved data is correct.
        """
        buffer = BytesIO()
        s3_object = self.s3_resource.Object(self.bucket, f"{s3_path}/{filename}")
        s3_object.download_fileobj(buffer)
        df_temp = pd.read_parquet(buffer)
        return df_temp


def save_to_disk(dataframe, path, filename):
    """
    Helper function to save the dataframe
    as a parquet file to disk.
    """
    dataset_base_path = Path(path)
    dataset_base_path.mkdir(parents=True, exist_ok=True)
    dataframe.to_parquet(f"{path}/{filename}")
    return True

In [3]:
## CEPH Bucket variables
## Create a .env file on your local with the correct configs,

ORG = os.getenv("GITHUB_ORG")
REPO = os.getenv("GITHUB_REPO")

## S3 bucket credentials
s3_endpoint_url = os.getenv("S3_ENDPOINT_URL")
s3_access_key = os.getenv("AWS_ACCESS_KEY_ID")
s3_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
s3_bucket = os.getenv("S3_BUCKET")

s3_input_data_path = os.getenv("CEPH_BUCKET_PREFIX")

REMOTE = os.getenv("REMOTE")
RAW_DATA_PATH = os.path.join(
    s3_input_data_path, "srcopsmetrics/bot_knowledge", ORG, REPO, "PullRequest.json"
)

In [4]:
output = []
local_input_data_path = "../../../data/raw/GitHub/PullRequest.json.gz"
if REMOTE:
    print("getting dataset from ceph")
    s3 = boto3.resource(
        "s3",
        endpoint_url=s3_endpoint_url,
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
    )
    content = s3.Object(s3_bucket, RAW_DATA_PATH)
    file = content.get()["Body"].read().decode("utf-8")

    prs = json.loads(file)

    for pr in prs.splitlines():
        output.append(json.loads(pr))

else:
    print("getting dataset from local")
    with gzip.open(local_input_data_path, "r") as f:
        prs = json.loads(f.read().decode("utf-8"))


pr_df = pd.DataFrame(output)

getting dataset from ceph


In [5]:
# github pr dataset collected using thoth's mi-scheduler
pr_df.head()

Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,changed_files_changes,first_review_at,first_approve_at,id
0,"docs: fix simple typo, cources -> courses",There is a small typo in CONTRIBUTING.md.\n\nS...,XS,timgates42,1659226482,,,,,1,1,{},{},[],[b3207ea8bd9b6965c05e487fe6246a098571dd1f],[CONTRIBUTING.md],{'CONTRIBUTING.md': 2},,,225
1,New tutorial resource of integrating Flask wit...,"Hey everyone,\r\n\r\nBecause Tailwind CSS and ...",XS,zoltanszogyenyi,1659015529,,,,,1,1,{'ah-ring': 1},{},[],[aac6f9434d27cab7177ce74bcf78f8b7fbc819dc],[README.md],{'README.md': 1},,,224
2,"Add apiman, a easy way to integrate Swagger/Op...",Add [apiman](https://github.com/strongbugman/a...,XS,strongbugman,1645251264,,,,,1,1,{'ah-ring': 1},{},[],[1c782d4588276a4f88ad43ee79e3b3010f66f518],[README.md],{'README.md': 1},,,222
3,Flask-RESTX replaces Flask-RestPlus,"If you go on the website of Flask-RestPlus, th...",XS,MartinThoma,1642846223,,,,,1,1,"{'ah-ring': 1, 'MartinThoma': 7}",{},[],[f06f9d04fc82623fd0ae8f4c3de2b33f45601f10],[README.md],{'README.md': 2},,,221
4,Added a free course and updated contributing.md,Flask: Develop Web Applications in Python - Th...,XS,samr1ddh1,1634472220,,,,,2,2,{},{},[],"[c669f7a1583f7ada3e7b256373581294c23795fe, fd0...","[CONTRIBUTING.md, README.md]","{'CONTRIBUTING.md': 2, 'README.md': 2}",,,219


In [6]:
interval = (pr_df["merged_at"] - pr_df["created_at"]).astype("float")
interval = interval.dropna()
interval

41       500500.0
47       469584.0
51      4693361.0
52      6937643.0
53      8032394.0
          ...    
186    13749675.0
187      216781.0
188    14898309.0
189       44225.0
191       78919.0
Length: 78, dtype: float64

In [7]:
n_buckets = 10

quantiles = interval.quantile(q=np.arange(0, 1 + 1e-100, 1 / n_buckets))
quantiles

0.0        1049.0
0.1       47956.7
0.2      126092.0
0.3      271133.6
0.4      467775.2
0.5      976198.5
0.6     2993056.6
0.7     5153405.9
0.8     6943473.8
0.9    10066874.3
dtype: float64

In [8]:
quantiles / 3600

0.0       0.291389
0.1      13.321306
0.2      35.025556
0.3      75.314889
0.4     129.937556
0.5     271.166250
0.6     831.404611
0.7    1431.501639
0.8    1928.742722
0.9    2796.353972
dtype: float64

In [9]:
time_intervals = quantiles / 3600

In [10]:
# remove PRs from train/test which are still open
pr_df = pr_df[pr_df["closed_at"].notna()]
pr_df = pr_df[pr_df["merged_at"].notna()]

In [11]:
pr_df["created_at"] = pr_df["created_at"].apply(
    lambda x: int(datetime.datetime.timestamp(pd.to_datetime(x)))
)
pr_df["closed_at"] = pr_df["closed_at"].apply(
    lambda x: float(datetime.datetime.timestamp(pd.to_datetime(x)))
)
pr_df["merged_at"] = pr_df["merged_at"].apply(
    lambda x: float(datetime.datetime.timestamp(pd.to_datetime(x)))
)

In [12]:
TEST_DATA_PATH = os.path.join(s3_input_data_path, ORG, REPO, "test-data")

# read processed and split data created for train/test in the model training notebook
if REMOTE:
    cc = CephCommunication(s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket)
    X_test = cc.read_from_ceph(TEST_DATA_PATH, "X_test.parquet")
    y_test = cc.read_from_ceph(TEST_DATA_PATH, "y_test.parquet")

else:
    print(
        "The X_test.parquet and y_test.parquet files are not included in the github repo."
    )
    print(
        "Please set REMOTE=1 in the .env file and read this data from the S3 bucket instead."
    )

In [13]:
X_test

Unnamed: 0,size,created_at_day,created_at_month,created_at_weekday,created_at_hour,changed_files_number,body_size,commits_number,filetype_.md,filetype_None,...,title_wordcount_vuejs,title_wordcount_way,title_wordcount_web,title_wordcount_whooshalchemy,title_wordcount_wtf,title_wordcount_youtube,title_wordcount_z3950,title_wordcount_zappa,title_wordcount_zmusic,title_wordcount_…
135,0.0,10.0,12.0,3.0,15.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.0,11.0,9.0,2.0,12.0,1.0,22.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,0.0,9.0,12.0,2.0,23.0,1.0,2.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,1.0,10.0,8.0,2.0,14.0,1.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,0.0,30.0,4.0,0.0,19.0,1.0,46.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,0.0,18.0,8.0,0.0,17.0,1.0,50.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132,0.0,13.0,1.0,2.0,3.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,29.0,10.0,0.0,5.0,1.0,9.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152,0.0,24.0,9.0,3.0,13.0,1.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,0.0,23.0,3.0,0.0,15.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
y_test

Unnamed: 0,ttm_class
135,5
41,4
136,5
100,4
62,0
191,1
132,2
53,8
152,0
171,0


In [15]:
sample_payload = pr_df.reindex(X_test.index)

In [16]:
sample_payload

Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,changed_files_changes,first_review_at,first_approve_at,id
135,removed the duplicate quokka project,,XS,mehdy,1,1.450957,humiaozuzu,1.450957,humiaozuzu,1,1,{},{},[],[44b23a97bf699dcc8899d8969bc42058d10405c8],[README.md],{'README.md': 1},,,60
41,Mention Flask-Dramatiq extension,"Hi,\r\n\r\nI suggest to point to flask dramati...",XS,bersace,1,1.568706,humiaozuzu,1.568706,humiaozuzu,1,1,{'humiaozuzu': 1},{},[],[d38d7937baf29fa575210fb386014978b46f248b],[README.md],{'README.md': 2},,,169
136,Data Validation extension,Added Flask-Validator\n,XS,xeBuz,1,1.451632,humiaozuzu,1.451632,humiaozuzu,1,1,{'humiaozuzu': 3},{},[],[27b0bb05b145d40440ce1cb912e9610f4c5e2fde],[README.md],{'README.md': 1},,,59
100,Add flask-by-example tutorial,,S,truskovskiyk,1,1.471593,humiaozuzu,1.471593,humiaozuzu,2,1,{'truskovskiyk': 3},{},[],"[c2a15eb549dcf243a9297a90bbd1592d8dc5befa, 808...",[README.md],{'README.md': 13},,,99
62,updated readme with Flask-MonitoringDashboard,Flask-MonitoringDashboard is an extension for ...,XS,FlyingBird95,1,1.525129,humiaozuzu,1.525129,humiaozuzu,2,1,{'humiaozuzu': 2},{},[],"[f565cff08e8438e574ad69223ea79a7cfaaa9fea, 0ce...",[README.md],{'README.md': 3},,,142
191,Added Authomatic under OAuth client/providor,Added [Authomatic](https://github.com/peterhud...,XS,the-shank,1,1.408463,humiaozuzu,1.408463,humiaozuzu,1,1,"{'humiaozuzu': 4, 'the-shank': 15}",{},[],[e6e278e11ad54585555e02c2c922ca26243eb1c8],[README.md],{'README.md': 1},,,2
132,Add some books for learning Flask,,XS,thanhleviet,1,1.452862,humiaozuzu,1.452862,humiaozuzu,1,1,{},{},[],[c080ce70147805baafef5759ea5b6c3125f512e5],[README.md],{'README.md': 4},,,64
53,Add PythonBuddy,Added PythonBuddy to list of apps built by Flask!,XS,ethanchewy,1,1.548823,humiaozuzu,1.548823,humiaozuzu,1,1,{},{},[],[297afe7e1b37e806ee194e25bc752b41cdf746bf],[README.md],{'README.md': 1},,,155
152,Add flask-restful-swagger and Flask-HTTPAuth,,XS,houjunchen,1,1.443138,humiaozuzu,1.443138,humiaozuzu,2,1,{},{},[],"[32958993d2176a691884da47c53e20da99512869, 34c...",[README.md],{'README.md': 2},,,42
171,Update README.md,,XS,mjhea0,1,1.427158,humiaozuzu,1.427158,humiaozuzu,1,1,{},{},[],[a537b78facfae0f34990e068fd140b461e35214f],[README.md],{'README.md': 3},,,23


In [17]:
sample_payload.dtypes

title                     object
body                      object
size                      object
created_by                object
created_at                 int64
closed_at                float64
closed_by                 object
merged_at                float64
merged_by                 object
commits_number             int64
changed_files_number       int64
interactions              object
reviews                   object
labels                    object
commits                   object
changed_files             object
changed_files_changes     object
first_review_at           object
first_approve_at          object
id                        object
dtype: object

In [18]:
## read model
MODEL_KEY = os.path.join(s3_input_data_path, ORG, REPO, "ttm-model")
MODEL_FILENAME = "model.joblib"


s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

buffer = BytesIO()
s3_object = s3_resource.Object(s3_bucket, f"{MODEL_KEY}/{MODEL_FILENAME}")
s3_object.download_fileobj(buffer)
model = joblib.load(buffer)
model

In [19]:
# Test model on the dataset
preds = model.predict(sample_payload)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       4.0
           1       0.00      0.00      0.00       2.0
           2       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       3.0
           5       0.00      0.00      0.00       2.0
           6       0.00      0.00      0.00       0.0
           7       0.00      0.00      0.00       1.0
           8       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0

    accuracy                           0.00      16.0
   macro avg       0.00      0.00      0.00      16.0
weighted avg       0.00      0.00      0.00      16.0



# Conclusion

This notebook fetches the saved model from s3 and sends a payload to see how the model is performing on this new data. Additionally, we see that the evaluation scores in the classification report match the ones we saw in the training notebook. So, great, looks like our model are working as expected, and are ready to predict some times to merge for GitHub PRs! 