# Run inference on time to merge model trained previously


## What we did previously

In the previous [notebook](./03_model_training.ipynb) we trained machine learning models to classify a PR's `time_to_merge` into one of the 10 bins (or "classes"). We then deployed the model with the highest f1-score as a service using the model saved in s3.

## In this step


The purpose of this notebook is to check whether this service is running as intended, and more specifically to ensure that the model performance is what we expect it to be. So here, we will use the test set from the aforementioned notebook as the query payload for the service, and then verify that the return values are the same as those obtained during training/testing locally.
# Time to Merge Prediction Inference Service

In the previous notebook, we explored some basic machine learning models for predicting time to merge of a PR.

In [1]:
import os
import sys
import gzip
import json
import boto3
import datetime
import requests
from dotenv import load_dotenv, find_dotenv
from io import BytesIO

import joblib

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

load_dotenv(find_dotenv(), override=True)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
class CephCommunication:
    """
    Class to establish communication with a ceph s3 bucket.
    It connects with the bucket and provides methods to read and write data in the parquet format.
    """

    def __init__(
        self, s3_endpoint_url, aws_access_key_id, aws_secret_access_key, s3_bucket
    ):
        self.s3_endpoint_url = s3_endpoint_url
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.s3_resource = boto3.resource(
            "s3",
            endpoint_url=self.s3_endpoint_url,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
        )
        self.bucket = s3_bucket
        ## Todo: Add try catch

    def upload_to_ceph(self, dataframe, s3_path, filename):
        """
        This helper function takes as input the data frame to be uploaded, and the output filename.
        It then saves the data frame in the defined ceph bucket.
        """
        parquet_buffer = BytesIO()
        dataframe.to_parquet(parquet_buffer)
        s3_obj = self.s3_resource.Object(self.bucket, f"{s3_path}/{filename}")
        status = s3_obj.put(Body=parquet_buffer.getvalue())
        return status

    def read_from_ceph(self, s3_path, filename):
        """
        Helper function to read from ceph and see if the saved data is correct.
        """
        buffer = BytesIO()
        s3_object = self.s3_resource.Object(self.bucket, f"{s3_path}/{filename}")
        s3_object.download_fileobj(buffer)
        df_temp = pd.read_parquet(buffer)
        return df_temp


def save_to_disk(dataframe, path, filename):
    """
    Helper function to save the dataframe
    as a parquet file to disk.
    """
    dataset_base_path = Path(path)
    dataset_base_path.mkdir(parents=True, exist_ok=True)
    dataframe.to_parquet(f"{path}/{filename}")
    return True

In [3]:
## CEPH Bucket variables
## Create a .env file on your local with the correct configs,

ORG = os.getenv("GITHUB_ORG")
REPO = os.getenv("GITHUB_REPO")

## S3 bucket credentials
s3_endpoint_url = os.getenv("S3_ENDPOINT_URL")
s3_access_key = os.getenv("AWS_ACCESS_KEY_ID")
s3_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
s3_bucket = os.getenv("S3_BUCKET")

s3_input_data_path = os.getenv("CEPH_BUCKET_PREFIX")

REMOTE = os.getenv("REMOTE")
RAW_DATA_PATH = os.path.join(
    s3_input_data_path, "srcopsmetrics/bot_knowledge", ORG, REPO, "PullRequest.json"
)

In [4]:
output = []
local_input_data_path = "../../../data/raw/GitHub/PullRequest.json.gz"
if REMOTE:
    print("getting dataset from ceph")
    s3 = boto3.resource(
        "s3",
        endpoint_url=s3_endpoint_url,
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
    )
    content = s3.Object(s3_bucket, RAW_DATA_PATH)
    file = content.get()["Body"].read().decode("utf-8")

    prs = json.loads(file)

    for pr in prs.splitlines():
        output.append(json.loads(pr))

else:
    print("getting dataset from local")
    with gzip.open(local_input_data_path, "r") as f:
        prs = json.loads(f.read().decode("utf-8"))


pr_df = pd.DataFrame(output)

getting dataset from ceph


In [5]:
# github pr dataset collected using thoth's mi-scheduler
pr_df.head()

Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,first_review_at,first_approve_at,id
0,add knikolla and cleanup,,S,schwesig,1665410557,1665411000.0,sesheta,1665411000.0,sesheta,2,1,"{'schwesig': 175, 'quaid': 1, 'sesheta': 65}",{},"[lgtm, triage/accepted, approved, priority/cri...","[20fcc08d8ebd4e7453469c7fd5492995f156181c, 764...",[OWNERS],,,229
1,pre-commit to clean up existing errors (traili...,,XS,schwesig,1665402036,1665409000.0,sesheta,1665409000.0,sesheta,1,1,"{'schwesig': 9, 'codificat': 16, 'quaid': 3, '...",{},"[lgtm, triage/accepted, approved, priority/cri...",[210de76a23b3cf7f1d0c66b8c82457c597400797],[sig-community/meeting-notes/20221006-meeting-...,,,228
2,Add knikolla,,XS,schwesig,1665079206,1665409000.0,schwesig,,,0,0,"{'quaid': 2, 'schwesig': 7, 'sesheta': 287, 'c...","{'1134003785': {'author': 'schwesig', 'words_c...","[size/XS, area/user, kind/governance, sig/comm...",[],[],1665127000.0,,225
3,Create 20221006-meeting-notes.md,,M,schwesig,1665078958,1665148000.0,schwesig,1665148000.0,schwesig,1,1,"{'sesheta': 172, 'quaid': 2}","{'1133430644': {'author': 'quaid', 'words_coun...","[kind/documentation, lgtm, approved, size/M, s...",[54c273e75aa9a4a7326c7f9aad1b013bc0c29791],[sig-community/meeting-notes/20221006-meeting-...,1665079000.0,1665079000.0,224
4,Update wording to be more open and inclusive.,Updates as worked out with Karsten\r\n,XS,billburnseh,1663618828,1663619000.0,billburnseh,1663619000.0,billburnseh,2,1,{'sesheta': 86},{},[size/XS],"[c86790d3a173abae0682dcbef583e6b151d6e61c, 272...",[four-principles.md],,,223


In [6]:
interval = (pr_df["merged_at"] - pr_df["created_at"]).astype("float")
interval = interval.dropna()
interval

0         476.0
1        7081.0
3       68588.0
4          11.0
5          20.0
        ...    
89     359974.0
90      52870.0
91      86385.0
92    4028019.0
93      16676.0
Length: 80, dtype: float64

In [7]:
n_buckets = 10

quantiles = interval.quantile(q=np.arange(0, 1 + 1e-100, 1 / n_buckets))
quantiles

0.0         11.0
0.1         48.1
0.2        327.0
0.3       4442.5
0.4      39709.6
0.5      59925.0
0.6      88395.4
0.7     426698.0
0.8    1044979.2
0.9    2048763.9
dtype: float64

In [8]:
quantiles / 3600

0.0      0.003056
0.1      0.013361
0.2      0.090833
0.3      1.234028
0.4     11.030444
0.5     16.645833
0.6     24.554278
0.7    118.527222
0.8    290.272000
0.9    569.101083
dtype: float64

In [9]:
time_intervals = quantiles / 3600

In [10]:
# remove PRs from train/test which are still open
pr_df = pr_df[pr_df["closed_at"].notna()]
pr_df = pr_df[pr_df["merged_at"].notna()]

In [11]:
pr_df["created_at"] = pr_df["created_at"].apply(
    lambda x: int(datetime.datetime.timestamp(pd.to_datetime(x)))
)
pr_df["closed_at"] = pr_df["closed_at"].apply(
    lambda x: float(datetime.datetime.timestamp(pd.to_datetime(x)))
)
pr_df["merged_at"] = pr_df["merged_at"].apply(
    lambda x: float(datetime.datetime.timestamp(pd.to_datetime(x)))
)

In [12]:
TEST_DATA_PATH = os.path.join(s3_input_data_path, ORG, REPO, "test-data")

# read processed and split data created for train/test in the model training notebook
if REMOTE:
    cc = CephCommunication(s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket)
    X_test = cc.read_from_ceph(TEST_DATA_PATH, "X_test.parquet")
    y_test = cc.read_from_ceph(TEST_DATA_PATH, "y_test.parquet")

else:
    print(
        "The X_test.parquet and y_test.parquet files are not included in the github repo."
    )
    print(
        "Please set REMOTE=1 in the .env file and read this data from the S3 bucket instead."
    )

In [13]:
X_test

Unnamed: 0,size,created_at_day,created_at_month,created_at_weekday,created_at_hour,changed_files_number,body_size,commits_number,filetype_.md,filetype_.yaml,...,title_wordcount_use,title_wordcount_var,title_wordcount_via,title_wordcount_website,title_wordcount_week,title_wordcount_weekly,title_wordcount_whitespace,title_wordcount_word,title_wordcount_wording,title_wordcount_workload
36,3.0,31.0,3.0,3.0,12.0,7.0,38.0,1.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1.0,10.0,10.0,0.0,14.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,27.0,4.0,2.0,21.0,1.0,4.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,2.0,29.0,3.0,1.0,16.0,2.0,31.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,31.0,5.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,2.0,31.0,3.0,3.0,14.0,4.0,11.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,3.0,8.0,2.0,18.0,2.0,4.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,5.0,27.0,8.0,4.0,19.0,3.0,18.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,13.0,9.0,1.0,14.0,8.0,9.0,1.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,1.0,26.0,7.0,1.0,17.0,2.0,18.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
y_test

Unnamed: 0,ttm_class
36,2
0,2
28,2
37,8
23,9
34,7
12,0
84,6
5,0
14,1


In [15]:
sample_payload = pr_df.reindex(X_test.index)

In [16]:
sample_payload

Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,first_review_at,first_approve_at,id
36,running generator,regenerate content based on yamls\r\n\r\n```\r...,L,durandom,1,1.64873,sesheta,1.64873,sesheta,1,7,{'sesheta': 65},"{'927505912': {'author': 'schwesig', 'words_co...","[size/L, lgtm, approved]",[6877b914c18198a4e590d56c9ad5bbf6d3d37e6a],"[OWNERS_ALIASES, sig-list.md, sig-operations/R...",1648730000.0,1648730000.0,168
0,add knikolla and cleanup,,S,schwesig,1,1.665411,sesheta,1.665411,sesheta,2,1,"{'schwesig': 175, 'quaid': 1, 'sesheta': 65}",{},"[lgtm, triage/accepted, approved, priority/cri...","[20fcc08d8ebd4e7453469c7fd5492995f156181c, 764...",[OWNERS],,,229
28,Fix typos and bad links,Corrected spelling of tuesday,XS,msdisme,1,1.651095,sesheta,1.651095,sesheta,1,1,"{'quaid': 2, 'sesheta': 65}",{},"[lgtm, approved, size/XS]",[c1701b6c7283020e24a6a4048cc16b7ffdceb0cf],[sig-telemetry/README.md],,,182
37,[notes] First working session on docs gap anal...,- define common steps from zero to first pull ...,M,quaid,1,1.64971,quaid,1.64971,quaid,1,2,{'sesheta': 73},{},[size/M],[75c6ac7a3fe7d9f0d0bf2f548985b45b549058e6],[sig-community/wg-contrib_x-docs/meeting-notes...,,,167
23,Fix typo,,XS,mh21,1,1.657734,sesheta,1.657734,sesheta,1,1,"{'sesheta': 65, 'schwesig': 2}","{'993606986': {'author': 'schwesig', 'words_co...","[lgtm, approved, size/XS]",[f5ddb06d2a7de89189a73ad4c3a300985a8f3898],[open-source-services.md],1654179000.0,1654179000.0,194
34,working group fybrik,new working group to collaborate at using fybr...,M,durandom,1,1.649767,sesheta,1.649767,sesheta,1,4,"{'sesheta': 65, 'durandom': 15, 'quaid': 27}","{'927659859': {'author': 'schwesig', 'words_co...","[lgtm, approved, size/M]",[9b981d194303b04a896a187f0f50ab6d46f4e29f],"[OWNERS_ALIASES, sig-list.md, sigs.yaml, wg-fy...",1648736000.0,1648736000.0,170
12,Rename meeting notes to follow the template,Rename 20220719_meeting-nodes.md to 20220719-m...,XS,schwesig,1,1.65955,schwesig,1.65955,schwesig,2,2,{'sesheta': 73},{},[size/XS],"[8feaa06fcf4d81ea9ac7f8eafbb42512c18ab8ae, 22f...",[sig-community/wg-website-updates/meeting-note...,,,214
84,Operate First Community Metrics EDA notebook,Initial notebook to fetch and visualize the Gi...,XXL,hemajv,1,1.630492,durandom,1.630492,durandom,1,3,"{'durandom': 33, 'sesheta': 68}","{'743639779': {'author': 'quaid', 'words_count...","[approved, size/XXL]",[e9ef25a79564a7c0575cc4bed5513ef233f488d2],"[metrics/README.md, metrics/community_metrics....",1630485000.0,1630493000.0,80
5,office hours,- add office hours\n- ran the markdown generat...,M,durandom,1,1.66308,durandom,1.66308,durandom,1,8,{'sesheta': 144},{},"[do-not-merge/invalid-owners-file, size/M]",[c10b264bd795ac3110035bef51fef9e50a63f782],"[OWNERS_ALIASES, README.md, sig-data-science/R...",,,222
14,Create 2022-07-26_meeting-nodes.md,"and renaming the last meeting nodes, removing ...",S,schwesig,1,1.658856,schwesig,1.658856,schwesig,3,2,"{'sesheta': 211, 'schwesig': 7}","{'1051361742': {'author': 'schwesig', 'words_c...","[kind/documentation, approved, size/S, kind/we...","[2ce8850d7152b21579f018f3872181b3117114e3, f9e...",[sig-community/wg-website-updates/meeting-note...,1658856000.0,,212


In [17]:
sample_payload.dtypes

title                    object
body                     object
size                     object
created_by               object
created_at                int64
closed_at               float64
closed_by                object
merged_at               float64
merged_by                object
commits_number            int64
changed_files_number      int64
interactions             object
reviews                  object
labels                   object
commits                  object
changed_files            object
first_review_at         float64
first_approve_at        float64
id                       object
dtype: object

In [18]:
sample_payload

Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,first_review_at,first_approve_at,id
36,running generator,regenerate content based on yamls\r\n\r\n```\r...,L,durandom,1,1.64873,sesheta,1.64873,sesheta,1,7,{'sesheta': 65},"{'927505912': {'author': 'schwesig', 'words_co...","[size/L, lgtm, approved]",[6877b914c18198a4e590d56c9ad5bbf6d3d37e6a],"[OWNERS_ALIASES, sig-list.md, sig-operations/R...",1648730000.0,1648730000.0,168
0,add knikolla and cleanup,,S,schwesig,1,1.665411,sesheta,1.665411,sesheta,2,1,"{'schwesig': 175, 'quaid': 1, 'sesheta': 65}",{},"[lgtm, triage/accepted, approved, priority/cri...","[20fcc08d8ebd4e7453469c7fd5492995f156181c, 764...",[OWNERS],,,229
28,Fix typos and bad links,Corrected spelling of tuesday,XS,msdisme,1,1.651095,sesheta,1.651095,sesheta,1,1,"{'quaid': 2, 'sesheta': 65}",{},"[lgtm, approved, size/XS]",[c1701b6c7283020e24a6a4048cc16b7ffdceb0cf],[sig-telemetry/README.md],,,182
37,[notes] First working session on docs gap anal...,- define common steps from zero to first pull ...,M,quaid,1,1.64971,quaid,1.64971,quaid,1,2,{'sesheta': 73},{},[size/M],[75c6ac7a3fe7d9f0d0bf2f548985b45b549058e6],[sig-community/wg-contrib_x-docs/meeting-notes...,,,167
23,Fix typo,,XS,mh21,1,1.657734,sesheta,1.657734,sesheta,1,1,"{'sesheta': 65, 'schwesig': 2}","{'993606986': {'author': 'schwesig', 'words_co...","[lgtm, approved, size/XS]",[f5ddb06d2a7de89189a73ad4c3a300985a8f3898],[open-source-services.md],1654179000.0,1654179000.0,194
34,working group fybrik,new working group to collaborate at using fybr...,M,durandom,1,1.649767,sesheta,1.649767,sesheta,1,4,"{'sesheta': 65, 'durandom': 15, 'quaid': 27}","{'927659859': {'author': 'schwesig', 'words_co...","[lgtm, approved, size/M]",[9b981d194303b04a896a187f0f50ab6d46f4e29f],"[OWNERS_ALIASES, sig-list.md, sigs.yaml, wg-fy...",1648736000.0,1648736000.0,170
12,Rename meeting notes to follow the template,Rename 20220719_meeting-nodes.md to 20220719-m...,XS,schwesig,1,1.65955,schwesig,1.65955,schwesig,2,2,{'sesheta': 73},{},[size/XS],"[8feaa06fcf4d81ea9ac7f8eafbb42512c18ab8ae, 22f...",[sig-community/wg-website-updates/meeting-note...,,,214
84,Operate First Community Metrics EDA notebook,Initial notebook to fetch and visualize the Gi...,XXL,hemajv,1,1.630492,durandom,1.630492,durandom,1,3,"{'durandom': 33, 'sesheta': 68}","{'743639779': {'author': 'quaid', 'words_count...","[approved, size/XXL]",[e9ef25a79564a7c0575cc4bed5513ef233f488d2],"[metrics/README.md, metrics/community_metrics....",1630485000.0,1630493000.0,80
5,office hours,- add office hours\n- ran the markdown generat...,M,durandom,1,1.66308,durandom,1.66308,durandom,1,8,{'sesheta': 144},{},"[do-not-merge/invalid-owners-file, size/M]",[c10b264bd795ac3110035bef51fef9e50a63f782],"[OWNERS_ALIASES, README.md, sig-data-science/R...",,,222
14,Create 2022-07-26_meeting-nodes.md,"and renaming the last meeting nodes, removing ...",S,schwesig,1,1.658856,schwesig,1.658856,schwesig,3,2,"{'sesheta': 211, 'schwesig': 7}","{'1051361742': {'author': 'schwesig', 'words_c...","[kind/documentation, approved, size/S, kind/we...","[2ce8850d7152b21579f018f3872181b3117114e3, f9e...",[sig-community/wg-website-updates/meeting-note...,1658856000.0,,212


In [19]:
## read model
MODEL_KEY = os.path.join(s3_input_data_path, ORG, REPO, "ttm-model")
MODEL_FILENAME = "model.joblib"


s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

buffer = BytesIO()
s3_object = s3_resource.Object(s3_bucket, f"{MODEL_KEY}/{MODEL_FILENAME}")
s3_object.download_fileobj(buffer)
model = joblib.load(buffer)
model

In [20]:
# Test model on the dataset
preds = model.predict(sample_payload)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         3
           3       0.33      0.50      0.40         2
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         2

    accuracy                           0.06        16
   macro avg       0.03      0.05      0.04        16
weighted avg       0.04      0.06      0.05        16



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Conclusion

This notebook shows how raw PR data can be sent to the deployed Seldon service to get time-to-merge predictions. Additionally, we see that the evaluation scores in the classification report match the ones we saw in the training notebook. So, great, looks like our inference service and model are working as expected, and are ready to predict some times to merge for GitHub PRs! 