In [1]:
import gzip
import pickle
from datetime import datetime, timedelta
from typing import List
from langdetect import detect
from tqdm import tqdm

import data

def to_api_url(url: str):
    """Convert a GitHub pull-request web URL into its REST API URL.

    Everything from the last ``#`` onwards (for example a comment anchor) is
    dropped, ``github.com/`` is rewritten to ``api.github.com/repos/`` and the
    ``/pull/`` path segment becomes ``/pulls/``.

    A URL that contains no ``#`` is converted as-is instead of raising
    ``ValueError``.

    :param url: web URL of a pull request, optionally ending in ``#fragment``.
    :return: the corresponding API URL as a string.
    """
    # rpartition splits on the LAST "#", matching the previous rindex-based
    # slice; when there is no "#" the separator comes back empty and the
    # whole URL is kept.
    base, sep, _fragment = url.rpartition("#")
    if not sep:
        base = url
    return base\
        .replace("github.com/", "api.github.com/repos/")\
        .replace("/pull/", "/pulls/")

def get_prs():
    """Yield each distinct PR API URL from ``data.load()``, in first-seen order.

    Rows whose ``"URL"`` entry is ``None`` are skipped; every other URL is
    converted with ``to_api_url`` and yielded only the first time it occurs.
    """
    emitted = set()
    for row in data.load():
        raw_url = row["URL"]
        if raw_url is None:
            continue
        api_url = to_api_url(raw_url)
        if api_url in emitted:
            # Already yielded for an earlier row.
            continue
        emitted.add(api_url)
        yield api_url
            
            

In [2]:
# Uncomment to re-load data (and comment out the load-from-file line)

# pr_docs = []
# 
# TOTAL_DOCS = 10_000
# 
# # progress bar will be inaccurate as we don't know how many will be merged.  Set to 110% of needed as an estimate
# pbar = tqdm(total=int(TOTAL_DOCS * 1.1), desc="Loading PRs", unit="pr")
# skipped = 0
# 
# total_merged = 0
# total_unmerged = 0
# 
# for pr_url in get_prs():
#     try:
#         repo, num = data.PR.repo_and_num_from_api_url(pr_url)
#         doc = data.PR.create_from_api(repo, num)
# 
#         text = "  ".join(doc.all_comments).replace("\\n", "  ")
# 
#         if detect(text) != 'en':
#             skipped  += 1
#             continue
# 
#         if total_merged >= int(TOTAL_DOCS / 2) and doc.merged:
#             continue
# 
#         if total_unmerged >= int(TOTAL_DOCS / 2) and not doc.merged:
#             continue
# 
#         pr_docs.append(doc)
#         pbar.update()
# 
#         if doc.merged:
#             total_merged += 1
#         else:
#             total_unmerged += 1
# 
#     except Exception as e:
#         skipped  += 1
# 
#     pbar.set_postfix_str("Skipped: " + str(skipped) + " - Merged: " + str(total_merged))
# 
#     if len(pr_docs) == TOTAL_DOCS:
#         break
# 
# pickle.dump(pr_docs, gzip.open(data.prs_obj_file, 'wb+', 9))

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point data.prs_obj_file at a cache file this notebook produced itself.
# The context manager closes the gzip handle as soon as loading finishes.
with gzip.open(data.prs_obj_file, 'rb') as prs_file:
    pr_docs: List[data.PR] = pickle.load(prs_file)

# Summary counts for the loaded PR objects.
print(len(pr_docs), "PR Objects")
print(sum(1 for p in pr_docs if p.merged), "Were Merged")

print(sum(len(p.review_comments) for p in pr_docs), "review comments")
print(sum(len(p.issue_comments) for p in pr_docs), "issue comments")
print(sum(len(p.all_comments) for p in pr_docs), "total comments")

10000 PR Objects
5000 Were Merged
63650 review comments
42694 issue comments
106344 total comments


In [3]:
doc = pr_docs[3]
separator = "\n-------------------------------------\n"

# Header line identifying which PR is being shown.
print("Example:", doc.repo, "-", doc.pr_number)

# Print each comment collection under its heading, joined by the separator.
# getattr keeps the attribute lookups lazy and in the same order as before.
for heading, attr_name in (
    ("Comments:\n", "all_comments"),
    ("\n\n\nIssue Comments:\n", "issue_comments"),
    ("\n\n\nCode Review Comments:\n", "review_comments"),
):
    print(heading, separator.join(getattr(doc, attr_name)))

Example: gtk-rs/glib - 231
Comments:
 @GuillaumeGomez @EPashkin Any comments on this? Once this and the gir change landed, we need to regenerate all bindings that come with signals.
-------------------------------------
Ah sorry, I said it on IRC but I repeat here as well: I like it. All good for me!
-------------------------------------
`handler as u64`?
-------------------------------------
I meant: you lost conversion.
-------------------------------------
No, it's internally all based on c_ulong now again, like the FFI functions. No cast is needed here unless you meant something else.
-------------------------------------
Seems this not fully so:
```
error[E0277]: the trait bound `signal::SignalHandlerId: translate::FromGlib<u64>` is not satisfied
  --> src\signal.rs:57:5
   |
57 |     from_glib(handle)
   |     ^^^^^^^^^ the trait `translate::FromGlib<u64>` is not implemented for `signal::SignalHandlerId`
   |
   = help: the following implementations were found:
         

In [4]:
doc = pr_docs[5]
separator = "\n-------------------------------------\n"

# Header line identifying which PR is being shown.
print("Example:", doc.repo, "-", doc.pr_number)

# Print each comment collection under its heading, joined by the separator.
# getattr keeps the attribute lookups lazy and in the same order as before.
for heading, attr_name in (
    ("Comments:\n", "all_comments"),
    ("\n\n\nIssue Comments:\n", "issue_comments"),
    ("\n\n\nCode Review Comments:\n", "review_comments"),
):
    print(heading, separator.join(getattr(doc, attr_name)))

Example: tibbe/unordered-containers - 132
Comments:
 Nitpick: end sentences in a dot.

-------------------------------------
oops, thanks, fixed




Issue Comments:
 



Code Review Comments:
 Nitpick: end sentences in a dot.

-------------------------------------
oops, thanks, fixed



In [5]:
# all_data = []
# errored = 0
# 
# for pr in tqdm(pr_docs):
#     try:
#         pr: data.PR
#         pr_data = data.github_api(pr.api_url)
#     
#         issue_comments = data.github_api(pr.issue_comments_api_url)
# 
#         review_comments = data.github_api(pr.review_comments_api_url)
#             
#         all_data.append({
#             "pr": pr,
#             "pr_data": pr_data,
#             "issue_comments": issue_comments,
#             "review_comments": review_comments
#         })
#     except Exception:
#         errored += 1
# 

100%|██████████| 10000/10000 [5:00:48<00:00,  1.33s/it]  


In [6]:
# made_after_close = 0
# amount_after_close = 0
# for data in tqdm(all_data):
#     try:
#         pr = data["pr"]
#         pr_data = data["pr_data"]
#         issue_comments = data["issue_comments"]
#         review_comments = data["review_comments"]
#         
#         ended_at = datetime.strptime(pr_data['closed_at'], "%Y-%m-%dT%H:%M:%SZ")
#         ended_at += timedelta(minutes=5)
#         
#         issue_dates = [datetime.strptime(x['created_at'], "%Y-%m-%dT%H:%M:%SZ") for x in issue_comments]
# 
#         was_after = False
# 
#         for d in issue_dates:
#             if d > ended_at:
#                 amount_after_close +=  1
#                 was_after = True
# 
#         review_dates = [datetime.strptime(x['created_at'], "%Y-%m-%dT%H:%M:%SZ") for x in review_comments]
# 
#         for d in review_dates:
#             if d > ended_at:
#                 amount_after_close +=  1
#                 was_after = True
#         
#         if was_after:
#             made_after_close += 1
#             # print(pr.api_url)
#             # print(pr.issue_comments_api_url)
#             # print(pr.review_comments_api_url)
#             # break
#         
#         # print(pr_data)
#     except Exception as e:
#         errored += 1
#         
# print("Comments Made After Close:", amount_after_close)  # 4214
# print("PRs with comments after close:", made_after_close)  # 1915
# print("Errored:", errored)
# 

100%|██████████| 9977/9977 [00:03<00:00, 3269.44it/s]


Comments Made After Close: 3263
PRs with comments after close: 1375
Errored: 23


In [8]:
# states = set()
# 
# open = 0
# for data in tqdm(all_data):
#     states.add(data["pr_data"]["state"])
#     if data["pr_data"]["state"] != "closed":
#         open += 1
#         
# print(open)



100%|██████████| 9977/9977 [00:00<00:00, 232033.64it/s]


0
