In [1]:
import codecs
from collections import defaultdict
import csv
import lzma
import operator
import os
import re
from statistics import mean
import sys

In [2]:
# Location of the xz-compressed CSV of review comments. The original absolute
# path stays as the default; set REVIEW_COMMENTS_CSV in the environment to
# point the notebook at a copy stored elsewhere without editing this cell.
PATH_TO_CSV = os.environ.get(
    "REVIEW_COMMENTS_CSV",
    "/home/waren/sourced/data/datasets/review_comments.csv.xz",
)

## Basic statistics

In [3]:
def getusers(filename):
    """Yield the ``AUTHOR`` field of every record in an xz-compressed CSV file.

    The archive is opened in text mode with ``newline=""``, as the ``csv``
    module asks for, so only CSV line endings terminate a record. Iterating a
    ``codecs`` stream reader instead splits lines with ``str.splitlines``,
    which also breaks on characters such as U+2028 or form feed; an unquoted
    field containing one of them would be cut into a bogus extra record.

    Args:
        filename: Path to the ``.xz`` archive. Its first row is used as the
            header and must contain an ``AUTHOR`` column.

    Yields:
        The ``AUTHOR`` value of each record, in file order.

    Raises:
        KeyError: If the header has no ``AUTHOR`` column.
    """
    with lzma.open(filename, "rt", encoding="utf-8", newline="") as archf:
        for record in csv.DictReader(archf):
            yield record["AUTHOR"]

In [4]:
# Single pass over the archive: tally the comments left by each author and the
# overall number of comments.
users = defaultdict(int)
n_comments = 0
for n_comments, user in enumerate(getusers(PATH_TO_CSV), start=1):
    # Overwrite a one-line counter on stderr every 100000 comments.
    if not n_comments % 100000:
        sys.stderr.write("%d\r" % n_comments)
    users[user] += 1

25300000

In [5]:
# Headline figures gathered by the counting pass.
print("Number of PR review comments: %d" % n_comments)
reviewer_total = len(users)
print("Number of different reviewers: %d" % reviewer_total)
# "%d" formats the mean as an integer, so any fractional part is not shown.
comments_per_reviewer = mean(list(users.values()))
print("Average number of GitHub review comments per user from 2015 to 2019: %d" % comments_per_reviewer)

Number of PR review comments: 25323640
Number of different reviewers: 540054
Average number of GitHub review comments per user from 2015 to 2019: 46


## Most active reviewers

In [6]:
# Rank reviewers from most to fewest comments; reviewers with equal counts keep
# the order in which they appear in `users` (the sort is stable).
sorted_users = list(users.items())
sorted_users.sort(key=lambda entry: entry[1], reverse=True)
sorted_users[:10]

[('houndci-bot', 797827),
 ('houndci', 264998),
 ('codacy-bot', 237814),
 ('stickler-ci', 36707),
 ('sonarcloud[bot]', 20378),
 ('jreback', 19694),
 ('seanlip', 19540),
 ('codeschool-kiddo', 18946),
 ('stephentoub', 18744),
 ('vkurennov', 18141)]

## Number of review comments about typos

In [14]:
# Case-insensitive pattern for the standalone word "typo"; the \b anchors keep
# it from matching inside longer words such as "typos" or "typographic".
typos = r"(?i)\b(typo)\b"

In [16]:
# Collect the body of every review comment that mentions the word "typo".
# `c` counts the matches and always equals len(typos_comments).
c = 0
typos_comments = []
typo_re = re.compile(typos)  # compile once, outside the per-record loop
# Text mode with newline="" is what the csv module expects. Iterating a codecs
# stream reader instead splits lines with str.splitlines, which also breaks on
# characters such as U+2028 and can cut an unquoted field into a bogus record.
with lzma.open(PATH_TO_CSV, "rt", encoding="utf-8", newline="") as archf:
    reader = csv.DictReader(archf)
    for record in reader:
        body = record["BODY"]
        # DictReader fills fields missing from a short row with None; skip such
        # rows explicitly instead of catching the TypeError re.search raises.
        if not isinstance(body, str):
            continue
        if typo_re.search(body):
            c += 1
            typos_comments.append(body)
            # Report progress only when the match count changes. Checking on
            # every record rewrote the same number to stderr for each
            # non-matching row while `c` sat on a multiple of 100.
            if c % 100 == 0:
                sys.stderr.write("%d\r" % c)

113200

In [17]:
# Report how many comments matched and what share of all comments that is.
n_typo_comments = len(typos_comments)
print("Number of PR review comments about typos: %d" % n_typo_comments)
typo_share_pct = 100 * n_typo_comments / n_comments
print("Overall proportion amount all review comments: %.2f%%" % typo_share_pct)

Number of PR review comments about typos: 113272
Overall proportion amount all review comments: 0.45%


In [18]:
# Show the first ten matched comment bodies as a sanity check on the pattern.
typos_comments[:10]

['There is a slight typo here. I will point it out to you in class',
 '@Nylanfs \r\nUh... Summoner slayer syes? Is this intentional or a typo?',
 "It's a better practice to put constant first in a comparison. Consider this comparison typo issue:\r\n``` javascript\r\nif (attribute = 'aValue')\r\n```\r\n``` javascript\r\nif ('aValue' = attribute)\r\n```\r\nThe second form is really easier to detect\r\n\r\nBut I can change it if it breaks project coding style",
 'UNI_TIMESTAMP is that a typo ?',
 'typo ;) "vestivals"',
 'typo',
 'typo!',
 'typo!',
 'You made a little typo, fixing ;)\r\n@eiriktsarpalis',
 'typo in the package name ("performaceplatform"), is this working?']