In [1]:
import os

# Secrets are read from the environment so that real tokens never have to be
# pasted into (and accidentally committed with) the notebook. Both fall back to
# the empty string when the variable is unset.
#   JIRA_TOKEN            - Jira personal access token
#   GITHUB_AUTHORIZATION  - complete value of the GitHub "Authorization" header
JIRA_CREDENTIALS = {
    "token_auth": os.environ.get("JIRA_TOKEN", ""),
    "server": "https://issues.apache.org/jira/",
    "async_": True
}

# Sent as HTTP headers with every GitHub API request made by this notebook.
GITHUB_CREDENTIALS = {
    "Authorization": os.environ.get("GITHUB_AUTHORIZATION", ""), "User-Agent": "rok",
    "Accept": "application/vnd.github.golden-comet-preview+json"}

# Process

1. Download issues, comments, watchers and external links (~20min)
2. Create full GitHub issues (with Jira referring issue links) (~3hrs 20min)
3. Collect the Jira issue URL to GitHub issue URL map (~30min)
4. Post GitHub issue links to Jira issue comments (?)
5. Lock Jira issues (manual)
6. Update GitHub issues with corrected related issue / subtask links (?)

# NOTES

* Issues can only be added to active milestones. Activate the relevant GitHub milestones before the import.

# Questions

* Which labels should still be added?
* Should assignees be transferred?
* Should watchers be pinged in a post import comment? Should watchers be assignees?

In [2]:
import csv, pickle, re, time
from string import punctuation
import jira2markdown
from jira2markdown.markup.links import Mention
from jira2markdown.markup.base import AbstractMarkup
from jira import JIRA
from multiprocessing import Pool, cpu_count
from pyparsing import (
    CaselessLiteral,
    Char,
    Combine,
    FollowedBy,
    Optional,
    ParserElement,
    ParseResults,
    PrecededBy,
    SkipTo,
    StringEnd,
    StringStart,
    Suppress,
    White,
    Word,
    alphanums,
)
import requests


def is_completed(item):
    """Report whether a Jira issue's status name is "Closed" or "Resolved"."""
    status_name = item.fields.status.name
    return status_name == "Closed" or status_name == "Resolved"


def extract_linked_issues(linked_issue):
    """Flatten one Jira issue link into a dict describing the issue at its other end.

    The outward issue is used when the link has one, otherwise the inward
    issue; the relationship text is taken from the matching side of the link
    type. Returns the keys "key", "relationship", "summary", "url" and
    "completed".
    """
    if hasattr(linked_issue, "outwardIssue"):
        target, relationship = linked_issue.outwardIssue, linked_issue.type.outward
    else:
        target, relationship = linked_issue.inwardIssue, linked_issue.type.inward

    return {
        "key": target.key,
        "relationship": relationship,
        "summary": target.fields.summary,
        "url": target.permalink(),
        "completed": is_completed(target)
    }


def get_user_string(jira_author, jira_url):
    """Render a Jira user as a Markdown link, plus a GitHub mention when one is mapped.

    The link text is the user's display name and the target is `jira_url`.
    If the Jira name is a key of USER_MAPPING, " / @<github login>" is appended.
    """
    mention = f" / @{USER_MAPPING[jira_author.name]}" if jira_author.name in USER_MAPPING else ""
    return f"[{jira_author.displayName}]({jira_url})" + mention


def get_comments(issue):
    """Build the "comments" list of the GitHub import payload for one Jira issue.

    Each entry carries the already translated comment text (looked up in
    TRANSLATED_MARKUP) prefixed with the author string, and the creation
    timestamp with its last five characters replaced by "Z".
    """
    def as_payload(comment):
        comment_url = f"{issue.permalink()}?focusedCommentId={comment.id}"
        author = get_user_string(comment.author, comment_url)
        translated = TRANSLATED_MARKUP[issue.key]["comments"]
        return {
            "body": f"{author}:\n{translated[comment.id]}",
            "created_at": comment.created[:-5] + "Z"
        }

    return [
        as_payload(comment)
        for comment in issue.fields.comment.comments
        # Skip ASF GitHub Bot comments per https://github.com/apache/arrow/issues/14648
        if comment.author.name != "githubbot"
    ]


def request_to_github(params, session, max_retries=5, retry_delay=1):
    """Send a request to GitHub, waiting out rate limits and retrying failures.

    Args:
        params: keyword arguments forwarded to ``session.request`` (method,
            url, headers, json, ...).
        session: object exposing ``request(**params)``, e.g. a
            ``requests.Session``.
        max_retries: how many non-throttling failures are tolerated before
            giving up.
        retry_delay: base delay in seconds between such failures; the n-th
            failure waits ``n * retry_delay``.

    Returns:
        The first response whose status code is 200, 201, 202 or 204.

    Raises:
        RuntimeError: when ``max_retries`` responses in total had an
            unexpected status code. Previously this case retried forever
            without any delay.
    """
    # Only method and URL are logged: params also holds the Authorization header.
    described = f"{params.get('method')} {params.get('url')}"
    failures = 0

    while True:
        r = session.request(**params)

        if r.status_code in (200, 201, 202, 204):
            # all is good
            return r
        elif r.status_code in (403, 429):
            # throttling
            print("Response was: ", r.text)
            retry_after = r.headers.get("Retry-After", "")
            if retry_after.isdigit():
                wait_time = int(retry_after)
            else:
                # A missing header yields a negative wait and the 1 s fallback below.
                reset_time = int(r.headers.get("X-RateLimit-Reset", 0))
                wait_time = reset_time - round(time.time() + .5)
            if wait_time > 0:
                print(f"Throttled on {described} call. Sleeping for {wait_time // 60} minutes.")
                time.sleep(wait_time)
            else:
                time.sleep(1)
        else:
            # something is wrong
            failures += 1
            print(f"Request {described} returned status code {r.status_code} and ", r.text)
            if failures >= max_retries:
                raise RuntimeError(
                    f"Request {described} failed {failures} times, "
                    f"last status code was {r.status_code}."
                )
            time.sleep(retry_delay * failures)


class MigratedMention(AbstractMarkup):
    """Markup element that rewrites Jira user mentions such as ``[~name]``.

    Registered in place of jira2markdown's stock ``Mention`` element. A mention
    whose id is found in ``self.usernames`` becomes ``@<username>``; any other
    mention is emitted as inline code so that it is not turned into a link.
    """

    def action(self, tokens: ParseResults) -> str:
        """Return the replacement text for one parsed mention.

        NOTE(review): ``self.usernames`` is not set in this class; presumably
        it is supplied by ``AbstractMarkup`` from the ``usernames=`` argument
        given to ``jira2markdown.convert`` - confirm.
        """
        username = self.usernames.get(tokens.accountid)
        return f"`[~{tokens.accountid}]`" if username is None else f"@{username}"

    @property
    def expr(self) -> ParserElement:
        """Build the pyparsing expression that recognises a mention.

        A mention is "[", an optional alias ending in "|", "~", an optional
        case-insensitive "accountid:" prefix, the id itself (alphanumerics,
        ":" and "-"), then "]".
        """
        MENTION = Combine(
            "["
            # Optional alias text before "|"; the skip stops at "]".
            + Optional(
                SkipTo("|", failOn="]") + Suppress("|"),
                default="",
                )
            + "~"
            + Optional(CaselessLiteral("accountid:"))
            + Word(alphanums + ":-").setResultsName("accountid")
            + "]",
            )
        return (
                # Left context: start of string, or (optionally) a preceding whitespace character.
                (StringStart() | Optional(PrecededBy(White(), retreat=1), default=" "))
                + MENTION.setParseAction(self.action)
                # Right context: end of string, or (optionally) whitespace, punctuation other than "[", or another mention.
                + (StringEnd() | Optional(FollowedBy(White() | Char(punctuation, excludeChars="[") | MENTION), default=" "))
        )


# Matches a newline, one whitespace character, then a run of "#" followed by
# whitespace and text. Group 1 starts at the first "#", so substituting r"\n\1"
# drops the single whitespace character in front of it.
LEADING_SPACE_HASH_PATTERN = re.compile(r"\n\s(#+\s+\S.*)")
# jira2markdown element set with the stock Mention element swapped for MigratedMention.
ELEMENTS = jira2markdown.elements.MarkupElements()
ELEMENTS.replace(Mention, MigratedMention)


def translate_markup(issue):
    """Translate the Jira markup of an issue's description and comments to Markdown.

    Returns a ``(issue key, {"description": str, "comments": {comment id: str}})``
    tuple. Comments authored by "githubbot" are left out.
    """
    attachments = issue.fields.attachment

    def to_markdown(jira_text):
        # Drop the single whitespace character in front of "#" lines before converting.
        cleaned = LEADING_SPACE_HASH_PATTERN.sub(r"\n\1", jira_text)
        markdown = jira2markdown.convert(cleaned, elements=ELEMENTS, usernames=USER_MAPPING)
        # Point image references at the attachment's content URL instead of its bare filename.
        for attachment in attachments:
            markdown = markdown.replace(f"![{attachment.filename}]({attachment.filename})",
                                        f"![{attachment.filename}]({attachment.content})")
        return markdown

    description = to_markdown(issue.fields.description or "")

    comments = {
        comment.id: to_markdown(comment.body)
        for comment in issue.fields.comment.comments
        # Skip ASF GitHub Bot comments per https://github.com/apache/arrow/issues/14648
        if comment.author.name != "githubbot"
    }

    return (issue.key, {"description": description, "comments": comments})

In [3]:
# File names of the pickle caches written by this notebook, and of the CSV file
# from which USER_MAPPING is loaded.
raw_jira_issues_filename = 'raw_jira_issues.pickle'
raw_jira_watchers_filename = 'raw_jira_watchers.pickle'
raw_jira_remote_links_filename = "raw_jira_remote_links.pickle"
raw_github_prs_filename = "raw_github_prs.pickle"
translated_markup_filename = "translated_markdown.pickle"
jira_to_github_user_mapping_file = 'jira-to-github-user-mapping.csv'

# Ordering of release names. generate_import_payload sorts an issue's fix
# versions by their position in this tuple and uses the first one.
RELEASE_ORDER = (
    '0.1.0', '0.2.0', '0.3.0', 'JS-0.3.0', 'JS-0.3.1', '0.4.0', 'JS-0.4.0',
    '0.4.1', 'JS-0.4.1', '0.5.0', '0.6.0', '0.7.0', '0.7.1', '0.8.0',
    '0.9.0', '0.10.0', '0.11.0', '0.11.1', '0.12.0', '0.12.1', '0.13.0',
    '0.14.0', '0.14.1', '0.15.0', '0.15.1', '0.16.0', '0.17.0', '0.17.1',
    '1.0.0', '1.0.1', '2.0.0', '3.0.0', '3.0.1', '4.0.0', '4.0.1', '5.0.0',
    '5.0.1', '6.0.0', '6.0.1', '6.0.2', '6.0.3', '7.0.0', '7.0.1', '7.0.2',
    '8.0.0', '8.0.1', '9.0.0', '9.0.1', '10.0.0', '10.0.1', '10.0.2',
    '11.0.0', '12.0.0'
)

# Jira issue type name -> GitHub label added to the imported issue.
ISSUETYPE_MAP = {
    "Bug": "Type: bug",
    "Improvement": "Type: enhancement",
    "Wish": "Type: enhancement",
    "New Feature": "Type: enhancement",
    "Task": "Type: task",
    "Sub-task": "Type: task",
    "Test": "Type: test"
}

# Allow-list of labels: generate_import_payload drops every label that is not
# listed here.
# NOTE(review): presumably mirrors the labels that exist in the target
# repository - confirm before the real import.
GITHUB_LABELS = (
    "Component: Archery", "Component: Benchmarking", "Component: C",
    "Component: C#", "Component: C++", "Component: C++ - Gandiva",
    "Component: C++ - Plasma", "Component: Continuous Integration",
    "Component: Developer Tools", "Component: Documentation",
    "Component: FlightRPC", "Component: Format", "Component: GLib",
    "Component: Go", "Component: GPU", "Component: Integration",
    "Component: Java", "Component: JavaScript", "Component: Julia",
    "Component: MATLAB", "Component: Other", "Component: Packaging",
    "Component: Parquet", "Component: Python", "Component: R",
    "Component: Release", "Component: Ruby", "Component: Rust",
    "Component: Rust - Ballista", "Component: Rust - DataFusion",
    "Component: Website", "Component: Wiki", "dependencies",
    "good-first-issue", "hacktoberfest-accepted", "java", "javascript",
    "lang-go", "needs-rebase", "ready-for-review", "Type: bug",
    "Type: enhancement", "Type: task", "Type: test", "Type: usage",
    "WIP"
)

def fetch_milestone_map(url):
    """Return ``{milestone title: milestone number}`` for all milestones at `url`.

    The listing is paginated, so every page is fetched by following the
    response's "next" link; a single request only returns the first page.

    Raises:
        requests.HTTPError: if GitHub answers with an error status (for
            example because of bad credentials).
    """
    milestone_map = {}
    params = {"state": "all", "per_page": 100}
    while url:
        response = requests.get(url, params=params, headers=GITHUB_CREDENTIALS)
        response.raise_for_status()
        milestone_map.update({x["title"]: x["number"] for x in response.json()})
        url = response.links.get("next", {}).get("url")
        # The "next" link already carries the complete query string.
        params = None
    return milestone_map


milestone_url = "https://api.github.com/repos/apache/arrow/milestones"
MILESTONE_MAP = fetch_milestone_map(milestone_url)

testing_milestone_url = "https://api.github.com/repos/datatart/import_dry_run_4/milestones"
TESTING_MILESTONE_MAP = fetch_milestone_map(testing_milestone_url)

# Dry run: milestone numbers are resolved against the testing repository.
MILESTONE_MAP = TESTING_MILESTONE_MAP

# Appended to the body of every imported GitHub issue (Markdown link syntax).
# Placeholders: issue_key, jira_url.
MIGRATION_NOTE = "\n\n**Note**: *This issue was originally created as [{issue_key}]({jira_url}). " \
    "Please see the " \
    "[migration documentation](https://gist.github.com/toddfarmer/12aa88361532d21902818a6044fda4c3) " \
    "for further details.*"

# Text of the comment added to the source Jira issue (Jira "[text|url]" link
# syntax). Placeholders: gh_id, gh_url.
JIRA_MIGRATION_NOTE = "This issue has been migrated to [issue #{gh_id}|{gh_url}] on GitHub. " \
    "Please see the " \
    "[migration documentation|https://gist.github.com/toddfarmer/12aa88361532d21902818a6044fda4c3] " \
    "for further details."

# Lookup from Jira user identifiers to GitHub logins. Both column 0 and
# column 1 of every CSV row are registered as keys for the value in column 2.
# NOTE(review): presumably the columns are two Jira identifiers followed by the
# GitHub login - confirm against the CSV. No header row is skipped, and a row
# with fewer than three columns raises IndexError.
USER_MAPPING = {}
with open(jira_to_github_user_mapping_file, newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        USER_MAPPING[row[0]] = row[2]
        USER_MAPPING[row[1]] = row[2]


# Repository that receives the imported issues.
OWNER = "datatart"
REPO = "import_dry_run_4"
# Endpoint to which each issue payload is POSTed.
IMPORT_URL = f"https://api.github.com/repos/{OWNER}/{REPO}/import/issues"

GITHUB_PROJECT_URL = "https://github.com/apache/arrow/pull/"

# Jira project key used in the JQL query that downloads the issues.
PROJECT_NAME = "ARROW"

In [4]:
# This is to check for GitHub's assignable users. We currently don't need it.
#
# def get_assignable_users(users):
#     user_can_be_assignee = []
#     with requests.Session() as s:
#         for user in users:
#             url = f"https://api.github.com/repos/apache/arrow/assignees/{user}"
#             params = {"method": "GET", "url": url, "headers": GITHUB_CREDENTIALS}
#             response = request_to_github(params, s)
#             if response.status_code == 204:
#                 user_can_be_assignee.append(user)
#
# USER_CAN_BE_ASSIGNEE = get_assignable_users(USER_MAPPING.values())

# Get Jira issue data and cache it to pickle

In [None]:
%%time
# Set to True to ignore existing pickle caches and download everything again.
REFRESH_JIRA_CACHE = False


def _load_or_fetch(filename, fetch):
    """Return the object cached in `filename`, creating the cache with `fetch()` if needed.

    The cache is used when the file exists and REFRESH_JIRA_CACHE is False.
    Otherwise `fetch()` is called, its result is pickled to `filename`, and the
    file is read back so callers always get the round-tripped object.
    """
    if not REFRESH_JIRA_CACHE:
        try:
            # These pickles are written by this notebook; never load one from an untrusted source.
            with open(filename, 'rb') as handle:
                cached = pickle.load(handle)
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Loaded cached data from {filename}.")
            return cached
        except FileNotFoundError:
            pass

    with open(filename, 'wb') as handle:
        pickle.dump(fetch(), handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def _fetch_per_issue(label, getter):
    """Call `getter(issue)` for every issue in ISSUES and return the results keyed by issue id."""
    results = {}
    for i, issue in enumerate(ISSUES):
        if i % 1000 == 0:
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Getting {label} for {issue.key} [{i}/{len(ISSUES)}].")
        results[issue.id] = getter(issue)
    return results


# The connection is always created; update_source_jira needs it as well.
CONN = JIRA(**JIRA_CREDENTIALS)

ISSUES = _load_or_fetch(
    raw_jira_issues_filename,
    lambda: CONN.search_issues(f"project = {PROJECT_NAME} order by key", maxResults = False, fields = '*all'),
)

WATCHERS = _load_or_fetch(
    raw_jira_watchers_filename,
    lambda: _fetch_per_issue("watchers", lambda issue: CONN.watchers(issue.id)),
)

REMOTE_LINKS = _load_or_fetch(
    raw_jira_remote_links_filename,
    lambda: _fetch_per_issue("remote links", CONN.remote_links),
)

# Jira -> GitHub markdown translation

In [None]:
%%time

# Use half of the cores, but never fewer than one worker: cpu_count() // 2 is 0
# on a single-core machine and Pool rejects processes=0 with a ValueError.
n_workers = max(1, cpu_count() // 2)

with Pool(processes=n_workers) as pool:
    # translate_markup returns (issue key, translated markup) pairs.
    TRANSLATED_MARKUP = dict(pool.map(translate_markup, ISSUES, chunksize=100))

with open(translated_markup_filename, 'wb') as handle:
    pickle.dump(TRANSLATED_MARKUP, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read the cache back so later cells work with exactly what was stored.
with open(translated_markup_filename, 'rb') as handle:
    TRANSLATED_MARKUP = pickle.load(handle)

# Generate import payloads

In [None]:
def generate_import_payload(issue):
    """Build the GitHub issue-import payload for one Jira issue.

    The body is the translated description followed by environment, reporter,
    assignee, watchers, subtasks, related issues, attachments, the externally
    tracked issue, remote links and the migration note, in that order.

    Returns:
        dict with an "issue" entry (title, labels, body, timestamps, closed
        flag and, when available, closed_at and milestone) and a "comments"
        entry produced by get_comments.
    """
    # An unmapped Jira issue type yields None, which the allow-list filter below drops.
    issue_type = ISSUETYPE_MAP.get(issue.fields.issuetype.name)
    labels = [f"Component: {c.name}" for c in issue.fields.components] + issue.fields.labels + [issue_type]
    # Filter out nonexisting labels
    labels = [label for label in labels if label in GITHUB_LABELS]

    def release_rank(version_name):
        # Versions missing from RELEASE_ORDER sort after all known ones instead of raising.
        try:
            return RELEASE_ORDER.index(version_name)
        except ValueError:
            return len(RELEASE_ORDER)

    # Get the earliest fix version and map it to a milestone
    fix_versions = sorted((x.name for x in issue.fields.fixVersions), key=release_rank)
    fix_version = fix_versions[0] if fix_versions else None
    milestone = MILESTONE_MAP.get(fix_version, None)

    jira_url = issue.permalink()

    # Get watchers
    watchers = [get_user_string(watcher, jira_url) for watcher in WATCHERS[issue.id].watchers]
    watchers = ", ".join(watchers)

    remote_links = [remote_link.object for remote_link in REMOTE_LINKS[issue.id]]

    body = TRANSLATED_MARKUP[issue.key]["description"] + "\n"

    if issue.fields.environment:
        body += "\n**Environment**: " + issue.fields.environment
    if issue.fields.reporter:
        body += "\n**Reporter**: " + get_user_string(issue.fields.reporter, jira_url)
    if issue.fields.assignee:
        body += "\n**Assignee**: " + get_user_string(issue.fields.assignee, jira_url)
    if watchers:
        body += f"\n**Watchers**: {watchers}"

    if issue.fields.subtasks:
        body += "\n#### Subtasks:"
        for subtask in issue.fields.subtasks:
            body += f"\n- [{'X' if is_completed(subtask) else ' '}] " \
                f"[{subtask.fields.summary}]({subtask.permalink()})"

    linked_issues = [extract_linked_issues(linked_issue) for linked_issue in issue.fields.issuelinks]

    if linked_issues:
        body += "\n#### Related issues:"
        for li in linked_issues:
            body += \
                f"\n- [{li['summary']}]({li['url']}) ({li['relationship']})"

    if issue.fields.attachment:
        body += "\n#### Original Issue Attachments:"

        for attachment in issue.fields.attachment:
            body += f"\n- [{attachment.filename}]({attachment.content})"

    if issue.fields.customfield_12311020:
        body += "\n#### Externally tracked issue: " \
            f"[{issue.fields.customfield_12311020}]({issue.fields.customfield_12311020})"

    if remote_links:
        body += "\n#### PRs and other links:"
        for pr in remote_links:
            body += f"\n- [{pr.title}]({pr.url})"

    body += MIGRATION_NOTE.format(issue_key=issue.key, jira_url=jira_url)

    data = {
        "issue": {
            "title": f"{issue.fields.summary}",
            "labels": labels,
            "body": body,
            # Timestamps: the last five characters are replaced by "Z".
            "created_at": issue.fields.created[:-5] + "Z",
            "updated_at": issue.fields.updated[:-5] + "Z",
            "closed": is_completed(issue),
        },
      "comments": get_comments(issue)
    }

    if issue.fields.resolutiondate:
        data["issue"]["closed_at"] = issue.fields.resolutiondate[:-5] + "Z"
    if milestone:
        data["issue"]["milestone"] = milestone
#     if issue.fields.assignee:
#         data["issue"]["assignee"] = USER_MAPPING.get(issue.fields.assignee.name, None)

    return data

In [None]:
%%time
# (issue key, import payload) pairs for every downloaded issue.
all_payloads = [(issue.key, generate_import_payload(issue)) for issue in ISSUES]
# NOTE(review): re-running this cell empties import_responses, so the import
# cell would no longer skip issues that were already submitted.
import_responses = {}
# The payloads actually sent; point this at a subset for a partial run.
payloads = all_payloads

In [None]:
# Preview the generated Markdown body of the first payload.
print(all_payloads[0][1]["issue"]["body"])

# Import issues into GitHub

In [None]:
%%time
# Submit every payload to the import endpoint and keep the raw response per
# issue key; the "status" entry is filled in by the status-check cell.
with requests.Session() as s:
    for i, (key, payload) in enumerate(payloads):
        # Already submitted by an earlier run of this cell - do not post it twice.
        if key in import_responses:
            continue
        if (i % 100 == 0):
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] importing ", i, "/", len(payloads))

        params = {"method": "POST", "url": IMPORT_URL, "json": payload, "headers": GITHUB_CREDENTIALS}
        response = request_to_github(params, s)
        import_responses[key] = {"import_response": response, "status": ""}

In [None]:
%%time

# Check import statuses to get github issue links.
# The cell can be re-run until everything is imported: issues already marked
# "imported" are not queried again.
with requests.Session() as s:
    for i, key in enumerate(import_responses.keys()):
        if (i % 100 == 0):
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] checking import status ", i, "/", len(import_responses))

        if import_responses[key]["status"] != "imported":
            status_url = import_responses[key]["import_response"].json()["url"]

            params = {"method": "GET", "url": status_url, "headers": GITHUB_CREDENTIALS}
            response = request_to_github(params, s)
            import_status = response.json()
            import_responses[key]["status"] = import_status["status"]

            if import_status["status"] == "imported":
                import_responses[key]["issue_url"] = \
                    import_status["issue_url"].replace("https://api.github.com/repos/", "https://github.com/")
            elif import_status["status"] == "failed":
                # Surface failures instead of leaving them hidden in import_responses.
                print(f"Import of {key} failed: {import_status.get('errors')}")

not_imported = [key for key, value in import_responses.items() if value["status"] != "imported"]
print(f"{len(import_responses) - len(not_imported)} imported, {len(not_imported)} not imported yet.")

In [None]:
# from IPython.display import display, HTML

# for url in [x['issue_url'] for x in import_responses.values() if 'issue_url' in x]:
#     display(HTML("""<a href="{}">{}</a>""".format(url, url)))

# Update Jira issues to link to new GitHub Issues

In [None]:
# TODO: this is untested

def update_source_jira(issue, gh_url):
    """Point a Jira issue at its migrated GitHub issue.

    Adds the migration note as a comment and, when the issue's
    customfield_12311020 is empty, stores the GitHub URL there.
    """
    # The GitHub issue number is the last path segment of the URL.
    *_, gh_id = gh_url.split("/")
    CONN.add_comment(issue, JIRA_MIGRATION_NOTE.format(gh_id=gh_id, gh_url=gh_url))

    already_tracked = issue.fields.customfield_12311020
    if already_tracked:
        return
    issue.update(fields={"customfield_12311020" : gh_url})

# if all([x["status"] == "imported" for x in import_responses.values()]):
#     # All issues were imported we can post links to Jira
    
#     for issue in ISSUES:
#         update_source_jira(issue, import_responses[issue.key]["issue_url"])

# Update cross issue linking to link GitHub instead of Jira issues

In [None]:
def fix_issue_bodies(issues, payloads, import_responses):
    """Rewrite Jira links to related issues and subtasks into GitHub links.

    Args:
        issues: Jira issues whose bodies should be fixed.
        payloads: iterable of ``(issue key, import payload)`` pairs.
        import_responses: dict keyed by issue key; entries with an
            "issue_url" identify issues that already exist on GitHub.

    Returns:
        dict mapping issue key to the corrected body, containing only the
        issues whose body actually changed. Links to issues without a known
        GitHub URL are left pointing at Jira.
    """
    issue_bodies = {key: payload["issue"]["body"] for key, payload in payloads}
    github_urls = {k: v["issue_url"] for k, v in import_responses.items() if "issue_url" in v}

    new_issue_bodies = {}
    for issue in issues:
        related = []
        for linked_issue in issue.fields.issuelinks or []:
            attr = "inwardIssue" if hasattr(linked_issue, "inwardIssue") else "outwardIssue"
            related.append(getattr(linked_issue, attr))
        related.extend(issue.fields.subtasks or [])

        original_body = issue_bodies.get(issue.key)
        if not related or original_body is None:
            continue

        body = original_body
        for other in related:
            github_url = github_urls.get(other.key)
            if github_url is None:
                # Not imported (yet), or not part of this import at all.
                continue
            # Match the whole "(url)" link target so that ".../ARROW-1" cannot
            # rewrite the beginning of ".../ARROW-10". str.replace returns a new string.
            body = body.replace(f"({other.permalink()})", f"({github_url})")

        if body != original_body:
            new_issue_bodies[issue.key] = body

    return new_issue_bodies

def update_gh_issue_links(issue_bodies, import_responses):
    """Replace the body of already imported GitHub issues.

    Args:
        issue_bodies: dict mapping Jira issue key to the new issue body.
        import_responses: dict keyed by Jira issue key whose entries hold the
            "issue_url" (https://github.com/...) of the imported issue.
    """
    with requests.Session() as s:
        for key, body in issue_bodies.items():
            # Turn the browser URL back into the REST API URL of the issue.
            url = import_responses[key]["issue_url"].replace(
                "https://github.com/", "https://api.github.com/repos/")

            # Updating an existing issue is a PATCH on the issue resource.
            params = {"method": "PATCH", "url": url, "json": {"body": body}, "headers": GITHUB_CREDENTIALS}
            request_to_github(params, s)


# Restrict the link fix-up to issues whose import has completed.
# NOTE(review): raises KeyError for any issue key missing from import_responses.
tmp_issues = [i for i in ISSUES if import_responses[i.key]["status"] == "imported"]
# new_issue_bodies = fix_issue_bodies(ISSUES, payloads, import_responses)
new_issue_bodies = fix_issue_bodies(tmp_issues, payloads, import_responses)


# TODO: this is untested
# update_gh_issue_links(new_issue_bodies, import_responses)