In [None]:
# Keyword arguments unpacked into `JIRA(**JIRA_CREDENTIALS)` when the Jira connection is created.
# "token_auth" is blank here; fill it in locally and never commit a real token.
JIRA_CREDENTIALS = {
    "token_auth": "",
    "server": "https://issues.apache.org/jira/",
    "async_": True
}

# HTTP headers sent with every GitHub request made by this notebook (REST and GraphQL).
# "Authorization" is blank here; fill it in locally and never commit a real token.
# NOTE(review): the "Accept" value looks like the preview media type of the issue import API and
# "X-Github-Next-Global-ID" presumably opts in to the newer node ID format - confirm against GitHub docs.
GITHUB_CREDENTIALS = {
    "X-Github-Next-Global-ID": "1",
    "Authorization": "",
    "User-Agent": "asfimport",
    "Accept": "application/vnd.github.golden-comet-preview+json"}

# Process

1. Download issues, comments, watchers and external links (~20min)
2. Lock Jira issues (manual)
3. Set all milestones to open (manual, not 100% sure it's needed)
4. Create full GitHub issues (links to related issues and subtasks still point to Jira at this stage) (~4hrs 20min)
5. Restore pre-import milestone state (manual)
6. Collect Jira Issue id to GitHub issue url map (~30min)
7. Update GitHub issues with corrected issue / subtask links (20min?)
8. Post GitHub issue links to Jira issue comments (30min?)
9. Create self subscribe dataset (manual)

In [None]:
import csv, pickle, re, time
from datetime import datetime
from string import punctuation

from dateutil.rrule import rrule, MONTHLY
import jira2markdown
from jira2markdown.markup.links import Mention
from jira2markdown.markup.base import AbstractMarkup
from jira import JIRA
from multiprocessing import Pool, cpu_count
from pyparsing import (
    CaselessLiteral,
    Char,
    Combine,
    FollowedBy,
    Optional,
    ParserElement,
    ParseResults,
    PrecededBy,
    SkipTo,
    StringEnd,
    StringStart,
    Suppress,
    White,
    Word,
    alphanums,
)
import requests


def is_completed(item):
    """Return True when the Jira item's status name is "Closed" or "Resolved"."""
    status_name = item.fields.status.name
    return status_name in ("Closed", "Resolved")


def extract_linked_issues(linked_issue):
    """Summarise the issue on the other end of a Jira issue link.

    The outward issue is used when the link has an ``outwardIssue`` attribute,
    otherwise the inward issue; the relationship text is taken from the
    matching side of ``linked_issue.type``.

    Returns:
        dict with the keys "key", "relationship", "summary", "url" and "completed".
    """
    if hasattr(linked_issue, "outwardIssue"):
        other = linked_issue.outwardIssue
        relationship = linked_issue.type.outward
    else:
        other = linked_issue.inwardIssue
        relationship = linked_issue.type.inward

    return {
        "key": other.key,
        "relationship": relationship,
        "summary": other.fields.summary,
        "url": other.permalink(),
        "completed": is_completed(other),
    }


def get_user_string(jira_author, jira_url):
    """Render a Jira user as a markdown link, followed by " / @<github id>" when mapped.

    Args:
        jira_author: object with ``name`` and ``displayName`` attributes.
        jira_url: target of the markdown link.

    Returns:
        "[displayName](jira_url)", with the GitHub mention appended only when
        ``jira_author.name`` is a key of USER_MAPPING.
    """
    mention = f" / @{USER_MAPPING[jira_author.name]}" if jira_author.name in USER_MAPPING else ""
    return f"[{jira_author.displayName}]({jira_url})" + mention


def get_comments(issue):
    """Build the list of comment payloads for one Jira issue.

    Each entry has a "body" (author string, a colon, a newline, then the
    already translated comment text from TRANSLATED_MARKUP) and a "created_at"
    timestamp. Comments authored by "githubbot" are left out.
    """
    def as_payload(comment):
        # Link the author to the individual comment on the Jira issue page.
        comment_url = f"{issue.permalink()}?focusedCommentId={comment.id}"
        author = get_user_string(comment.author, comment_url)
        translated = TRANSLATED_MARKUP[issue.key]["comments"]
        return {
            "body": f"{author}:\n{translated[comment.id]}",
            # Drops the last five characters of the Jira timestamp and appends "Z".
            "created_at": comment.created[:-5] + "Z",
        }

    # Skip ASF GitHub Bot comments per https://github.com/apache/arrow/issues/14648
    return [
        as_payload(comment)
        for comment in issue.fields.comment.comments
        if comment.author.name != "githubbot"
    ]


def request_to_github(params, session):
    """Send a request through ``session``, waiting and retrying while GitHub throttles it.

    Args:
        params: keyword arguments forwarded to ``session.request``; must contain
            "method" and "url".
        session: object exposing ``request(**params)``, e.g. a ``requests.Session``.

    Returns:
        The response for a 200/202/204 reply, or for any other status that
        ``raise_for_status()`` does not treat as an error.

    Raises:
        Whatever ``raise_for_status()`` raises for an error status other than 403.
    """
    # Description of the request used in log messages; it does not change between retries.
    request_data = ", ".join((params["method"], params["url"], params.get("body", "")))

    while True:
        r = session.request(**params)

        if r.status_code in (200, 202, 204):
            # all is good
            return r

        if r.status_code == 403:
            # throttling: wait until the advertised reset time, then retry
            print("Response was: ", r.json())
            reset_time = int(r.headers["X-RateLimit-Reset"])
            wait_time = reset_time - round(time.time() + .5)
            if wait_time > 0:
                print(f"Throttled on {request_data}, call:  {r.text}\nSleeping for {wait_time // 60} minutes.")
                time.sleep(wait_time)
            else:
                time.sleep(1)
            continue

        # something is wrong
        print(f"Request {request_data} returned status code {r.status_code} and {r.text}")
        # The original referenced an undefined name (`response`) here and raised NameError.
        r.raise_for_status()
        # Not an error status after all (e.g. 201): hand it back instead of
        # looping and silently re-sending the same request.
        return r


def run_query(query):
    """POST a GraphQL query to the GitHub API and return the decoded JSON reply.

    The query is sent via the ``json=`` argument of ``requests.post`` together
    with the GITHUB_CREDENTIALS headers.

    Raises:
        Exception: when the reply status is anything other than 200.
    """
    response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=GITHUB_CREDENTIALS)
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))
    return response.json()


def get_issues(owner, repo, interval, created_by="asfimport"):
    """Fetch all issues of a repository created by one author within a date interval.

    Pages through the GitHub GraphQL search API (100 results per page) using
    ``run_query`` until ``hasNextPage`` is false.

    Args:
        owner: repository owner used in the ``repo:`` search qualifier.
        repo: repository name used in the ``repo:`` search qualifier.
        interval: value for the ``created:`` search qualifier.
        created_by: value for the ``author:`` search qualifier.

    Returns:
        List of issue nodes (dicts with id, bodyText, url, databaseId and milestone).
    """
    get_issue_texts = """
    query get_issue_texts {{
      search(query: "repo:{owner}/{repo} author:{created_by} is:issue created:{interval}",
                     type: ISSUE, first: 100{cursor}) {{
        edges {{
          node {{
            ... on Issue {{
              id
              bodyText
              url
              databaseId
              milestone {{
                id
                number
                title
              }}
            }}
          }}
        }}
        pageInfo {{
          endCursor
          hasNextPage
        }}
        issueCount
      }}
    }}
    """

    cursor = ""
    has_next_page = True
    responses = []

    while has_next_page:
        # `created_by` must be supplied: the template contains a {created_by}
        # placeholder, so omitting it makes str.format raise KeyError.
        q = get_issue_texts.format(owner=owner, repo=repo, created_by=created_by,
                                   cursor=cursor, interval=interval)
        response = run_query(q)
        responses.append(response)

        search = response["data"]["search"]
        if int(search["issueCount"]) > 1000:
            print(f"Query for {interval} is not granular enough and will not capture all isssues!")

        has_next_page = search["pageInfo"]["hasNextPage"]
        # The next page is requested by appending an `after:` argument to the search call.
        cursor = f', after: "{search["pageInfo"]["endCursor"]}"'

    return [e["node"] for x in responses for e in x["data"]["search"]["edges"]]

            
def get_assignable_users(users):
    """Return the subset of ``users`` that GitHub reports as assignable in apache/arrow.

    One GET request per user is sent to the repository's ``assignees/<user>``
    endpoint; a user is kept when the reply status is 204.

    Args:
        users: iterable of GitHub user names.

    Returns:
        List of the user names for which the check returned 204, in input order.
    """
    user_can_be_assignee = []
    # The session was previously bound as `s` but used as `session`, which raised NameError.
    with requests.Session() as session:
        for user in users:
            url = f"https://api.github.com/repos/apache/arrow/assignees/{user}"
            params = {"method": "GET", "url": url, "headers": GITHUB_CREDENTIALS}
            response = session.request(**params)
            if response.status_code == 204:
                user_can_be_assignee.append(user)

    return user_can_be_assignee


class MigratedMention(AbstractMarkup):
    """Markup element that rewrites Jira user mentions (``[~name]``) for GitHub.

    Registered below as a replacement for jira2markdown's ``Mention`` element.
    """

    def action(self, tokens: ParseResults) -> str:
        """Return ``@<username>`` for a mapped user, else the mention wrapped in backticks.

        NOTE(review): ``self.usernames`` is presumably supplied by the base class from the
        ``usernames=`` argument of ``jira2markdown.convert`` - confirm.
        """
        username = self.usernames.get(tokens.accountid)
        return f"`[~{tokens.accountid}]`" if username is None else f"@{username}"

    @property
    def expr(self) -> ParserElement:
        """Build the pyparsing expression that recognises a mention."""
        # "[", an optional "<text>|" prefix (the "|" is suppressed), "~", an optional
        # case-insensitive "accountid:" prefix, the id itself (alphanumerics, ":" and "-",
        # exposed as `accountid`), and the closing "]".
        MENTION = Combine(
            "["
            + Optional(
                SkipTo("|", failOn="]") + Suppress("|"),
                default="",
                )
            + "~"
            + Optional(CaselessLiteral("accountid:"))
            + Word(alphanums + ":-").setResultsName("accountid")
            + "]",
            )
        return (
                # Start of string, or an optional check for preceding whitespace; when that
                # check does not match, the Optional contributes its default " " token.
                (StringStart() | Optional(PrecededBy(White(), retreat=1), default=" "))
                + MENTION.setParseAction(self.action)
                # End of string, or an optional lookahead for whitespace, punctuation other
                # than "[", or another mention; again " " is the default when it is absent.
                + (StringEnd() | Optional(FollowedBy(White() | Char(punctuation, excludeChars="[") | MENTION), default=" "))
        )


# Matches a newline, exactly one whitespace character, then a run of "#" followed by whitespace
# and text. translate_markup substitutes "\n\1", i.e. it drops that single whitespace character.
LEADING_SPACE_HASH_PATTERN = re.compile(r"\n\s(#+\s+\S.*)")
# Markup elements handed to jira2markdown.convert, with the stock Mention element
# swapped for MigratedMention.
ELEMENTS = jira2markdown.elements.MarkupElements()
ELEMENTS.replace(Mention, MigratedMention)


def translate_markup(issue):
    """Translate an issue's description and comments from Jira markup to markdown.

    Returns:
        ``(issue.key, {"description": <text>, "comments": {comment id: <text>}})``.
        Comments authored by "githubbot" are not included.
    """
    def with_attachment_urls(markdown):
        # Image references that use a bare attachment filename are pointed at the attachment content URL.
        for attachment in issue.fields.attachment:
            markdown = markdown.replace(f"![{attachment.filename}]({attachment.filename})",
                                        f"![{attachment.filename}]({attachment.content})")
        return markdown

    def to_markdown(jira_text):
        # Drop the single whitespace character in front of "#" runs before converting.
        cleaned = LEADING_SPACE_HASH_PATTERN.sub(r"\n\1", jira_text)
        converted = jira2markdown.convert(cleaned, elements=ELEMENTS, usernames=USER_MAPPING)
        return with_attachment_urls(converted)

    # A missing description is treated as an empty string.
    text = to_markdown(issue.fields.description or "")

    # Skip ASF GitHub Bot comments per https://github.com/apache/arrow/issues/14648
    comments = {
        comment.id: to_markdown(comment.body)
        for comment in issue.fields.comment.comments
        if comment.author.name != "githubbot"
    }

    return (issue.key, {"description": text, "comments": comments})

In [None]:
# Local files used by this notebook: pickle caches of the Jira downloads and of the
# translated markdown, plus the CSV inputs/outputs.
raw_jira_issues_filename = 'raw_jira_issues.pickle'
raw_jira_watchers_filename = 'raw_jira_watchers.pickle'
raw_jira_remote_links_filename = "raw_jira_remote_links.pickle"
translated_markup_filename = "translated_markdown.pickle"
issue_subscriptions_file = "issue_subscriptions.csv"
# CSV read when USER_MAPPING is built. The name was used there without ever being defined,
# which raised NameError on a fresh kernel.
# NOTE(review): this path is a placeholder guess - point it at the real mapping file.
jira_to_github_user_mapping_file = "jira-to-github-user-mapping.csv"

# Release names in the order used to pick an issue's earliest fix version:
# generate_import_payload sorts fix versions by their position in this tuple.
# The order is not purely numeric (e.g. '4.0.0' comes before '3.0.1').
RELEASE_ORDER = (
    '0.1.0', '0.2.0', '0.3.0', 'JS-0.3.0', 'JS-0.3.1', '0.4.0', 'JS-0.4.0',
    '0.4.1', 'JS-0.4.1', '0.5.0', '0.6.0', '0.7.0', '0.7.1', '0.8.0',
    '0.9.0', '0.10.0', '0.11.0', '0.11.1', '0.12.0', '0.12.1', '0.13.0',
    '0.14.0', '0.14.1', '0.15.0', '0.15.1', '0.16.0', '0.17.0', '0.17.1',
    '1.0.0', '1.0.1', '2.0.0', '3.0.0', '4.0.0', '3.0.1', '4.0.1', '5.0.0',
    '6.0.0', '5.0.1', '6.0.1', '6.0.2', '6.0.3', '7.0.0', '7.0.1', '7.0.2',
    '8.0.0', '8.0.1', '9.0.0', '9.0.1', '10.0.0', '10.0.1', '10.0.2',
    '11.0.0', '12.0.0'
)

# Jira issue type name -> GitHub label. generate_import_payload indexes this dict directly,
# so an issue type that is not listed here raises KeyError.
ISSUETYPE_MAP = {
    "Bug": "Type: bug",
    "Improvement": "Type: enhancement",
    "Wish": "Type: enhancement",
    "New Feature": "Type: enhancement",
    "Task": "Type: task",
    "Sub-task": "Type: task",
    "Test": "Type: test"
}

# Allow-list of labels: generate_import_payload drops every candidate label
# (component, Jira label, type, priority) that is not in this tuple.
GITHUB_LABELS = (
    "Component: Archery", "Component: Benchmarking", "Component: C",
    "Component: C#", "Component: C++", "Component: C++ - Gandiva",
    "Component: C++ - Plasma", "Component: Continuous Integration",
    "Component: Developer Tools", "Component: Documentation",
    "Component: FlightRPC", "Component: Format", "Component: GLib",
    "Component: Go", "Component: GPU", "Component: Integration",
    "Component: Java", "Component: JavaScript", "Component: Julia",
    "Component: MATLAB", "Component: Other", "Component: Packaging",
    "Component: Parquet", "Component: Python", "Component: R",
    "Component: Release", "Component: Ruby", "Component: Rust",
    "Component: Rust - Ballista", "Component: Rust - DataFusion",
    "Component: Website", "Component: Wiki", "dependencies",
    "good-first-issue", "hacktoberfest-accepted", "java", "javascript",
    "lang-go", "needs-rebase", "ready-for-review", "Type: bug",
    "Type: enhancement", "Type: task", "Type: test", "Type: usage",
    "WIP",
    "good-second-issue", "Priority: Critical", "Priority: Blocker"
)

# Target repository. For a dry run, switch OWNER / REPO to a test repository:
# OWNER = "test_user"
# REPO = "test_repo"
OWNER = "apache"
REPO = "arrow"
IMPORT_URL = f"https://api.github.com/repos/{OWNER}/{REPO}/import/issues"
# "{{}}" leaves a literal "{}" placeholder for the issue number.
ISSUE_URL_TEMPLATE = f"https://github.com/{OWNER}/{REPO}/issues/{{}}"
GITHUB_PROJECT_URL = "https://github.com/apache/arrow/pull/"
JIRA_PROJECT_NAME = "ARROW"

# Milestone title -> milestone number for the target repository.
# The URL follows OWNER / REPO so that a test-repository run resolves milestone numbers
# in that repository rather than in apache/arrow; this replaces the separate
# (commented-out) testing milestone map, which referenced undefined names.
# NOTE(review): only the first page (up to 100 milestones) is read.
milestone_url = f"https://api.github.com/repos/{OWNER}/{REPO}/milestones"
raw_milestone_map = requests.get(milestone_url, params={"state": "all", "per_page": 100},
                                 headers=GITHUB_CREDENTIALS)
# Fail loudly on an error reply (e.g. bad token) instead of building a map from an error payload.
raw_milestone_map.raise_for_status()
MILESTONE_MAP = {x["title"]: x["number"] for x in raw_milestone_map.json()}

# Appended to the body of every imported GitHub issue (markdown syntax).
MIGRATION_NOTE = "\n\n<sub>**Note**: *This issue was originally created as [{issue_key}]({jira_url}). " \
    "Please see the " \
    "[migration documentation](https://github.com/apache/arrow/issues/14542) " \
    "for further details.*</sub>"

# Posted as a comment on every source Jira issue (Jira link syntax).
JIRA_MIGRATION_NOTE = "This issue has been migrated to [issue #{gh_id}|{gh_url}] on GitHub. " \
    "Please see the " \
    "[migration documentation|https://github.com/apache/arrow/issues/14542] " \
    "for further details."

# Lookup from Jira user identifier to GitHub user name, read from a CSV file.
# For every row, both column 0 and column 1 are registered as keys for the value in column 2.
# The notebook looks users up by `.name` (get_user_string) and by `.key` (subscription dataset),
# so the two key columns presumably hold those two Jira identifiers - confirm against the CSV.
USER_MAPPING = {}
with open(jira_to_github_user_mapping_file, newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        USER_MAPPING[row[0]] = row[2]
        USER_MAPPING[row[1]] = row[2]

# Find assignable users on GitHub

In [None]:
# The live lookup is kept for reference; it issues one GitHub request per mapped user.
# USER_CAN_BE_ASSIGNEE = get_assignable_users(USER_MAPPING.values())

# Hard-coded list of GitHub user names used instead of the live lookup above.
# generate_import_payload only sets an assignee when the mapped user appears in this tuple.
USER_CAN_BE_ASSIGNEE = ('AlenkaF', 'BryanCutler', 'Dandandan', 'Jimexist', 'TheNeuralBit', 'alamb', 'amol-',
    'andygrove', 'assignUser', 'bkietz', 'cpcloud', 'cyb70289', 'domoritz', 'eerhardt', 'emkornfield',
    'fsaintjacques', 'houqp', 'ianmcook', 'icexelloss', 'jacques-n', 'jonkeane', 'jorgecarleitao',
    'jorisvandenbossche', 'julienledem', 'kiszk', 'kou', 'kszucs', 'lidavidm', 'liukun4515', 'liyafan82',
    'majetideepak', 'milesgranger', 'mrkn', 'nealrichardson', 'nevi-me', 'paddyhoran', 'paleolimbot',
    'pcmoritz', 'pitrou', 'praveenbingo', 'pravindra', 'ptgoetz', 'quinnj', 'raulcd', 'robertnishihara',
    'rok', 'romainfrancois', 'sbinet', 'shiro615', 'siddharthteotia', 'sunchao', 'thisisnic', 'tianchen92',
    'trxcllnt', 'tustvold', 'wesm', 'westonpace', 'wjones127', 'xhochy',  'zeroshade')

# Lock Jira comments

# Get Jira issue data and cache it to pickle

In [None]:
%%time
# Connect to Jira with the credentials defined at the top of the notebook.
CONN = JIRA(**JIRA_CREDENTIALS)


# Download every issue of the project with all fields and cache the result to a pickle file.
# NOTE(review): maxResults=False presumably disables the result limit - confirm against the jira library docs.
ISSUES = CONN.search_issues(f"project = {JIRA_PROJECT_NAME} order by key", maxResults = False, fields = '*all')
with open(raw_jira_issues_filename, 'wb') as handle:
    pickle.dump(ISSUES, handle, protocol=pickle.HIGHEST_PROTOCOL)

    
# Watchers are fetched with one request per issue and stored keyed by issue id;
# progress is printed every 1000 issues.
WATCHERS = {}
for i, issue in enumerate(ISSUES):
    if i % 1000 == 0:
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Getting watchers for {issue.key} [{i}/{len(ISSUES)}].")
    WATCHERS[issue.id] = CONN.watchers(issue.id)
with open(raw_jira_watchers_filename, 'wb') as handle:
    pickle.dump(WATCHERS, handle, protocol=pickle.HIGHEST_PROTOCOL)

    
# Remote links are fetched the same way, again keyed by issue id.
REMOTE_LINKS = {}
for i, issue in enumerate(ISSUES):
    if i % 1000 == 0:
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Getting remote links for {issue.key} [{i}/{len(ISSUES)}].")
    REMOTE_LINKS[issue.id] = CONN.remote_links(issue)
with open(raw_jira_remote_links_filename, 'wb') as handle:
    pickle.dump(REMOTE_LINKS, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Reload all three datasets from the pickle caches written above.
# NOTE(review): presumably kept so the download part can be skipped on a re-run - confirm.
# pickle.load executes arbitrary code from the file; only load caches produced by this notebook.
with open(raw_jira_issues_filename, 'rb') as handle:
    ISSUES = pickle.load(handle)

with open(raw_jira_watchers_filename, 'rb') as handle:
    WATCHERS = pickle.load(handle)

with open(raw_jira_remote_links_filename, 'rb') as handle:
    REMOTE_LINKS = pickle.load(handle)

# Jira -> GitHub markdown translation

In [None]:
%%time

with Pool(processes=int(cpu_count() / 2)) as pool:
    TRANSLATED_MARKUP = pool.map_async(translate_markup, ISSUES, chunksize=100).get()
TRANSLATED_MARKUP = {k: v for k, v in TRANSLATED_MARKUP}

with open(translated_markup_filename, 'wb') as handle:
    pickle.dump(TRANSLATED_MARKUP, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(translated_markup_filename, 'rb') as handle:
    TRANSLATED_MARKUP = pickle.load(handle)

# Generate GitHub import payloads

In [None]:
def generate_import_payload(issue):
    """Build the GitHub issue-import payload for one Jira issue.

    The payload body is the translated description followed by optional
    sections (environment, reporter, assignee, watchers, subtasks, related
    issues, attachments, externally tracked issue, remote links) and the
    migration note. Labels are limited to those listed in GITHUB_LABELS.

    Args:
        issue: Jira issue as downloaded into ISSUES.

    Returns:
        dict with an "issue" entry (title, labels, body, timestamps, closed flag and,
        when available, closed_at / milestone / assignee) and a "comments" list.

    Raises:
        KeyError: if the issue type is not a key of ISSUETYPE_MAP.
    """
    issue_type = ISSUETYPE_MAP[issue.fields.issuetype.name]
    labels = [f"Component: {c.name}" for c in issue.fields.components] + issue.fields.labels + \
        [issue_type, f"Priority: {issue.fields.priority.name}"]
    labels = [label for label in labels if label in GITHUB_LABELS]

    def release_rank(version):
        # Versions missing from RELEASE_ORDER sort last instead of raising ValueError.
        try:
            return RELEASE_ORDER.index(version)
        except ValueError:
            return len(RELEASE_ORDER)

    # Get the earliest fix version and map it to a milestone
    fix_versions = sorted((x.name for x in issue.fields.fixVersions), key=release_rank)
    fix_version = fix_versions[0] if fix_versions else None
    milestone = MILESTONE_MAP.get(fix_version, None)

    jira_url = issue.permalink()

    # Get watchers (each one is linked to the issue URL); the "arrowjira" account is left out
    watchers = [get_user_string(watcher, jira_url)
                for watcher in WATCHERS[issue.id].watchers if watcher.name != "arrowjira"]
    watchers = ", ".join(watchers)

    remote_links = [remote_link.object for remote_link in REMOTE_LINKS[issue.id]]

    body = TRANSLATED_MARKUP[issue.key]["description"] + "\n"

    if issue.fields.environment:
        body += "\n**Environment**: " + issue.fields.environment
    if issue.fields.reporter:
        body += "\n**Reporter**: " + get_user_string(issue.fields.reporter, jira_url)
    if issue.fields.assignee:
        body += "\n**Assignee**: " + get_user_string(issue.fields.assignee, jira_url)
    if watchers:
        body += f"\n**Watchers**: {watchers}"

    if issue.fields.subtasks:
        # Rendered as a task list; completed subtasks are ticked.
        body += "\n#### Subtasks:"
        for subtask in issue.fields.subtasks:
            body += f"\n- [{'X' if is_completed(subtask) else ' '}] " \
                f"[{subtask.fields.summary}]({subtask.permalink()})"

    linked_issues = [extract_linked_issues(linked_issue) for linked_issue in issue.fields.issuelinks]

    if linked_issues:
        body += "\n#### Related issues:"
        for li in linked_issues:
            body += \
                f"\n- [{li['summary']}]({li['url']}) ({li['relationship']})"

    if issue.fields.attachment:
        body += "\n#### Original Issue Attachments:"

        for attachment in issue.fields.attachment:
            body += f"\n- [{attachment.filename}]({attachment.content})"

    if issue.fields.customfield_12311020:
        body += "\n#### Externally tracked issue: " \
            f"[{issue.fields.customfield_12311020}]({issue.fields.customfield_12311020})"

    if remote_links:
        body += "\n#### PRs and other links:"
        for pr in remote_links:
            body += f"\n- [{pr.title}]({pr.url})"

    body += MIGRATION_NOTE.format(issue_key=issue.key, jira_url=jira_url)

    # Timestamps: the last five characters of the Jira value are dropped and "Z" is appended.
    # NOTE(review): presumably this strips a "+0000" offset - confirm Jira always reports UTC.
    data = {
        "issue": {
            "title": f"{issue.fields.summary}",
            "labels": labels,
            "body": body,
            "created_at": issue.fields.created[:-5] + "Z",
            "updated_at": issue.fields.updated[:-5] + "Z",
            "closed": is_completed(issue),
        },
        "comments": get_comments(issue)
    }

    if issue.fields.resolutiondate:
        data["issue"]["closed_at"] = issue.fields.resolutiondate[:-5] + "Z"
    if milestone:
        data["issue"]["milestone"] = milestone
    if issue.fields.assignee and issue.fields.assignee.name in USER_MAPPING:
        # Only users GitHub allows as assignees are set on the imported issue.
        assignee = USER_MAPPING[issue.fields.assignee.name]
        if assignee in USER_CAN_BE_ASSIGNEE:
            data["issue"]["assignee"] = assignee

    return data

In [None]:
%%time
# One (Jira issue key, import payload) pair per downloaded issue.
payloads = [(issue.key, generate_import_payload(issue)) for issue in ISSUES]
# Filled by the import cell, keyed by Jira issue key. Re-running this cell empties it,
# after which the import cell would no longer skip already imported issues.
import_responses = {}

In [None]:
# Preview the generated body of the first payload as a manual sanity check.
print(payloads[0][1]["issue"]["body"])

# Import issues into GitHub

In [None]:
%%time
# A single HTTP session is reused for all import requests.
with requests.Session() as s:
    for i, (key, payload) in enumerate(payloads):
        # Keys already present in import_responses are skipped
        # (presumably so the cell can be re-run after an interruption - confirm).
        if key in import_responses:
            continue
        # Progress line every 100 payloads.
        if (i % 100 == 0):
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] importing ", i, "/", len(payloads))

        params = {"method": "POST", "url": IMPORT_URL, "json": payload, "headers": GITHUB_CREDENTIALS}
        response = request_to_github(params, s)
        # "status" is initialised to an empty string and not updated in this cell.
        import_responses[key] = {"import_response": response, "status": ""}

# Get created issues to map them to Jira tickets

In [None]:
%%time

# Month starts from 2016-01 up to 2023-02; consecutive pairs form the
# "<start>..<end>" values used for the search's `created:` qualifier.
dates = rrule(MONTHLY, interval=1, dtstart=datetime.strptime("2016", "%Y"),
              until=datetime.strptime("2023-02", "%Y-%m"))
fmt = "%Y-%m-%d"
intervals = [f"{x.strftime(fmt)}..{y.strftime(fmt)}" for x, y in zip(dates, dates[1:])]

# Query month by month; get_issues warns when a single interval reports more than 1000 issues.
results = []
for interval in intervals:
    results += get_issues(owner=OWNER, repo=REPO, interval=interval)
#     print(interval, len(results))

# Jira key -> GitHub issue URL, using the last "ARROW-<number>" (followed by any character)
# found in the issue's body text; presumably that is the key in the migration note - confirm.
# NOTE(review): `[-1]` raises IndexError for a returned issue whose body contains no such key.
p = re.compile(r"(ARROW-\d+).")
GITHUB_URLS = {p.findall(x["bodyText"])[-1]: x["url"] for x in results}
# GitHub issue URL -> GitHub node id.
GITHUB_IDS = {x["url"]: x["id"] for x in results}

# Update cross issue links on GitHub to link to GitHub issues

In [None]:
%%time

def fix_issue_bodies(issues, payloads):
    """Rewrite Jira links to linked issues and subtasks so they point at GitHub issues.

    Only issues that have issue links or subtasks are processed. For each
    related issue, its Jira permalink in the body is replaced by the GitHub
    URL from GITHUB_URLS; when no GitHub URL is known the link is left as is.

    Args:
        issues: Jira issues, as in ISSUES.
        payloads: iterable of (Jira issue key, import payload) pairs.

    Returns:
        dict mapping Jira issue key to the updated body, for processed issues only.
    """
    issue_bodies = {key: payload["issue"]["body"] for key, payload in payloads}
    new_issue_bodies = {}

    for issue in issues:
        issue_links = issue.fields.issuelinks or []
        subtasks = issue.fields.subtasks or []
        if not (issue_links or subtasks):
            continue

        # Issues on the other end of each link, followed by the subtasks.
        related = [li.outwardIssue if hasattr(li, "outwardIssue") else li.inwardIssue
                   for li in issue_links]
        related.extend(subtasks)

        body = issue_bodies[issue.key]
        for other in related:
            jira_url = other.permalink()
            github_url = GITHUB_URLS.get(other.key, jira_url)
            # Match the permalink only when no further digit follows it. A plain
            # str.replace would also rewrite the prefix of longer keys (the URL of
            # ARROW-1 inside the URL of ARROW-10) and corrupt those links.
            pattern = re.escape(jira_url) + r"(?!\d)"
            # A callable replacement keeps the GitHub URL literal (no backslash processing).
            body = re.sub(pattern, lambda _match, url=github_url: url, body)

        new_issue_bodies[issue.key] = body

    return new_issue_bodies

def update_gh_issue_links(issue_bodies):
    """Send each updated body to its GitHub issue and collect the responses.

    Args:
        issue_bodies: dict mapping Jira issue key to the new issue body.

    Returns:
        dict mapping Jira issue key to the response returned by request_to_github.
    """
    total = len(issue_bodies)
    responses = {}

    with requests.Session() as session:
        for position, key in enumerate(issue_bodies):
            # Turn the issue's web URL into the matching REST API URL.
            api_url = GITHUB_URLS[key].replace("https://github.com/", "https://api.github.com/repos/")
            request_kwargs = {
                "method": "POST",
                "url": api_url,
                "json": {"body": issue_bodies[key]},
                "headers": GITHUB_CREDENTIALS,
            }
            print(position, "/", total, api_url)
            responses[key] = request_to_github(request_kwargs, session)

    return responses

# Compute the corrected bodies for all issues with links or subtasks, then push them to GitHub.
new_issue_bodies = fix_issue_bodies(ISSUES, payloads)
print(f"Updating {len(new_issue_bodies)} issue bodies with corrected links.")
# The per-issue responses are not needed afterwards.
_ = update_gh_issue_links(new_issue_bodies)

# Unlock Jira comments

# Update Jira issues to link to new GitHub Issues

In [None]:
%%time

def update_source_jira(issue, gh_url):
    """Point a Jira issue at its migrated GitHub issue.

    Adds the migration note as a Jira comment and, when the issue's
    ``customfield_12311020`` is empty, stores the GitHub URL in that field.

    Args:
        issue: Jira issue to update.
        gh_url: URL of the GitHub issue; its last path segment is used as the issue number.
    """
    gh_id = gh_url.rsplit("/", 1)[-1]
    CONN.add_comment(issue, JIRA_MIGRATION_NOTE.format(gh_id=gh_id, gh_url=gh_url))

    # An existing value in the field is left untouched.
    if issue.fields.customfield_12311020:
        return
    issue.update(fields={"customfield_12311020" : gh_url})

# Post the GitHub link to every Jira issue.
# GITHUB_URLS[issue.key] raises KeyError for an issue without a known GitHub URL.
for issue in ISSUES:
    # Please verify all issues were successfully imported before running this so links can be posted to Jira
    print(issue.key)
    update_source_jira(issue, GITHUB_URLS[issue.key])

# Lock Jira comments

# Create a self subscription dataset

In [None]:
# (Jira user key, role, Jira issue key) for every watcher, reporter, creator and assignee.
subscription_candidates = [
    *[(watcher.key, "watcher", issue.key) for issue in ISSUES for watcher in WATCHERS[issue.id].watchers],
    *[(issue.fields.reporter.key, "reporter", issue.key) for issue in ISSUES if issue.fields.reporter],
    *[(issue.fields.creator.key, "creator", issue.key) for issue in ISSUES if issue.fields.creator],
    *[(issue.fields.assignee.key, "assignee", issue.key) for issue in ISSUES if issue.fields.assignee]
]
# Keep users with a known GitHub account and emit
# (GitHub user, role, GitHub issue URL, GitHub issue id, Jira issue key).
# The id is looked up through the row's own issue key `z`; the original used `key`, which is
# not bound in this comprehension (a stale global from the import loop, or NameError on a fresh kernel).
issue_subscriptions = [(USER_MAPPING[x], y, GITHUB_URLS[z], GITHUB_IDS[GITHUB_URLS[z]], z)
                       for x, y, z in subscription_candidates if x in USER_MAPPING]

# newline="" is what the csv module expects for files handed to csv.writer.
with open(issue_subscriptions_file, 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerows(issue_subscriptions)