In [None]:
import csv
import os
import re
import time
from datetime import datetime
from dateutil.rrule import rrule, MONTHLY, YEARLY

import requests

import migration_helpers

# `suffix` is appended to JIRA_PROJECT_NAME to form the cache folder name and the
# directory of the subscriptions CSV; switch to "_test" to keep a trial run separate.
# suffix = "_test"
suffix = ""

# Tokens are read from the environment so that real secrets never have to be
# typed into (and accidentally committed with) the notebook. When a variable is
# not set, the previous placeholder value is used unchanged.
JIRA_CREDENTIALS = {
    "token_auth": os.environ.get("MIGRATION_JIRA_TOKEN", ""),
    "server": "https://issues.apache.org/jira/",
    "async_": True
}
# Sent as HTTP request headers on every GitHub call made from this notebook.
GITHUB_CREDENTIALS = {
    "X-Github-Next-Global-ID": "1",
    "Authorization": "token " + os.environ.get("MIGRATION_GITHUB_TOKEN", "ghp_"),
    "User-Agent": "asfimport",
    "Accept": "application/vnd.github.golden-comet-preview+json"
}

JIRA_PROJECT_NAME = "parquet"
MIGRATION_DOC_URL = "https://issues.apache.org/jira/browse/PARQUET-2502"
# Formatted with (owner, repo) to obtain the issue-import endpoint of a repository.
IMPORT_URL = "https://api.github.com/repos/{}/{}/import/issues"

# Text handed to migration_helpers.update_source_jira for each migrated issue.
# {gh_id} and {gh_url} are left as placeholders here (presumably filled in by
# that helper - verify). The documentation link reuses MIGRATION_DOC_URL so the
# two cannot drift apart.
JIRA_MIGRATION_NOTE = (
    "This issue has been migrated to [issue #{gh_id}|{gh_url}] on GitHub. "
    "Please see the "
    "[migration documentation|" + MIGRATION_DOC_URL + "] "
    "for further details."
)
ARROW_GITHUB_OWNER = "apache"

jira_to_github_user_mapping_file = "jira-to-github-user-mapping.csv"
issue_subscriptions_file = os.path.join(JIRA_PROJECT_NAME + suffix, "issue_subscriptions.csv")

# Get Jira issue data and cache
Note: Jira must be in read/write mode at this point; otherwise watchers will not be visible.

In [None]:
%%time
# Fetch everything the migration needs from Jira in one call; the helper is
# pointed at a per-project cache folder (project name + suffix).
jira_data = migration_helpers.get_and_cache_jira_data(
    JIRA_PROJECT_NAME,
    JIRA_CREDENTIALS,
    jira_to_github_user_mapping_file,
    cache_folder=JIRA_PROJECT_NAME + suffix,
)
ISSUES, WATCHERS, REMOTE_LINKS, TRANSLATED_MARKUP, USER_MAPPING = jira_data

# Working representation of the issues that the following cells annotate
# with labels, target repo and milestone.
munged_issues = migration_helpers.munge_issues(ISSUES, REMOTE_LINKS, WATCHERS)

In [None]:
# GitHub logins passed to generate_payload for issues that go to the Arrow repo
# (the other repos pass an empty tuple instead).
# The list is hard-coded; the commented-out call below is the dynamic
# alternative (presumably how this snapshot was produced - TODO confirm).
# NOTE(review): a few logins appear twice ('emkornfield', 'jorgecarleitao',
# 'andygrove', 'assignUser').
# ARROW_USER_CAN_BE_ASSIGNEE = \
# migration_helpers.get_assignable_users(ARROW_GITHUB_OWNER, "arrow", USER_MAPPING.values(), GITHUB_CREDENTIALS)
ARROW_USER_CAN_BE_ASSIGNEE = ('wesm', 'kou', 'pitrou', 'kszucs', 'nealrichardson', 'xhochy', 'lidavidm',
    'jorisvandenbossche', 'andygrove', 'bkietz', 'jorgecarleitao', 'liyafan82', 'westonpace', 'zeroshade',
    'thisisnic', 'fsaintjacques', 'AlenkaF', 'jonkeane', 'nevi-me', 'emkornfield', 'tianchen92', 'alamb',
    'cyb70289', 'pcmoritz', 'shiro615', 'Dandandan', 'romainfrancois', 'cpcloud', 'BryanCutler', 'raulcd',
    'sbinet', 'rok', 'mrkn', 'wjones127', 'trxcllnt', 'julienledem', 'kiszk', 'domoritz', 'paddyhoran',
    'ianmcook', 'amol-', 'pravindra', 'assignUser', 'paleolimbot', 'sunchao', 'houqp', 'TheNeuralBit',
    'robertnishihara', 'vibhatha', 'praveenbingo', 'siddharthteotia', 'emkornfield', 'eerhardt', 'js8544',
    'icexelloss', 'crepererum', 'jduo', 'tustvold', 'kevingurney', 'majetideepak', 'quinnj', 'jorgecarleitao',
    'benibus', 'waynexia', 'anjakefala', 'jacques-n', 'wgtmac', 'amoeba', 'liukun4515', 'andygrove', 'ptgoetz',
    'sgilmore10', 'zanmato1984', 'assignUser', 'mapleFU', 'parthchandra', 'yjshen', 'viirya', 'Ted-Jiang', 'alkis')

# Single-user override (presumably for test runs):
# ARROW_USER_CAN_BE_ASSIGNEE = ('rok',)

# Generate GitHub import payloads

In [None]:
# Jira fix-version name ("cpp-X.Y.Z") -> Arrow release title. Only fix versions
# listed here are considered when picking a milestone for C++ issues.
ARROW_RELEASE_REMAP = {
    "cpp-4.0.0": "4.0.0", "cpp-5.0.0": "5.0.0", "cpp-6.0.0": "6.0.0",
    "cpp-7.0.0": "7.0.0", "cpp-8.0.0": "8.0.0", "cpp-9.0.0": "9.0.0",
    "cpp-10.0.0": "10.0.0", "cpp-11.0.0": "11.0.0", "cpp-12.0.0": "12.0.0",
    "cpp-13.0.0": "13.0.0", "cpp-15.0.0": "15.0.0", "cpp-16.0.0": "16.0.0",
}

# Position in this tuple decides which of several fix versions counts as the
# "earliest" for a C++ issue. The order is not purely numeric (e.g. '4.0.0'
# precedes '3.0.1'), so it must not be replaced by a numeric sort.
ARROW_RELEASE_ORDER = (
    '0.1.0', '0.2.0', '0.3.0', 'JS-0.3.0', 'JS-0.3.1', '0.4.0', 'JS-0.4.0',
    '0.4.1', 'JS-0.4.1', '0.5.0', '0.6.0', '0.7.0', '0.7.1', '0.8.0',
    '0.9.0', '0.10.0', '0.11.0', '0.11.1', '0.12.0', '0.12.1', '0.13.0',
    '0.14.0', '0.14.1', '0.15.0', '0.15.1', '0.16.0', '0.17.0', '0.17.1',
    '1.0.0', '1.0.1', '2.0.0', '3.0.0', '4.0.0', '3.0.1', '4.0.1', '5.0.0',
    '6.0.0', '5.0.1', '6.0.1', '6.0.2', '6.0.3', '7.0.0', '7.0.1', '7.0.2',
    '8.0.0', '8.0.1', '9.0.0', '9.0.1', '10.0.0', '10.0.1', '10.0.2',
    '11.0.0', '12.0.0',
    '12.0.1', '13.0.0', '14.0.0', '14.0.1', '14.0.2',
    '15.0.0', '15.0.1', '15.0.2', '16.0.0', '16.1.0', '17.0.0', '18.0.0',
)

# Milestone titles created in the parquet-java repo; also the set of Jira fix
# versions that are recognised for Java issues.
PARQUET_JAVA_MILESTONES = [
    '1.10.0', '1.10.1', '1.10.2', '1.11.0', '1.11.1', '1.11.2', '1.12.0', '1.12.1',
    '1.12.2', '1.12.3', '1.13.0', '1.13.1', '1.13.2', '1.14.0', '1.14.1', '1.15.0',
    '1.6.0', '1.6.1', '1.7.0', '1.8.0', '1.8.1', '1.8.2', '1.8.3', '1.9.0', '2.0.0'
]

# Jira fix-version name ("format-X.Y.Z") -> milestone title in the
# parquet-format repo (the values are what gets created as milestones).
PARQUET_FORMAT_RELEASE_REMAP = {
    'format-2.10.0': '2.10.0', 'format-2.11.0': '2.11.0', 'format-2.3.1': '2.3.1',
    'format-2.4.0': '2.4.0', 'format-2.5.0': '2.5.0', 'format-2.7.0': '2.7.0',
    'format-2.8.0': '2.8.0', 'format-2.9.0': '2.9.0'
}

# Jira issue type -> GitHub "Type: ..." label. The lookup later in the notebook
# uses plain indexing, so an issue type missing from this dict raises KeyError.
ARROW_ISSUETYPE_MAP = {
    "Bug": "Type: bug",
    "Improvement": "Type: enhancement",
    "Wish": "Type: enhancement",
    "New Feature": "Type: enhancement",
    "Task": "Type: task",
    "Sub-task": "Type: task",
    "Test": "Type: test"
}
# Name -> GitHub label. In this notebook it is looked up by Jira component
# name; names without an entry have no corresponding label.
ARROW_LABEL_MAP = {
    "beginner": "good-first-issue",
    "n00b": "good-first-issue",
    "newbe": "good-first-issue",
    "newbie": "good-first-issue",
    "noob": "good-first-issue",
    "starter": "good-first-issue",
    "build": "Component: Developer Tools",
    "c++": "Component: C++",
    "parquet-cpp": "Component: C++",
    "documentation": "Component: Documentation",
    "docs": "Component: Documentation",
    "features": "Type: enhancement",
    "bug": "Type: bug",
    "parquet-avro": "Component: Avro",
    "parquet-cascading": "Component: Cascading",
    "parquet-cli": "Component: CLI",
    "parquet-format": "Component: Format",
    "parquet-hadoop": "Component: Hadoop",
    "parquet-java": "Component: Java",
    "parquet-mr": "Component: Java",
    "parquet-pig": "Component: Pig",
    "parquet-protobuf": "Component: Protobuf",
    "parquet-site": "Component: Site",
    "parquet-testing": "Component: Testing",
    "parquet-thrift": "Component: Thrift",
}

# TESTING ONLY!
# Create milestones in the target repo
# milestones = [{"title": x} for x in ARROW_RELEASE_ORDER]
# milestone_map = migration_helpers.make_milestones(milestones, "rok", "test-parquet-cpp", GITHUB_CREDENTIALS)
# ARROW_MILESTONE_MAP = migration_helpers.get_milestone_map("rok", "test-parquet-cpp", GITHUB_CREDENTIALS)
# /TESTING ONLY!

In [None]:
# WARNING: RUN ONCE ONLY
# This cell creates milestones and labels in the target repositories.

# Create milestones in the target repo
parquet_java_milestones = [{"title": x} for x in PARQUET_JAVA_MILESTONES]
parquet_format_milestones = [{"title": x} for x in PARQUET_FORMAT_RELEASE_REMAP.values()]
for target_repo, repo_milestones in (
    ("parquet-java", parquet_java_milestones),
    ("parquet-format", parquet_format_milestones),
):
    # Return values are intentionally discarded (without rebinding IPython's `_`).
    migration_helpers.make_milestones(repo_milestones, ARROW_GITHUB_OWNER, target_repo, GITHUB_CREDENTIALS)

# Create labels in the target repo
NON_CPP_LABELS = [{"name": name} for name in [
    'Component: Avro', 'Component: C++', 'Component: CLI', 'Component: Cascading', 'Component: Format', 'Component: Hadoop',
    'Component: Java', 'Component: Parquet', 'Component: Pig', 'Component: Protobuf', 'Component: Site', 'Component: Testing',
    'Component: Thrift', 'Priority: Blocker', 'Priority: Critical', 'Priority: Major', 'Priority: Minor', 'Priority: Trivial',
    'Type: bug', 'Type: enhancement', 'Type: task', 'Type: test'
]]

# Same label set for every non-C++ repository; looping keeps the calls in the
# original order and stops the cell from echoing the last call's return value.
for target_repo in ("parquet-java", "parquet-format", "parquet-site", "parquet-testing"):
    migration_helpers.make_labels(NON_CPP_LABELS, ARROW_GITHUB_OWNER, target_repo, GITHUB_CREDENTIALS)

In [None]:
# Look up the existing milestones of each repo that uses them. The maps are
# indexed by milestone title further down; what the values are is decided by
# migration_helpers.get_milestone_map (presumably the milestone number).
ARROW_MILESTONE_MAP, PARQUET_FORMAT_MILESTONE_MAP, PARQUET_JAVA_MILESTONE_MAP = (
    migration_helpers.get_milestone_map(ARROW_GITHUB_OWNER, repo_name, GITHUB_CREDENTIALS)
    for repo_name in ("arrow", "parquet-format", "parquet-java")
)

In [None]:
# Correct labels: every issue gets its type label, its priority label, one
# label per mapped component and the blanket "Component: Parquet" label.
for munged_issue in munged_issues:
    # Unknown issue types raise KeyError on purpose so they get noticed.
    issue_type = ARROW_ISSUETYPE_MAP[munged_issue["issuetype"]]
    priority = f"Priority: {munged_issue['priority']}"
    # Components without an entry in ARROW_LABEL_MAP are skipped; previously
    # they ended up in the label list as None.
    component_labels = [
        ARROW_LABEL_MAP[c.name] for c in munged_issue["components"] if c.name in ARROW_LABEL_MAP
    ]

    # dict.fromkeys removes duplicates while keeping first-seen order (several
    # components can map to the same label, e.g. parquet-java and parquet-mr).
    munged_issue["labels"] = list(dict.fromkeys(
        [issue_type, priority, *component_labels, "Component: Parquet"]
    ))

# Determine which issue tracker each ticket should be migrated to

In [None]:
java_components = ("parquet-avro", "parquet-cascading", "parquet-cli", "parquet-hadoop", "parquet-mr",
                   "parquet-pig", "parquet-protobuf", "parquet-thrift")

# Target repos in priority order, each with the component names that select it.
# The first entry with a matching component wins; issues matching nothing fall
# back to parquet-java.
repo_priority = (
    ("parquet-java", java_components),
    ("parquet-format", ("parquet-format",)),
    ("parquet-site", ("parquet-site",)),
    ("parquet-testing", ("parquet-testing",)),
    ("parquet-cpp", ("parquet-cpp",)),
)

for munged_issue in munged_issues:
    component_names = [c.name for c in munged_issue["components"]]
    munged_issue["repo"] = next(
        (
            repo_name
            for repo_name, selecting_components in repo_priority
            if any(name in selecting_components for name in component_names)
        ),
        "parquet-java",
    )

# Split the issues into one list per target repo.
(
    java_munged_issues,
    format_munged_issues,
    site_munged_issues,
    testing_munged_issues,
    cpp_munged_issues,
) = (
    [issue for issue in munged_issues if issue["repo"] == repo_name]
    for repo_name in ("parquet-java", "parquet-format", "parquet-site", "parquet-testing", "parquet-cpp")
)

In [None]:
# import pandas as pd
# df = pd.DataFrame(munged_issues)
# df.sample(2)

# Prepare payloads to be sent to GitHub

In [None]:
# Correct milestones: each issue gets the milestone of its earliest recognised
# fix version. C++ issue titles additionally get a "[C++][Parquet] " prefix.

def _version_key(version):
    """Turn a dotted version such as '1.10.0' into (1, 10, 0) for numeric ordering.

    Plain string sorting would put '1.10.0' before '1.9.0'.
    """
    return tuple(int(part) for part in version.split("."))


# Optional leading "[C++]" tag plus any spaces/colons around it. A regex is used
# because str.lstrip("[C++] ") strips a *set of characters*, which also ate the
# first letters of titles such as "CMake ..." or "[C++][Parquet] ...".
_CPP_TITLE_PREFIX = re.compile(r"^ *(?:\[C\+\+\])?[ :]*")


for munged_issue in format_munged_issues:
    if munged_issue["fixVersion"]:
        # Remap first ("format-2.10.0" -> "2.10.0"), then order numerically.
        releases = sorted(
            (PARQUET_FORMAT_RELEASE_REMAP[x.name] for x in munged_issue["fixVersion"]
             if x.name in PARQUET_FORMAT_RELEASE_REMAP),
            key=_version_key)
        if releases:
            munged_issue["milestone"] = PARQUET_FORMAT_MILESTONE_MAP[releases[0]]

for munged_issue in java_munged_issues:
    if munged_issue["fixVersion"]:
        releases = sorted(
            (x.name for x in munged_issue["fixVersion"] if x.name in PARQUET_JAVA_MILESTONES),
            key=_version_key)
        if releases:
            munged_issue["milestone"] = PARQUET_JAVA_MILESTONE_MAP[releases[0]]


for munged_issue in cpp_munged_issues:
    munged_issue["title"] = "[C++][Parquet] " + _CPP_TITLE_PREFIX.sub("", munged_issue["title"], count=1)

    if munged_issue["fixVersion"]:
        releases = [ARROW_RELEASE_REMAP[x.name] for x in munged_issue["fixVersion"] if x.name in ARROW_RELEASE_REMAP]
        if releases:
            # Arrow releases are ranked by their position in ARROW_RELEASE_ORDER,
            # which is not a purely numeric order.
            earliest_fix_version = min(releases, key=ARROW_RELEASE_ORDER.index)
            munged_issue["milestone"] = ARROW_MILESTONE_MAP[earliest_fix_version]

In [None]:
def _build_payloads(issues, assignable_users=()):
    """Run migration_helpers.generate_payload over `issues` and return the results as a list.

    `assignable_users` is forwarded unchanged as the helper's last argument.
    """
    return [
        migration_helpers.generate_payload(
            issue, USER_MAPPING, TRANSLATED_MARKUP, WATCHERS, MIGRATION_DOC_URL, assignable_users)
        for issue in issues
    ]


# Only the C++ issues (bound for the Arrow repo) come with a list of assignable
# users; every other repo gets an empty tuple.
cpp_payloads = _build_payloads(cpp_munged_issues, ARROW_USER_CAN_BE_ASSIGNEE)

java_payloads = _build_payloads(java_munged_issues)

format_payloads = _build_payloads(format_munged_issues)

site_payloads = _build_payloads(site_munged_issues)

testing_payloads = _build_payloads(testing_munged_issues)

# Import issues into GitHub

In [None]:
%%time
def import_into_github(payloads, import_url, github_credentials):
    """POST every payload to a GitHub issue-import endpoint.

    Args:
        payloads: sized sequence of ``(key, payload)`` pairs. A key that has
            already been sent during this call is skipped.
        import_url: endpoint the payloads are POSTed to.
        github_credentials: request headers used for every call. (The function
            previously ignored this argument and read the module-level
            GITHUB_CREDENTIALS instead.)

    Returns:
        dict mapping each key to ``{"import_response": <result of
        migration_helpers.request_to_github>, "status": ""}``.
    """
    import_responses = {}
    total = len(payloads)
    # One session is reused for all requests of this call.
    with requests.Session() as session:
        for i, (key, payload) in enumerate(payloads):
            if key in import_responses:
                continue
            # Progress line every 100 payloads.
            if i % 100 == 0:
                print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] importing ", i, "/", total)

            params = {"method": "POST", "url": import_url, "json": payload, "headers": github_credentials}
            response = migration_helpers.request_to_github(params, session)
            import_responses[key] = {"import_response": response, "status": ""}

    return import_responses

def _import_into_repo(payloads, repo_name):
    """Import `payloads` into the `repo_name` repository owned by ARROW_GITHUB_OWNER."""
    return import_into_github(
        payloads, IMPORT_URL.format(ARROW_GITHUB_OWNER, repo_name), GITHUB_CREDENTIALS)


# One statement per repo so that the responses of completed imports stay bound
# even if a later import raises. C++ issues go to the Arrow repo.
cpp_import_responses = _import_into_repo(cpp_payloads, "arrow")

java_import_responses = _import_into_repo(java_payloads, "parquet-java")

format_import_responses = _import_into_repo(format_payloads, "parquet-format")

site_import_responses = _import_into_repo(site_payloads, "parquet-site")

testing_import_responses = _import_into_repo(testing_payloads, "parquet-testing")

# Get the URLs of the created issues to map them to Jira tickets

In [None]:
%%time

def get_github_issue_urls(github_owner, github_repo, github_credentials, period=YEARLY):
    """Collect the issues of a repository and index them by Jira key.

    The repository is queried through migration_helpers.get_issues one date
    interval at a time; intervals run from 2013-01-01 to 2025-01-01 in steps of
    `period` (a dateutil.rrule frequency such as YEARLY or MONTHLY).

    Args:
        github_owner: repository owner.
        github_repo: repository name.
        github_credentials: forwarded to migration_helpers.get_issues.
        period: rrule frequency that sets the interval length.

    Returns:
        ``(github_urls, github_ids)`` where ``github_urls`` maps a Jira key
        (the last ``PARQUET-<n>`` found in the issue's ``bodyText``) to the
        issue's ``url`` and ``github_ids`` maps that ``url`` to the issue's
        ``id``.
    """
    # Materialise once: the boundaries are needed twice to build the pairs below.
    dates = list(rrule(period, interval=1, dtstart=datetime.strptime("2013", "%Y"),
                       until=datetime.strptime("2025", "%Y")))
    fmt = "%Y-%m-%d"
    # Consecutive boundaries form "start..end" intervals. Adjacent intervals
    # share their boundary day; any resulting duplicates collapse in the dicts
    # built at the end.
    intervals = [f"{x.strftime(fmt)}..{y.strftime(fmt)}" for x, y in zip(dates, dates[1:])]

    results = []
    for interval in intervals:
        results += migration_helpers.get_issues(
            owner=github_owner, repo=github_repo, interval=interval, github_credentials=github_credentials)
        # Running total, printed as progress.
        print(interval, len(results))

    def get_key(body):
        """Return the last Jira key mentioned in `body`, or "PARQUET-1546" if none is found."""
        try:
            return re.findall(r"(PARQUET-\d+).", body)[-1]
        except (IndexError, TypeError):
            # IndexError: no key matched; TypeError: body is not a string.
            # Bad regex in PARQUET-1546
            # NOTE(review): every body without a match is attributed to
            # PARQUET-1546 - confirm that get_issues never returns issues
            # unrelated to this migration.
            return "PARQUET-1546"

    github_urls = {get_key(x["bodyText"]): x["url"] for x in results}
    github_ids = {x["url"]: x["id"] for x in results}
    return github_urls, github_ids

# (repo, extra keyword arguments) for each repository that received issues.
# Only the Arrow repo is scanned in monthly steps; the rest use the default period.
github_info = [
    get_github_issue_urls(ARROW_GITHUB_OWNER, repo_name, GITHUB_CREDENTIALS, **extra_kwargs)
    for repo_name, extra_kwargs in (
        ("arrow", {"period": MONTHLY}),
        ("parquet-java", {}),
        ("parquet-format", {}),
        ("parquet-site", {}),
        ("parquet-testing", {}),
    )
]

# Merge the per-repo results; on a key clash the later repo wins.
GITHUB_URLS = {}
GITHUB_IDS = {}
for repo_urls, repo_ids in github_info:
    GITHUB_URLS.update(repo_urls)
    GITHUB_IDS.update(repo_ids)

# Update cross issue links on GitHub to link to GitHub issues

In [None]:
# Every payload that was imported, in the same repo order as the imports.
all_payloads = cpp_payloads + java_payloads + format_payloads + site_payloads + testing_payloads

# Compute the corrected bodies first, report how many there are, then push them.
new_issue_bodies = migration_helpers.fix_issue_bodies(ISSUES, all_payloads, GITHUB_URLS)
print(f"Updating {len(new_issue_bodies)} issue bodies with corrected links.")
_ = migration_helpers.update_gh_issue_links(new_issue_bodies, GITHUB_URLS, GITHUB_CREDENTIALS)

# Create a self-subscription dataset

In [None]:
# (jira user key, role, jira issue key) for everybody connected to an issue.
jira_subscriptions = [
    *[(watcher.key, "watcher", issue.key) for issue in ISSUES for watcher in WATCHERS[issue.id].watchers],
    *[(issue.fields.reporter.key, "reporter", issue.key) for issue in ISSUES if issue.fields.reporter],
    *[(issue.fields.creator.key, "creator", issue.key) for issue in ISSUES if issue.fields.creator],
    *[(issue.fields.assignee.key, "assignee", issue.key) for issue in ISSUES if issue.fields.assignee]
]

# Keep only users with a known GitHub mapping and attach the GitHub issue URL
# and id: (mapped user, role, github url, github id, jira issue key).
# A Jira key missing from GITHUB_URLS raises KeyError here, which points at an
# issue that was not found on GitHub.
issue_subscriptions = [(USER_MAPPING[usr], usr_type, GITHUB_URLS[key], GITHUB_IDS[GITHUB_URLS[key]], key)
                       for usr, usr_type, key in jira_subscriptions if usr in USER_MAPPING]

# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(issue_subscriptions_file) or ".", exist_ok=True)

# newline="" is what the csv module asks for; without it, extra blank lines
# appear between rows on platforms that translate "\n".
with open(issue_subscriptions_file, 'w', newline='') as f:
    csv.writer(f).writerows(issue_subscriptions)

# Update Jira issues to link to new GitHub Issues

In [None]:
%%time
# WARNING: Please verify all issues were successfully imported before running this so links can be posted to Jira

CONN = migration_helpers.get_jira_client(JIRA_CREDENTIALS)

# Check the whole mapping before touching Jira: a missing GitHub URL used to
# surface as a KeyError in the middle of the loop, after some Jira issues had
# already been updated.
missing_keys = [issue.key for issue in ISSUES if issue.key not in GITHUB_URLS]
if missing_keys:
    raise KeyError(
        f"No GitHub URL found for {len(missing_keys)} Jira issue(s), e.g. {missing_keys[:10]}; "
        "verify the import before updating Jira."
    )

for issue in ISSUES:
    # Print the key as progress output.
    print(issue.key)
    migration_helpers.update_source_jira(issue, GITHUB_URLS[issue.key], JIRA_MIGRATION_NOTE, CONN)