# Imports

In [2]:
import requests  # To get the data
from requests.adapters import HTTPAdapter
# from requests.packages.urllib3.util.retry import Retry
from urllib3.util.retry import Retry

from pymongo import MongoClient  # Database to store the data
import json  # File IO
from time import time  # To time the duration of the requests
from time import sleep
from IPython.display import display, clear_output
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import pandas as pd
import numpy as np
import csv

# Load Program Data

In [2]:
# Get list of Jira sources
with open('/home/mikel/Desktop/project1/JiraDataCrawler/jira_data_sources.json') as f:
    jira_data_sources = json.load(f)
    
# Connect to the database
db = MongoClient()['JiraRepos']

# Now-Invalid Jiras
INVALID_JIRAS = ['Mindville', 'MariaDB']

# Investigate Jira Data Accessibility

In [3]:
def check_jira_url(jira_url):
    print('')
    print(f"💡 Check Jira: {jira_url}")
    print('')
    ## CHECK PROVIDED JIRA URL AVAILABILITY ##
    print(f'Checking Jira url existence with GET: {jira_url}')
    try:
        requests.head(jira_url)
    except ConnectionError:
        print('❌ Provided Jira base url does not exist')
        return
    else:
        print('✅ Provided Jira base url is reachable')
        
    ## CHECK PROVIDED JIRA URL API AVAILABILITY ##
    response = requests.get(jira_url + '/rest/api/2/issuetype')
    print('')
    print(f'Checking Jira api with GET: {response.url}')
    # Check response code
    if response.status_code < 300:
        print('✅ Jira API returned a successful response')
    else:
        print(response.status_code)
        print(response.text)
        print(response.url)
        print('❌ Jira API did not return a successful response')
        return
    
    ## CHECK NUMBER OF ISSUES ##
    response = requests.get(jira_url + '/rest/api/2/search?jql=&maxResults=0')
    print('')
    print(f"Retrieving total issue count with GET: {response.url}")
    # Check response code
    if response.status_code < 300:
        try:
            print(f"Total Number of Issues: {response.json()['total']}")
            print('✅ Jira API returned a successful response')
        except:
            print("JSON ERROR lads!! Carry on!!")
    else:
        print(response.status_code)
        print(response.text)
        print(response.url)
        print('❌ Jira API did not return a successful response')
        return

In [4]:
# Check all Jira URLs in provided jira_data_sources
for jira_name, jira_obj in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue
    
    check_jira_url(jira_obj['jira_url'])


💡 Check Jira: https://issues.apache.org/jira

Checking Jira url existence with GET: https://issues.apache.org/jira
✅ Provided Jira base url is reachable

Checking Jira api with GET: https://issues.apache.org/jira/rest/api/2/issuetype
✅ Jira API returned a successful response

Retrieving total issue count with GET: https://issues.apache.org/jira/rest/api/2/search?jql=&maxResults=0
Total Number of Issues: 1058418
✅ Jira API returned a successful response

💡 Check Jira: https://jira.hyperledger.org

Checking Jira url existence with GET: https://jira.hyperledger.org
✅ Provided Jira base url is reachable

Checking Jira api with GET: https://jira.hyperledger.org/rest/api/2/issuetype
✅ Jira API returned a successful response

Retrieving total issue count with GET: https://jira.hyperledger.org/rest/api/2/search?jql=&maxResults=0
Total Number of Issues: 25886
✅ Jira API returned a successful response

💡 Check Jira: https://jira.hpdd.intel.com

Checking Jira url existence with GET: https://jira

# Download Jira Data

### Helper Functions

In [5]:
def format_duration(start_time, end_time):
    # Get the total seconds of the duration
    seconds = end_time - start_time
    # Calculate the other time 
    milliseconds = int((seconds % 1) * 10000)
    minutes = int(seconds / 60)
    hours   = int(minutes / 60)
    # Trim the values to fit in their appopriate slots
    display_minutes = int(minutes % 60)
    display_seconds = int(seconds % 60)

    return f"{hours:02}:{display_minutes:02}:{display_seconds:02}.{milliseconds:04}"

### Download Jira Issue Type Information

In [24]:
pi = requests.get("https://issues.apache.org/jira/rest/api/2/issuetype").json()

In [6]:
jira_data_sources

{'Apache': {'company_url': 'https://apache.org/',
  'jira_url': 'https://issues.apache.org/jira',
  'name': 'Apache',
  'rough_issue_count': '1,015,000'},
 'Hyperledger': {'company_url': 'https://hyperledger.org/about',
  'jira_url': 'https://jira.hyperledger.org',
  'name': 'Hyperledger',
  'rough_issue_count': '28,000'},
 'IntelDAOS': {'company_url': 'https://wiki.hpdd.intel.com/display/DC/DAOS+Community+Home',
  'jira_url': 'https://jira.hpdd.intel.com',
  'name': 'IntelDAOS',
  'rough_issue_count': '9,000'},
 'JFrog': {'company_url': 'https://jfrog.com/about/',
  'jira_url': 'https://jfrog.com/jira',
  'name': 'JFrog',
  'rough_issue_count': '16,000'},
 'Jira': {'company_url': 'https://www.atlassian.com/company',
  'jira_url': 'https://jira.atlassian.com',
  'name': 'Jira',
  'rough_issue_count': '275,000'},
 'JiraEcosystem': {'company_url': 'https://ecosystem.atlassian.net',
  'jira_url': 'https://ecosystem.atlassian.net',
  'name': 'JiraEcosystem',
  'rough_issue_count': '42,000'

In [14]:
# Write the result to a JSON
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue

    # Build the URL to get the information from
    jira_issuetype_url = jira_data['jira_url'] + '/rest/api/2/issuetype'

    try:
        # Get the issuetype definitions
        documented_issuetypes = {

            issuetype['name']: issuetype
            for issuetype in requests.get(jira_issuetype_url).json()
        }
    except:
        pass

        # Save the information
    output_json[jira_name] = documented_issuetypes

# Write JSON to file
with open('jira_issuetype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

### Download Jira Issue Link Type Information

In [15]:
# Write the result to a JSON
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue

    # Build the URL to get the information from
    jira_issuelinktype_url = jira_data['jira_url'] + '/rest/api/2/issueLinkType'

    try:
        # Get the issuelinktype definitions
        documented_issuelinktypes = {
            issuelinktype['name']: issuelinktype
            for issuelinktype in requests.get(jira_issuelinktype_url).json()['issueLinkTypes']
        }
    except:
        pass

    # Save the information
    output_json[jira_name] = documented_issuelinktypes

# Write JSON to file
with open('jira_issuelinktype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

### Download Jira Issue Field Information

In [17]:
import requests
from requests.auth import HTTPBasicAuth
import json

jiras_fields_information = {}

for jira_name, jira_data in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue
    
    # Query Jira for field information
    response = requests.get(f"{jira_data['jira_url']}/rest/api/2/field")

    try:
        # Store result in JSON
        jiras_fields_information[jira_name] = response.json()
    except:
        pass

# Write JSON to file for later use
with open('jira_field_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(jiras_fields_information, json_file, ensure_ascii=False, indent=4)

### Download Jira Data Commands

In [18]:
def download_and_write_data_mongo(
    jira_data_source,
    num_desired_results = None,  # Leave as "None" to download all, otherwise specify a number
    iteration_max = 250,  # Recommended to keep at or below 500
    start_index = 0,  # This allows you to start back up from a different place
    ):
    
    def build_url(base_url, start_index, iteration_max=100):
        return (
            base_url +
            f"/rest/api/2/search?"
            f"jql="
            f"&ORDER%20BY%20created%20ASC"
            f"&startAt={start_index}"
            f"&maxResults={iteration_max}"
            f"&expand=changelog"
            )
    
    collection = db[jira_data_source['name']]

    # iteration_max is the number of issues the script will attempt to get at one time.
    # The Jira default max is 1000. Trying with 1000 consistently returned errors after a short while
    # as the object being returned was likely too large. Values of 500 or less serve no particular issue
    # to the script except that more calls (of smaller size) have to be made.
    
    # How many issues to collect before writing to MongoDB
    num_issues_per_write = 10000

    last_write_start_index = start_index
    issues = []

    # Available and requested number of results
    num_available_results = requests.get(build_url(jira_data_source['jira_url'], 0,0)).json()['total']
    print(f'Number of Desired Results   : {num_desired_results if num_desired_results else "All"}')
    print(f'Number of Available Results : {num_available_results}')
    print('')

    # Set the number of results to retrieve based on information from Jira server
    if not num_desired_results:
        num_remaining_results = num_available_results
    else:
        num_remaining_results = min(int(num_desired_results), num_available_results)
    # Adjust remaining results based on their start index
    num_remaining_results -= start_index

    # Collect results while there are more results to gather
    issues_downloaded = 0
    max_count_width = len(str(num_remaining_results)) + 1
    print(f"Total Remaining:{num_remaining_results:< {max_count_width}}")
    while(num_remaining_results > 0):

        # Start a timer for this particular chunk
        start_time = time()

        # Number of items to retrieve
        num_items_to_retrieve = min(iteration_max, num_remaining_results)
        
        # Get issues from Jira
        url = build_url(jira_data_source['jira_url'], start_index, num_items_to_retrieve)
        response = requests.get(url)
        try:
            response_json = response.json()
        except:
            pass

        if 'issues' in response_json:
            # Add issues to program list
            issues.extend(response_json['issues'])
            num_returned_issues = len(response_json['issues'])

        # Adjust the remaining results to get
        num_remaining_results -= num_returned_issues

        # Print progress for user
        end_index = start_index + num_returned_issues - 1
        print(
            f"Total Remaining:{num_remaining_results:< {max_count_width}}  "
            f"Retrieved Items: {start_index:< {max_count_width}} - {end_index:< {max_count_width}}  "
            f"Duration: {format_duration(start_time, time())}")

        # Move the start index
        start_index += num_returned_issues

        # Write the issues to file IF there are enough of them. This is a nice way to save state and start over at a
        # certain place if there are too many to download in one go.
        if len(issues) >= num_issues_per_write or num_remaining_results == 0 or num_returned_issues == 0:
            # Write the data to mongodb
            collection.insert_many(issues)

            print('... Issues written to database ...')
            last_write_start_index = start_index

            issues_downloaded += len(issues)
            issues = []  # Clear the issues so that our memory doesn't get too full

        # If we have for some reason run out of results, we may want to react to this in some way
        if num_returned_issues == 0:
            print('Number of Returned Issues is 0. This is strange and should not happen. Investigate.')
            return

    print('')
    print(f"Number of Downloaded Issues: {issues_downloaded}")

In [None]:
# Last download time: 6h 47m
download_and_write_data_mongo(jira_data_sources['Apache'])

In [None]:
# Last download time: 0h 27m
download_and_write_data_mongo(jira_data_sources['Hyperledger'])

In [None]:
# Last download time: 0h 5m
download_and_write_data_mongo(jira_data_sources['IntelDAOS'])

In [None]:
# Last download time: 0h 12m
download_and_write_data_mongo(jira_data_sources['JFrog'])

In [None]:
# Last download time: 6h 26m
download_and_write_data_mongo(jira_data_sources['Jira'])

In [None]:
# Last download time: 0h 30m
download_and_write_data_mongo(jira_data_sources['JiraEcosystem'])

In [None]:
# download_and_write_data_mongo(jira_data_sources['MariaDB'])

In [None]:
# download_and_write_data_mongo(jira_data_sources['Mindville'])

In [None]:
# Last download time: 1h 26m
download_and_write_data_mongo(jira_data_sources['Mojang'])

In [None]:
# Last download time: 3h 23m
download_and_write_data_mongo(jira_data_sources['MongoDB'])

In [None]:
# Last download time: 0h 50m
download_and_write_data_mongo(jira_data_sources['Qt'])

In [None]:
# Last download time: 3h 58m
download_and_write_data_mongo(jira_data_sources['RedHat'])

In [None]:
# Last download time: 0h 24m
download_and_write_data_mongo(jira_data_sources['Sakai'])

In [None]:
# Last download time: 1h 25m
download_and_write_data_mongo(jira_data_sources['SecondLife'])

In [None]:
# Last download time: 1h 25m
download_and_write_data_mongo(jira_data_sources['Sonatype'])

In [None]:
# Last download time: 0h 20m
download_and_write_data_mongo(jira_data_sources['Spring'])

### Download Jira Issue Comments

In [None]:
def download_jira_issue_comments(jira_data_source, *, max_comments_per_query=50, resume_at_date='0', query_wait_time_minutes=None):
    
    def build_url(base_url, issue_key, comments_start_index=0, max_comments_per_query=max_comments_per_query):
        return (
            base_url +
            f"/rest/api/2/issue/{issue_key}/comment"
            f"?orderBy=created"
            f"&startAt={comments_start_index}"
            f"&maxResults={max_comments_per_query}"
            )
    
    # Initialise requests object with configurations to make it more stable
    session = requests.Session()
    retry = Retry(total=4, connect=4, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    
    acceptable_failure_messages = [
        'Login Required',
        'Issue Does Not Exist',
        'Issue does not exist or you do not have permission to see it.',
    ]
    
    progress_bar_num_chunks = 100
    progress_bar_last = -1

    collection = db[jira_data_source['name']]

    # Ignore Jiras that we know are now unreachable or empty
    if jira_data_source['name'] in INVALID_JIRAS:
        print(f"Cannot download comments for {jira_data_source['name']} due to innaccessible Jira repo.")
        return
    
    print('Querying MongoDB for list of remaining issue keys to get comments for ...')

    # The data is downloaded per issue key, so we must get a complete list of all issue keys
    # This could be kept as a gnerator (by removing the explicit "list()" operator), but then we have to query the dataset twice.
    # This is a space-time tradoff, and I have chosen time.
    jira_issue_keys = list(collection.aggregate([
        # Only get issues without comments already
        { '$match': { 'fields.comments': { '$eq': None } } },
        # We only need the issue id, key, created date
        { '$project': { 'key': 1, 'created': '$fields.created' } },
        # Only get issues at or equal to our "resume" date
        { '$match': { 'created': { '$gte': resume_at_date } } },
        # Sort the results by created date so we can resume if failure occurs
        { '$sort': { 'created': 1 } },
    ], allowDiskUse=True))
    
    # Get the total count of issues so we can get an understanding of progress
    # jira_query_issue_count = collection.count_documents({ 'fields.created': { '$gte': resume_at_date } })
    jira_query_issue_count = len(jira_issue_keys)
    num_issues_complete = 0
    
    print(f"There are {jira_query_issue_count} remaining issues.")

    # Loop through the issue keys, downloading the comments one at a time
    for issue in jira_issue_keys:
        
        # print(f"Working on issue {issue['key']} with creation date {issue['created']} ({num_issues_complete:,} / {jira_query_issue_count:,})")
        
        issue_comments = []
        comments_index = 0
        
        # Loop through comment downloads until all are downloaded. We don't know how many comments exist until we ask for the first N, so we need a while loop
        while True:
            
            # Get the issue comments
            url = build_url(jira_data_source['jira_url'], issue['key'], comments_start_index=comments_index)
            response = session.get(url, verify=False)
            try:
                response_json = response.json()
            except:
                pass
            
            # Check if the response is valid. If not, we skip to the next issue. Some issues are private, etc., so we skip them
            if 'errorMessages' in response_json:
                # Here are the error messages we may run into that we simply skip. We don't want to skip all error messages,
                # so we only check a few here and otherwise break the script to investigate.
                if [message for message in acceptable_failure_messages if message in response_json['errorMessages']]:
                    break  # Break the while loop collecting comments for this issue, and move on to the next issue
                # Otherwise ...
                print(f"\nWas working on {issue['key']} with creation date {issue['created']} ({num_issues_complete:,} / {jira_query_issue_count:,})")
                print('\nLast response json:')
                display(response_json)
                raise Exception('Not sure why the network request has failed.')
            
            # Save this round of issue comments
            issue_comments.extend(response_json['comments'])
            comments_index = len(issue_comments)
            
            # Wait a little as to not overload the number of requests being sent
            if query_wait_time_minutes:
                sleep(query_wait_time_minutes)
            
            # Check if we have collected all of the possible comments
            if comments_index == response_json['total']:
                break

        # Write all of the comments to the MongoDB Jira issue at once
        collection.update_one(
            {'_id': issue['_id']},
            { '$set': { 'fields.comments': issue_comments } }
        )
        
        num_issues_complete += 1
        
        # Output progress
        clear_output(wait=True)
        print(f"Jira: {jira_data_source['name']}")
        print(f"Number of issues to download comments from: {jira_query_issue_count:,}")
        print(f"resume_at_date: {resume_at_date}")
        print('')
        print(f"Last confirmed issue {issue['key']} with creation date {issue['created']} ({num_issues_complete:,} / {jira_query_issue_count:,})")
        print(f"Progress: [{'#'*round((num_issues_complete/jira_query_issue_count)*progress_bar_num_chunks):.<{progress_bar_num_chunks}}]")

In [None]:
# Last download time: Multiple weeks due to extreme rate-limiting.
download_jira_issue_comments(
    jira_data_sources['Apache'],
    query_wait_time_minutes=0.1
)

# REQUEST TEST FOR JIRA DATA DOWNLOAD FUNCTION
I would need to check which are the variables that we could combine with Github data but also check if they work for our scopes.

In [12]:
pi = requests.get("https://issues.apache.org/jira/rest/api/2/search?jql=&ORDER%20BY%20created%20ASC&startAt=0&maxResults=250&expand=changelog")

In [87]:
pi.json()

{'expand': 'schema,names',
 'startAt': 0,
 'maxResults': 250,
 'total': 1058423,
 'issues': [{'expand': 'operations,versionedRepresentations,editmeta,changelog,renderedFields',
   'id': '13524824',
   'self': 'https://issues.apache.org/jira/rest/api/2/issue/13524824',
   'key': 'ZOOKEEPER-4675',
   'fields': {'fixVersions': [{'self': 'https://issues.apache.org/jira/rest/api/2/version/12351304',
      'id': '12351304',
      'name': '3.9.0',
      'archived': False,
      'released': False},
     {'self': 'https://issues.apache.org/jira/rest/api/2/version/12352866',
      'id': '12352866',
      'description': '',
      'name': '3.8.2',
      'archived': False,
      'released': False}],
    'resolution': None,
    'customfield_12312322': None,
    'customfield_12312323': None,
    'customfield_12310420': '9223372036854775807',
    'customfield_12312320': None,
    'customfield_12312321': None,
    'customfield_12312328': None,
    'customfield_12312329': None,
    'customfield_12312326

In [58]:
pi.json().keys()

dict_keys(['expand', 'startAt', 'maxResults', 'total', 'issues'])

In [74]:
pi.json()['issues'][150]['key']

'ZOOKEEPER-4503'

In [68]:
pi.json()['issues'][0].keys()

dict_keys(['expand', 'id', 'self', 'key', 'fields', 'changelog'])

In [27]:
pi.json()['issues'][0]['fields']

{'fixVersions': [{'self': 'https://issues.apache.org/jira/rest/api/2/version/12351304',
   'id': '12351304',
   'name': '3.9.0',
   'archived': False,
   'released': False},
  {'self': 'https://issues.apache.org/jira/rest/api/2/version/12352866',
   'id': '12352866',
   'description': '',
   'name': '3.8.2',
   'archived': False,
   'released': False}],
 'resolution': None,
 'customfield_12312322': None,
 'customfield_12312323': None,
 'customfield_12310420': '9223372036854775807',
 'customfield_12312320': None,
 'customfield_12312321': None,
 'customfield_12312328': None,
 'customfield_12312329': None,
 'customfield_12312326': None,
 'customfield_12310300': None,
 'customfield_12312327': None,
 'customfield_12312324': None,
 'customfield_12312720': None,
 'customfield_12312325': None,
 'lastViewed': None,
 'priority': {'self': 'https://issues.apache.org/jira/rest/api/2/priority/2',
  'iconUrl': 'https://issues.apache.org/jira/images/icons/priorities/critical.svg',
  'name': 'Critical'

# REQUEST TEST FOR Jira Issue Comments
Note that the script comment already mentions that the download takes weeks.

In [30]:
gamma = requests.get("https://issues.apache.org/jira/rest/api/2/issue/ZOOKEEPER-4443/comment?orderBy=created&startAt=0&maxResults=0")

In [32]:
gamma.json()

{'startAt': 0,
 'maxResults': 1,
 'total': 2,
 'comments': [{'self': 'https://issues.apache.org/jira/rest/api/2/issue/13423245/comment/17480103',
   'id': '17480103',
   'author': {'self': 'https://issues.apache.org/jira/rest/api/2/user?username=noneblah',
    'name': 'noneblah',
    'key': 'JIRAUSER284000',
    'avatarUrls': {'48x48': 'https://issues.apache.org/jira/secure/useravatar?avatarId=34058',
     '24x24': 'https://issues.apache.org/jira/secure/useravatar?size=small&avatarId=34058',
     '16x16': 'https://issues.apache.org/jira/secure/useravatar?size=xsmall&avatarId=34058',
     '32x32': 'https://issues.apache.org/jira/secure/useravatar?size=medium&avatarId=34058'},
    'displayName': 'Margarita Stoilova',
    'active': True,
    'timeZone': 'Etc/UTC'},
   'body': 'Thanks to [~mrMigles]\xa0for bringing up the question.\xa0\r\n\r\nIs there anything we can assist with so the 3.7.1 version can be released?\xa0',
   'updateAuthor': {'self': 'https://issues.apache.org/jira/rest/api

In [13]:
pi = requests.get("https://issues.apache.org/jira/rest/api/2/search?jql=project=flex&startAt=0&maxResults=35393")

In [5]:
pi.headers

{'Date': 'Sat, 18 Feb 2023 08:35:36 GMT', 'Server': 'Apache', 'X-AREQUESTID': '515x49035014x5', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'X-XSS-Protection': '1; mode=block', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'SAMEORIGIN', 'Content-Security-Policy': 'sandbox', 'Strict-Transport-Security': 'max-age=31536000', 'X-AUSERNAME': 'anonymous', 'Cache-Control': 'no-cache, no-store, no-transform', 'Content-Type': 'application/json;charset=UTF-8', 'Set-Cookie': 'atlassian.xsrf.token=A5KQ-2QAV-T4JA-FDED_59ea7b43c862abad40b4dda70627c5b5c74fd141_lout; Path=/jira; Secure; SameSite=None, JSESSIONID=3C7D2588A5202744356A9820801202CF; Path=/jira; Secure; HttpOnly', 'Via': '1.1 jira2-he-de.apache.org', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Keep-Alive': 'timeout=15, max=100', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked'}

In [35]:
pi.json().keys()

dict_keys(['startAt', 'maxResults', 'total', 'issues'])

In [14]:
len(pi.json()['issues'])

1000

In [28]:
'''
for i in range(0, len(pi.json()['issues'])):

    if pi.json()['issues'][i]['key'] == 'ABDERA-290':
        print(i)
        break
'''

pi.json()['issues'][310]['fields'] # Shows what each issue displays
# 41

{'fixVersions': [],
 'resolution': {'self': 'https://issues.apache.org/jira/rest/api/2/resolution/1',
  'id': '1',
  'description': 'A fix for this issue is checked into the tree and tested.',
  'name': 'Fixed'},
 'customfield_12312322': None,
 'customfield_12312323': None,
 'customfield_12312320': None,
 'customfield_12310420': '48426',
 'customfield_12312321': None,
 'customfield_12312328': None,
 'customfield_12312329': None,
 'customfield_12312326': None,
 'customfield_12310300': None,
 'customfield_12312327': None,
 'customfield_12312324': None,
 'customfield_12312720': None,
 'customfield_12312325': None,
 'lastViewed': None,
 'priority': {'self': 'https://issues.apache.org/jira/rest/api/2/priority/3',
  'iconUrl': 'https://issues.apache.org/jira/images/icons/priorities/major.svg',
  'name': 'Major',
  'id': '3'},
 'labels': [],
 'customfield_12312333': None,
 'customfield_12312334': None,
 'customfield_12313422': 'false',
 'customfield_12310310': '0.0',
 'customfield_12312331': 

In [10]:
import pandas as pd
import time

date = pd.to_datetime(pi.json()['issues'][0]['fields']['created'], utc=True)
print(date)
print(pi.json()['issues'][41]['fields']['resolutiondate'])
date_2 = pd.Timestamp('1970-01-01').tz_localize('UTC')
print(date_2)

if date > date_2:
    print("2011 came after 1970")
else:
    print("Doesn't work")

print(pd.Timedelta(date - date_2, '1s').total_seconds())
'''
issueitem['created_at'] = (pd.to_datetime(item['created_at'], utc=True)- pd.Timestamp("1970-01-01").tz_localize('UTC'))// pd.Timedelta('1s')
'''

2017-01-30 22:25:09+00:00
2011-10-16T07:00:27.000+0000
1970-01-01 00:00:00+00:00
2011 came after 1970
1485815109.0


'\nissueitem[\'created_at\'] = (pd.to_datetime(item[\'created_at\'], utc=True)- pd.Timestamp("1970-01-01").tz_localize(\'UTC\'))// pd.Timedelta(\'1s\')\n'

In [36]:
# Second pagination has written the same information that the first request.
try:
    epsilon = requests.get("https://issues.apache.org/jira/rest/api/2/search?jql=project=exec")
    print("200")
except:
    print(epsilon)

200


In [40]:
epsilon.status_code

400

# JIRA REPOSITORY CRAWLER
## Explanation of what it does, and how it's done.
Param: Name of the project to crawl
Process: REST API of the Jira repo request with the selected attributes from projects' issues.
Returns: Writes a csv file with the downloaded data.

In [3]:
# API call for the list of projects from Jira Apache projects.
delta = requests.get("https://issues.apache.org/jira/rest/api/2/project")
jiraProjects = delta.json()

In [4]:
# CLEANING: Ignores the projects in the attic + stores the names of the rest.

activeProjects = [] # Projects that are not retired and have a clear category.

for project in jiraProjects: # Where all the projects handled with Jira are stored

    try:
        category = project['projectCategory']['name']
    except KeyError: # If the project doesn't have a category section, bad signal!
        continue

    if category == "Retired": # If the project is in the attic.
        continue
    else:
        activeProjects.append(project)

In [5]:
# MAIN FUNCTION

def jiraCrawler(projectLinkName, responseError):

    # Fixed parameters for the API call search query
    startAt = 0
    maxResult = 100

    # list gathering all issue dictionaries.
    projectissuedataitems = []

    # List of attributes to be downloaded.
    issue_feature_list = ['project_ID', 'project_name', 'issue_ID', 'issue_type_ID', 'issue_type_name', 'key', 'user_login', 'state', 'state_key', 'issue_link', 'created_at', 'updated_at', 'resolution_at', 'active_time', 'title', 'body', 'description', 'watchcount']

    # Creation of the CSV file with the header.
    with open(f"/home/mikel/Desktop/project1/JiraDataCrawler/1.DataDownload/jiraProjectIssues/{projectLinkName}.csv", 'w', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([x for x in issue_feature_list])

    # It will loop until there's no more issues to crawl.
    while 1 == 1:

        # Link for the API call.
        jiraLink = "https://issues.apache.org/jira/rest/api/2/" + f"search?jql=project={projectLinkName}&startAt={startAt}&maxResults={maxResult}"

        # Request call.
        theResult = requests.get(jiraLink)

        # Control check to omit problematic requests:
        if theResult.status_code != 200:
            responseError.append(projectLinkName)
            break
        # List of issues from the project.
        theItemListPerPage = theResult.json()['issues']

        if len(theItemListPerPage) == 0:
            break
        else: # If there're issues in the fetched json, the process starts.
            print(f"API call status: {theResult.headers['Keep-Alive']}")
            print(f"Length of issues: {len(theResult.json()['issues'])}")
            print(projectLinkName+' '+f"Starting at: {startAt}, Max results: {maxResult}")
            for item in theItemListPerPage:
                issueitem = {}
                # If the issue was closed before sonarQube existed (created on 2006), then it directly means
                # they didn't use it at all. Hence we first check that.
                resolutionDate = pd.Timestamp(item['fields']['resolutiondate'])
                if resolutionDate is not None and resolutionDate < pd.Timestamp("2006-01-01").tz_localize('UTC'):
                    continue

                issueitem['project_ID'] = item['fields']['project']['id']
                issueitem['project_name'] = item['fields']['project']['name']
                issueitem['issue_ID'] = item['id']
                issueitem['issue_type_ID'] = item['fields']['issuetype']['id']
                issueitem['issue_type_name'] = item['fields']['issuetype']['name']
                issueitem['key'] = item['key'] # Somehow like 'number' in Github
                try:
                    issueitem['user_login'] = item['fields']['creator']['name']
                except TypeError:
                    issueitem['user_login'] = 'unassigned'
                issueitem['state'] = item['fields']['status']['name']
                issueitem['state_key'] = item['fields']['status']['statusCategory']['key']
                issueitem['issue_link'] = item['self']
                issueitem['created_at'] = pd.Timestamp(item['fields']['created'])

                if item['fields']['updated'] == None:
                    issueitem['updated_at'] = np.NaN
                else:
                    issueitem['updated_at'] = pd.Timestamp(item['fields']['updated'])

                # There's a resolution date but it can be tricky to understand.
                # An issue can have a resolution date when being resolved and/or closed.
                if resolutionDate == None:
                    issueitem['resolution_at'] = np.NaN
                    issueitem['active_time(segs)'] = pd.Timedelta(issueitem['updated_at'] - issueitem['created_at']).total_seconds()
                else:
                    issueitem['resolution_at'] = resolutionDate
                    issueitem['active_time'] = pd.Timedelta(resolutionDate - issueitem['created_at']).total_seconds()

                issueitem['title'] = item['fields']['issuetype']['description']
                issueitem['body'] = item['fields']['status']['description']
                issueitem['description'] = item['fields']['description']
                issueitem['watchcount'] = item['fields']['watches']['watchCount']

                # All issues are stored in this list.
                projectissuedataitems.append(issueitem)

                with open(f"/home/mikel/Desktop/project1/JiraDataCrawler/1.DataDownload/jiraProjectIssues/{projectLinkName}.csv", 'a', encoding='utf-8') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow([issueitem[x] for x in issue_feature_list])

            if startAt == 0:
                startAt += (maxResult)
            else:
                startAt += maxResult

    print(f"{projectLinkName} download completed!")

In [47]:
# Error of bad request cases fixed.
# Once the download is done, remove the index of project 'exec' value from the list "active projects"
errorRequest = []
for project in activeProjects[122:]:
    jiraCrawler(project['key'].lower(), errorRequest)
    sleep(15) # Sleep 15 from project to project for timeouts

exec download completed!
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 100, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 200, Max results: 100
API call status: timeout=15, max=100
Length of issues: 44
fileupload Starting at: 300, Max results: 100
fileupload download completed!
API call status: timeout=15, max=100
Length of issues: 31
functor Starting at: 0, Max results: 100
functor download completed!
API call status: timeout=15, max=100
Length of issues: 100
geometry Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 35
geometry Starting at: 100, Max results: 100
geometry download completed!
API call status: timeout=15, max=100
Length of issues: 100
imaging Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
imaging S

In [8]:
test = activeProjects[0]['name'].lower()
errorRequest = []
jiraCrawler('flex', errorRequest)

API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 100, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 200, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 300, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 400, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 500, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 600, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 700, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 800, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
flex Starting at: 900, Max results: 100
API call sta

In [29]:
for project in activeProjects:
    print(project['key'].lower())


abdera
accumulo
ace
amq
amqnet
artemis
amqcpp
amqcli
openwire
airavata
ambari
amber
anakia
any23
apexcore
apexmalhar
asterixdb
avro
awf
chainsaw
commonssite
commonsrdf
testing
cb
datasketches
directmemory
drill
echarts
fineract
flex
freemarker
gobblin
gora
hawq
helix
horn
inlong
jena
knox
clownfish
madlib
marvin
masfres
mxnet
netbeansinfra
nifi
minifi
minificpp
onami
openaz
hdds
petri
pinot
qpidit
rat
rocketmq
rol
s4
sdap
sedona
scb
storm
taverna
tentacles
tez
mtomcat
unomi
whisker
mrm
aries
asyncweb
atlas
attic
aurora
axiom
axis
axiscpp
axis2
transports
axis2c
bahir
batchee
batik
beam
bigtop
bookkeeper
tm
brooklyn
bval
stdcxx
calcite
camel
carbondata
cassandra
causeway
cay
celix
clke
cloudstack
cocoon
cocoon3
attributes
bcel
beanutils
betwixt
bsf
chain
cli
codec
collections
compress
configuration
crypto
csv
daemon
dbcp
dbutils
digester
discovery
dormant
el
email
exec
fileupload
functor
geometry
imaging
io
jci
jcs
jelly
jexl
jxpath
lang
launcher
logging
math
modeler
net
numbers
ognl
po

In [45]:
for project in activeProjects[122:]:
    print(project['key'].lower())

exec
fileupload
functor
geometry
imaging
io
jci
jcs
jelly
jexl
jxpath
lang
launcher
logging
math
modeler
net
numbers
ognl
pool
primitives
proxy
resources
rng
sandbox
sanselan
scxml
statistics
text
transaction
validator
vfs
weaver
comdev
continuum
couchdb
ctakes
cxf
cxfxjc
fediz
daffodil
datafu
daytrader
ddlutils
dtacloud
deltaspike
derby
dmap
dir
dirserver
dirapi
dirgroovy
dirkrb
dirnaming
dirshared
dirstudio
dl
dbf
dvsl
easyant
edgent
empiredb
esme
escimo
etch
falcon
felix
fincn
flagon
flink
flume
fop
fc
ftpserver
gbuild
geode
geronimo
geronimodevtools
giraph
griffin
groovy
gshell
guacamole
gump
hadoop
hdt
hdfs
mapreduce
yarn
hbase
hcatalog
hive
httpasync
httpclient
httpcore
ignite
impala
incubator
infratest3
infracloud1
infra
iota
ivy
ivyde
jcr
jcrvlt
jcrbench
jcrcl
jcrservlet
jcrtck
jcrrmi
oak
ocm
jcrsite
imap
jdkim
jsieve
jspf
mailbox
mailet
mime4j
mpt
protocols
james
jclouds
jdo
johnzon
jspwiki
juddi
juneau
kafka
kand
karaf
kitty
kudu
kylin
legal
libcloud
logcxx
log4j2
log4net
log

In [43]:

for i in range(0, len(activeProjects)):
    if activeProjects[i]['key'].lower() == 'exec':
        print(i)

122
