# Imports

In [1]:
import requests  # To get the data
import json  # File IO
from time import sleep
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import pandas as pd
import numpy as np
import csv

# Load Program Data

In [2]:
# Get list of Jira sources
with open('/home/mikel/Desktop/project1/JiraDataCrawler/jira_data_sources.json') as f:
    jira_data_sources = json.load(f)

INVALID_JIRAS = ['Mindville', 'MariaDB']

# Investigate Jira Data Accessibility

In [3]:
def check_jira_url(jira_url):
    print('')
    print(f"💡 Check Jira: {jira_url}")
    print('')
    ## CHECK PROVIDED JIRA URL AVAILABILITY ##
    print(f'Checking Jira url existence with GET: {jira_url}')
    try:
        requests.head(jira_url)
    except ConnectionError:
        print('❌ Provided Jira base url does not exist')
        return
    else:
        print('✅ Provided Jira base url is reachable')
        
    ## CHECK PROVIDED JIRA URL API AVAILABILITY ##
    response = requests.get(jira_url + '/rest/api/2/issuetype')
    print('')
    print(f'Checking Jira api with GET: {response.url}')
    # Check response code
    if response.status_code < 300:
        print('✅ Jira API returned a successful response')
    else:
        print(response.status_code)
        print(response.text)
        print(response.url)
        print('❌ Jira API did not return a successful response')
        return
    
    ## CHECK NUMBER OF ISSUES ##
    response = requests.get(jira_url + '/rest/api/2/search?jql=&maxResults=0')
    print('')
    print(f"Retrieving total issue count with GET: {response.url}")
    # Check response code
    if response.status_code < 300:
        try:
            print(f"Total Number of Issues: {response.json()['total']}")
            print('✅ Jira API returned a successful response')
        except:
            print("JSON ERROR lads!! Carry on!!")
    else:
        print(response.status_code)
        print(response.text)
        print(response.url)
        print('❌ Jira API did not return a successful response')
        return

In [4]:
# Check all Jira URLs in provided jira_data_sources
for jira_name, jira_obj in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue
    
    check_jira_url(jira_obj['jira_url'])


💡 Check Jira: https://issues.apache.org/jira

Checking Jira url existence with GET: https://issues.apache.org/jira
✅ Provided Jira base url is reachable

Checking Jira api with GET: https://issues.apache.org/jira/rest/api/2/issuetype
✅ Jira API returned a successful response

Retrieving total issue count with GET: https://issues.apache.org/jira/rest/api/2/search?jql=&maxResults=0
Total Number of Issues: 1058418
✅ Jira API returned a successful response

💡 Check Jira: https://jira.hyperledger.org

Checking Jira url existence with GET: https://jira.hyperledger.org
✅ Provided Jira base url is reachable

Checking Jira api with GET: https://jira.hyperledger.org/rest/api/2/issuetype
✅ Jira API returned a successful response

Retrieving total issue count with GET: https://jira.hyperledger.org/rest/api/2/search?jql=&maxResults=0
Total Number of Issues: 25886
✅ Jira API returned a successful response

💡 Check Jira: https://jira.hpdd.intel.com

Checking Jira url existence with GET: https://jira

# Download Jira Data

In [5]:
def format_duration(start_time, end_time):
    # Get the total seconds of the duration
    seconds = end_time - start_time
    # Calculate the other time 
    milliseconds = int((seconds % 1) * 10000)
    minutes = int(seconds / 60)
    hours   = int(minutes / 60)
    # Trim the values to fit in their appopriate slots
    display_minutes = int(minutes % 60)
    display_seconds = int(seconds % 60)

    return f"{hours:02}:{display_minutes:02}:{display_seconds:02}.{milliseconds:04}"

In [14]:
# Write the result to a JSON
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue

    # Build the URL to get the information from
    jira_issuetype_url = jira_data['jira_url'] + '/rest/api/2/issuetype'

    try:
        # Get the issuetype definitions
        documented_issuetypes = {

            issuetype['name']: issuetype
            for issuetype in requests.get(jira_issuetype_url).json()
        }
    except:
        pass

        # Save the information
    output_json[jira_name] = documented_issuetypes

# Write JSON to file
with open('jira_issuetype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

In [15]:
# Write the result to a JSON
output_json = {}

for jira_name, jira_data in jira_data_sources.items():
    
    # Ignore Jiras that we know are now unreachable or empty
    if jira_name in INVALID_JIRAS:
        continue

    # Build the URL to get the information from
    jira_issuelinktype_url = jira_data['jira_url'] + '/rest/api/2/issueLinkType'

    try:
        # Get the issuelinktype definitions
        documented_issuelinktypes = {
            issuelinktype['name']: issuelinktype
            for issuelinktype in requests.get(jira_issuelinktype_url).json()['issueLinkTypes']
        }
    except:
        pass

    # Save the information
    output_json[jira_name] = documented_issuelinktypes

# Write JSON to file
with open('jira_issuelinktype_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_json, json_file, ensure_ascii=False, indent=4)

# JIRA REPOSITORY CRAWLER
## Explanation of what it does, and how it's done.
Param: Name of the project to crawl
Process: REST API of the Jira repo request with the selected attributes from projects' issues.
Returns: Writes a csv file with the downloaded data.

In [6]:
# API call for the list of projects from Jira Apache projects.
delta = requests.get("https://issues.apache.org/jira/rest/api/2/project")
jiraProjects = delta.json()

In [7]:
# CLEANING: Ignores the projects in the attic + stores the names of the rest.

activeProjects = [] # Projects that are not retired and have a clear category.

for project in jiraProjects: # Where all the projects handled with Jira are stored

    try:
        category = project['projectCategory']['name']
    except KeyError: # If the project doesn't have a category section, bad signal!
        continue

    if category == "Retired": # If the project is in the attic.
        continue
    else:
        activeProjects.append(project)

In [15]:
# MAIN FUNCTION
def jiraCrawler(projectLinkName, responseError):

    # Fixed parameters for the API call search query.
    startAt = 0
    maxResult = 100

    # list gathering all issue dictionaries.
    projectissuedataitems = []

    # List of attributes to be downloaded.
    issue_feature_list = ['project_ID', 'project_name', 'issue_ID', 'issue_type_ID', 'issue_type_name', 'key', 'user_login', 'state', 'state_key', 'issue_link', 'created_at', 'updated_at', 'resolution_at', 'active_time', 'title', 'body', 'description', 'watchcount']

    # Creation of the CSV file with the header.
    with open(f"/home/mikel/Desktop/project1/JiraDataCrawler/1.DataDownload/jiraProjectIssues/{projectLinkName}.csv", 'w', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([x for x in issue_feature_list])

    # It will loop until there's no more issues to crawl.
    while 1 == 1:

        # Link for the API call.
        jiraLink = "https://issues.apache.org/jira/rest/api/2/" + f"search?jql=project={projectLinkName}&startAt={startAt}&maxResults={maxResult}"

        # Request call.
        theResult = requests.get(jiraLink)

        # Control check to omit problematic requests:
        if theResult.status_code != 200:
            #responseError.append(projectLinkName)
            break
        # List of issues from the project.
        theItemListPerPage = theResult.json()['issues']

        if len(theItemListPerPage) == 0:
            break
        else: # If there're issues in the fetched json, the process starts.
            print(f"API call status: {theResult.headers['Keep-Alive']}")
            print(f"Length of issues: {len(theResult.json()['issues'])}")
            print(projectLinkName+' '+f"Starting at: {startAt}, Max results: {maxResult}")
            for item in theItemListPerPage:
                issueitem = {}
                # If the issue was closed before sonarQube existed (created on 2006), then it directly means
                # they didn't use SQ at all for that issue. Hence we first check that.
                resolutionDate = pd.Timestamp(item['fields']['resolutiondate'])
                if resolutionDate is not None and resolutionDate < pd.Timestamp("2006-01-01").tz_localize('UTC'):
                    responseError.append(item['id'])
                    continue

                issueitem['project_ID'] = item['fields']['project']['id']
                issueitem['project_name'] = item['fields']['project']['name']
                issueitem['issue_ID'] = item['id']
                issueitem['issue_type_ID'] = item['fields']['issuetype']['id']
                issueitem['issue_type_name'] = item['fields']['issuetype']['name']
                issueitem['key'] = item['key'] # Somehow like 'number' in Github
                try:
                    issueitem['user_login'] = item['fields']['creator']['name']
                except TypeError:
                    issueitem['user_login'] = 'unassigned'
                issueitem['state'] = item['fields']['status']['name']
                issueitem['state_key'] = item['fields']['status']['statusCategory']['key']
                issueitem['issue_link'] = item['self']
                issueitem['created_at'] = pd.Timestamp(item['fields']['created'])

                if item['fields']['updated'] == None:
                    issueitem['updated_at'] = np.NaN
                else:
                    issueitem['updated_at'] = pd.Timestamp(item['fields']['updated'])

                # There's a resolution date but it can be tricky to understand.
                # An issue can have a resolution date when being resolved and/or closed.
                if resolutionDate == None:
                    issueitem['resolution_at'] = np.NaN
                    issueitem['active_time(segs)'] = pd.Timedelta(issueitem['updated_at'] - issueitem['created_at']).total_seconds()
                else:
                    issueitem['resolution_at'] = resolutionDate
                    issueitem['active_time'] = pd.Timedelta(resolutionDate - issueitem['created_at']).total_seconds()

                issueitem['title'] = item['fields']['issuetype']['description']
                issueitem['body'] = item['fields']['status']['description']
                issueitem['description'] = item['fields']['description']
                issueitem['watchcount'] = item['fields']['watches']['watchCount']

                # All issues are stored in this list.
                projectissuedataitems.append(issueitem)
                # f"/home/mikel/Desktop/project1/JiraDataCrawler/1.DataDownload/jiraProjectIssues/{projectLinkName}.csv"
                with open(f"/home/mikel/Desktop/project1/JiraDataCrawler/1.DataDownload/jiraProjectIssues/{projectLinkName}.csv", 'a', encoding='utf-8') as file:
                    writer = csv.writer(file, delimiter=',')
                    writer.writerow([issueitem[x] for x in issue_feature_list])

            if startAt == 0:
                startAt += (maxResult)
            else:
                startAt += maxResult

    print(f"{projectLinkName} download completed!")

    return responseError

In [47]:
# Error of bad request cases fixed.
# Once the download is done, remove the index of project 'exec' value from the list "active projects"
errorRequest = 0
for project in activeProjects:
    jiraCrawler(project['key'].lower(), errorRequest)
    sleep(15) # Sleep 15 from project to project for timeouts

exec download completed!
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 100, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
fileupload Starting at: 200, Max results: 100
API call status: timeout=15, max=100
Length of issues: 44
fileupload Starting at: 300, Max results: 100
fileupload download completed!
API call status: timeout=15, max=100
Length of issues: 31
functor Starting at: 0, Max results: 100
functor download completed!
API call status: timeout=15, max=100
Length of issues: 100
geometry Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 35
geometry Starting at: 100, Max results: 100
geometry download completed!
API call status: timeout=15, max=100
Length of issues: 100
imaging Starting at: 0, Max results: 100
API call status: timeout=15, max=100
Length of issues: 100
imaging S