### **1. Import Dependencies**

In [111]:
import pandas as pd
import numpy as np

import gc
import os
import requests
import json

import concurrent.futures
import asyncio
import httpx
from urllib.parse import urlencode

from typing import List, Dict, Optional
from glob import glob

import joblib

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

from sentence_transformers import SentenceTransformer

### **1.1. Settings**

In [121]:
# Let pandas render up to 100 rows and 50 columns before truncating output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

### **2. Load Data**

In [2]:
def fetch_data(url: str, params=None, timeout: float = 60.0):
  """Issue an HTTP GET to `url` and return the decoded JSON body.

  Args:
    url: Address to request.
    params: Optional query parameters forwarded to `requests.get`.
    timeout: Seconds to wait on the server before giving up. Without a
      timeout a stalled connection would block the calling thread forever.

  Returns:
    The parsed JSON payload when the server answers with HTTP 200.
    `None` otherwise (non-200 status, network failure, timeout, or a body
    that is not valid JSON); a diagnostic message is printed in that case.
  """
  try:
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code == 200:
      return response.json()

    print(f"Error response with status code: {response.status_code}")
  except Exception as error:
    # Best-effort fetch: report the problem and let the caller decide what
    # to do with the missing page instead of aborting here.
    print(f'Failed to fetch data: {error}')

  # Explicit so callers can rely on `None` meaning "no data for this URL".
  return None

def urls_builder(base_url: str, n_fetch: int, limit: int, products: List[str], **kwargs):
  """Build one paginated query URL per (product, page) combination.

  For every product, `n_fetch` URLs are produced whose `offset` advances in
  steps of `limit`. Extra keyword arguments are appended as additional query
  parameters and take precedence over the generated ones on a name clash.

  Returns:
    List of URL strings, grouped by product in the given order.
  """
  def query_string(product_name, page_index):
    query = {
      'offset': page_index * limit,
      'limit': limit,
      'product': product_name,
    }
    query.update(kwargs)
    return urlencode(query)

  return [
    base_url + '?' + query_string(product_name, page_index)
    for product_name in products
    for page_index in range(n_fetch)
  ]

def save_json(data, path: str):
  """Write `data` to `path` as JSON, creating missing parent directories.

  Args:
    data: Any JSON-serializable object.
    path: Destination file; an existing file is overwritten.
  """
  parent_dir = os.path.dirname(path)
  # `dirname` is '' for a bare file name, and makedirs('') would raise.
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # JSON is UTF-8 by specification; do not depend on the platform default.
  with open(path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file)

def load_json(path: str):
  """Read the JSON document stored at `path` and return the decoded object.

  The file is decoded as UTF-8 (the JSON interchange encoding) rather than
  with the platform default, so non-ASCII text loads the same everywhere.
  """
  with open(path, 'r', encoding='utf-8') as json_file:
    return json.load(json_file)

In [66]:
# Bug attributes requested from the Bugzilla REST API.
selected_fields = ['id', 'duplicates', 'summary', 'status', 'resolution', 'platform', 'product', 'type', 'priority', 'severity', 'component']
products = ['Firefox']

base_params = {
  'include_fields': ', '.join(selected_fields),
}
saved_data_path = os.path.join('data', 'firefox_raw_data.json')

base_url = 'https://bugzilla.mozilla.org/rest/bug'
n_fetch = 20    # number of pages requested per product
limit = 10000   # records per page (also the offset step between pages)

if os.path.exists(saved_data_path):
  # Reuse the cached download so a full re-run does not hit the API again.
  response_data = load_json(saved_data_path)
else:
  # Create the cache directory up front: failing on a missing directory
  # only after the whole download finished would throw the data away.
  os.makedirs(os.path.dirname(saved_data_path), exist_ok=True)

  urls = urls_builder(base_url, n_fetch, limit, products, **base_params)

  # Never start more threads than there are URLs (and at least one, which
  # ThreadPoolExecutor requires).
  max_workers = max(1, min(100, len(urls)))
  with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
    pages = list(executor.map(fetch_data, urls))

  # fetch_data yields None for a failed request. Stop with a clear message
  # rather than crashing on `None['bugs']` or caching an incomplete dataset.
  failed_urls = [url for url, page in zip(urls, pages) if page is None]
  if failed_urls:
    raise RuntimeError(
      f'{len(failed_urls)} of {len(urls)} requests failed; nothing was cached. '
      f'Failed URLs: {failed_urls}'
    )

  # Flatten the per-page 'bugs' lists into one list of bug records.
  response_data = [bug for page in pages for bug in page['bugs']]

  save_json(response_data, saved_data_path)

In [108]:
# Sort the paths: glob order depends on the file system, and the row order
# of the combined frame should be the same on every run.
data_paths = sorted(glob(os.path.join('data', '*.json')))

if not data_paths:
  raise FileNotFoundError("No '*.json' files found in the 'data' directory")

# Build every per-file frame first and concatenate once; growing a frame
# with concat inside a loop re-copies all previously loaded rows each time.
df = pd.concat([pd.DataFrame(load_json(path)) for path in data_paths])

df = df.set_index('id')
df.head()

Unnamed: 0_level_0,duplicates,product,resolution,priority,summary,severity,status,platform,component,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
406067,[],Cloud Services,FIXED,--,Chrome error,minor,RESOLVED,x86,General,defect
408325,[],Cloud Services,FIXED,--,Can no longer synch between two machines runni...,normal,VERIFIED,x86,General,defect
409568,[],Cloud Services,DUPLICATE,--,Arrow not pointing to weave icon on services.m...,minor,RESOLVED,All,General,defect
409579,[409713],Cloud Services,FIXED,--,Mail from Weave detected as spam,major,RESOLVED,All,General,defect
409600,"[409568, 420386]",Cloud Services,WONTFIX,--,First run page has an arrow that points to the...,normal,RESOLVED,All,General,defect


In [109]:
# Column dtypes and non-null counts; 'deep' also measures the Python string
# objects held in object columns, giving the real memory footprint.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 1131592 entries, 406067 to 1876125
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   duplicates  1131592 non-null  object
 1   product     1131592 non-null  object
 2   resolution  1131592 non-null  object
 3   priority    1131592 non-null  object
 4   summary     1131592 non-null  object
 5   severity    1131592 non-null  object
 6   status      1131592 non-null  object
 7   platform    1131592 non-null  object
 8   component   1131592 non-null  object
 9   type        1131592 non-null  object
dtypes: object(10)
memory usage: 769.4 MB


### **3. EDA**

### **3.1. Missing Values**

In [126]:
# Count NaN/None entries per column.
# NOTE: placeholder strings are not counted here. The value counts below show
# entries such as '--', 'N/A', 'Unspecified' and an empty resolution, which
# isna() treats as ordinary values.
df.isna().sum()

duplicates    0
product       0
resolution    0
priority      0
summary       0
severity      0
status        0
platform      0
component     0
type          0
dtype: int64

### **3.2. Product**

In [120]:
# Number of bugs per product, most frequent first.
df['product'].value_counts(ascending=False)

product
Core                                   500000
Firefox                                210650
SeaMonkey                               96704
Thunderbird                             68967
Toolkit                                 60076
DevTools                                42318
Testing                                 37572
Firefox Build System                    24496
Mozilla Localizations                   14491
WebExtensions                           11405
Cloud Services                           9948
Websites                                 8595
Firefox for iOS                          7226
Tree Management                          6852
Remote Protocol                          6287
Fenix                                    6231
Web Compatibility                        5992
GeckoView                                4909
Hello (Loop)                             3331
Webtools                                 2254
Input                                    1064
Shield                    

### **3.3. Resolution**

In [122]:
# Number of bugs per resolution, most frequent first. In this run the output
# includes an empty-string resolution.
df['resolution'].value_counts(ascending=False)

resolution
FIXED         435203
DUPLICATE     207028
              129433
WORKSFORME    122231
INCOMPLETE     95960
INVALID        81294
WONTFIX        46331
EXPIRED         8849
INACTIVE        4512
MOVED            751
Name: count, dtype: int64

### **3.4. Priority**

In [128]:
# Number of bugs per priority, most frequent first; '--' is the most common
# value in this run.
df['priority'].value_counts(ascending=False)

priority
--    763048
P3    145837
P5     81812
P2     54671
P1     52002
P4     34222
Name: count, dtype: int64

### **3.5. Severity**

In [129]:
# Number of bugs per severity, most frequent first. The output of this run
# mixes word labels (normal, critical, ...) with S1-S4 codes and the
# placeholders '--' and 'N/A'.
df['severity'].value_counts(ascending=False)

severity
normal         680726
S3             115244
--              95645
critical        70737
major           53828
S4              45603
minor           28913
N/A             15658
trivial         11790
blocker          6686
S2               6218
S1                499
enhancement        45
Name: count, dtype: int64

### **3.6. Status**

In [130]:
# Number of bugs per status, most frequent first.
df['status'].value_counts(ascending=False)

status
RESOLVED       841157
VERIFIED       160019
NEW             99645
UNCONFIRMED     22094
REOPENED         5075
ASSIGNED         2619
CLOSED            983
Name: count, dtype: int64

### **3.7. Platform**

In [131]:
# Number of bugs per hardware platform, most frequent first.
df['platform'].value_counts(ascending=False)

platform
Unspecified    381749
x86            327986
All            276949
x86_64          76594
PowerPC         24133
ARM             14795
Desktop         14357
Other           10254
Sun              2640
ARM64            1117
DEC               454
HP                341
SGI               188
RISCV64            31
XScale              4
Name: count, dtype: int64

### **3.8. Component**

In [140]:
# Number of bugs per component, most frequent first. The displayed output is
# truncated by pandas because the column has many distinct values.
df['component'].value_counts(ascending=False)

component
General                   164761
DOM: Core & HTML           43368
JavaScript Engine          42674
Layout                     33973
Untriaged                  28837
                           ...  
bo / Tibetan                   1
Morgoth                        1
Operations: Activedata         1
Operations: Location           1
Operations: Pageshot           1
Name: count, Length: 693, dtype: int64

### **3.9. Type**

In [133]:
# Number of bugs per type, most frequent first.
df['type'].value_counts(ascending=False)

type
defect         924476
enhancement    128147
task            78959
--                 10
Name: count, dtype: int64