### **1. Import Dependencies**

In [2]:
import pandas as pd
import numpy as np

import gc
import os
import requests
import json
import re

import concurrent.futures
import asyncio
import httpx
import multiprocessing

from urllib.parse import urlencode

from typing import List, Dict, Optional, Union
from glob import glob

import joblib

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

import tensorflow as tf
import tensorflow.keras as keras

from keras.models import Model, Sequential
from keras.layers import Dense, Input, Lambda

# Fetch the NLTK resources needed further down by remove_stopwords():
# the English stop-word corpus and the 'punkt' models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msi.pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msi.pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### **1.1. Settings**

In [3]:
# Let pandas render up to 100 rows and 50 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

### **2. Load Data**

In [None]:
def fetch_data(url: str, params=None, timeout: float = 30):
  """Send a GET request and return the decoded JSON body.

  Args:
    url: Address to request.
    params: Optional query parameters forwarded to ``requests.get``.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block the caller (or a worker
      thread of the download pool) indefinitely.

  Returns:
    The parsed JSON payload when the server answers with status 200,
    otherwise ``None``. Failures are printed instead of raised.
  """
  try:
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code == 200:
      return response.json()

    print(f"Error response with status code: {response.status_code}")
  except Exception as error:
    # Deliberately best-effort: one failing page must not abort a bulk download.
    print(f'Failed to fetch data: {error}')

  return None

def urls_builder(base_url: str, n_fetch: int, limit: int, products: List[str], **kwargs):
  """Build the paginated request URLs for every product.

  For each product, ``n_fetch`` URLs are generated whose ``offset`` advances
  in steps of ``limit``. Extra keyword arguments are added to the query string
  of every URL and override a generated key of the same name.

  Returns:
    A list of URLs ordered by product first, then by page.
  """
  def page_query(product: str, page: int) -> str:
    query = dict(offset=page * limit, limit=limit, product=product)
    query.update(kwargs)
    return urlencode(query)

  return [
    base_url + '?' + page_query(product, page)
    for product in products
    for page in range(n_fetch)
  ]

def save_json(data, path: str):
  """Serialize ``data`` as JSON into ``path``, creating parent folders if needed.

  Args:
    data: Any JSON-serializable object.
    path: Destination file; an existing file is overwritten.
  """
  parent_dir = os.path.dirname(path)
  if parent_dir:
    # On a fresh checkout the data folders do not exist yet and open() alone
    # would raise FileNotFoundError. A bare file name has no parent to create.
    os.makedirs(parent_dir, exist_ok=True)

  with open(path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file)

def load_json(path: str):
  """Read the JSON document stored at ``path`` and return the parsed object."""
  with open(path, 'r') as source:
    return json.load(source)

In [None]:
%%script skipped # Skipped due to resource optimization

# Fields requested for every bug, and the products to download.
selected_fields = ['id', 'duplicates', 'summary', 'description', 'status', 'resolution', 'platform', 'product', 'type', 'priority', 'severity', 'component']
products = ['Core']

base_params = {
  'include_fields': ', '.join(selected_fields),
}
saved_data_path = os.path.join('data', 'raw_data', 'core_raw_data.json')

base_url = 'https://bugzilla.mozilla.org/rest/bug'
n_fetch = 50   # pages requested per product
limit = 5000   # bugs per page

if os.path.exists(saved_data_path):
  # Reuse the earlier download instead of querying the API again.
  response_data = load_json(saved_data_path)
else:
  urls = urls_builder(base_url, n_fetch, limit, products, **base_params)

  max_workers = 50
  with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
    pages = list(executor.map(fetch_data, urls))

  # fetch_data returns None for a failed request. Skip those pages instead of
  # crashing on None['bugs'] and throwing away every page that did download.
  failed_pages = sum(page is None for page in pages)
  if failed_pages:
    print(f'{failed_pages} of {len(pages)} requests failed and were skipped')

  response_data = [bug for page in pages if page is not None for bug in page['bugs']]

  # Make sure the target folder exists before writing the cache file.
  os.makedirs(os.path.dirname(saved_data_path), exist_ok=True)
  save_json(response_data, saved_data_path)

In [4]:
raw_data_path = os.path.join('data', 'cache', 'raw_data.parquet')

if os.path.exists(raw_data_path):
  # Fast path: reuse the parquet cache written by an earlier run.
  df = pd.read_parquet(raw_data_path)
else:
  # Sorted so the row order does not depend on the file system's listing order.
  data_paths = sorted(glob(os.path.join('data', 'raw_data', '*.json')))
  if not data_paths:
    raise FileNotFoundError(
      f"No raw JSON files found in {os.path.join('data', 'raw_data')}; run the download cell first."
    )

  # Build all frames first and concatenate once; concatenating inside the loop
  # re-copies the accumulated rows on every iteration.
  frames = [pd.DataFrame(load_json(path)) for path in data_paths]
  df = pd.concat(frames).set_index('id')

  # The cache folder may not exist on a fresh checkout.
  os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
  df.to_parquet(raw_data_path)

df.head()

Unnamed: 0_level_0,platform,summary,status,type,description,priority,resolution,product,severity,component,duplicates
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
406067,x86,Chrome error,RESOLVED,defect,Weave currently generates the following error ...,--,FIXED,Cloud Services,minor,General,[]
408325,x86,Can no longer synch between two machines runni...,VERIFIED,defect,"Since just after 8:12 PM on 12/11, I've been g...",--,FIXED,Cloud Services,normal,General,[]
409568,All,Arrow not pointing to weave icon on services.m...,RESOLVED,defect,See screenshot. The arrow is not pointing to t...,--,DUPLICATE,Cloud Services,minor,General,[]
409579,All,Mail from Weave detected as spam,RESOLVED,defect,The confirmation mail that Weave sent me was d...,--,FIXED,Cloud Services,major,General,[409713]
409600,All,First run page has an arrow that points to the...,RESOLVED,defect,The Weave extension's firstrun page has an arr...,--,WONTFIX,Cloud Services,normal,General,"[409568, 420386]"


In [5]:
# These columns are not referenced again in the cells that follow.
unused_columns = ['status', 'priority', 'resolution', 'severity', 'component']
df = df.drop(columns=unused_columns)

In [6]:
# memory_usage='deep' inspects the contents of the object columns, so the
# reported size covers the stored strings and lists, not just their pointers.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 1131641 entries, 406067 to 1876207
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   platform     1131641 non-null  object
 1   summary      1131641 non-null  object
 2   type         1131641 non-null  object
 3   description  1131641 non-null  object
 4   product      1131641 non-null  object
 5   duplicates   1131641 non-null  object
dtypes: object(6)
memory usage: 1.9 GB


### **3. EDA**

### **3.1. Missing Values**

In [None]:
# Count NaN/None cells per column. Placeholder strings such as '--' are
# ordinary values to isna() and are therefore not counted here.
df.isna().sum()

### **3.2. Product**

In [None]:
# Bugs per product; value_counts() already sorts from most to least frequent.
df['product'].value_counts()

### **3.3. Platform**

In [None]:
# Bugs per platform; value_counts() already sorts from most to least frequent.
df['platform'].value_counts()

### **3.4. Type**

In [None]:
# Bugs per type; value_counts() already sorts from most to least frequent.
df['type'].value_counts()

### **3.5. Duplicates**

In [None]:
# Length of each bug's duplicate-id list, largest first.
duplicate_counts = df['duplicates'].apply(len)
duplicates = duplicate_counts.sort_values(ascending=False)
duplicates

In [None]:
# Keep only the bugs that have at least one duplicate and count them.
has_duplicates = duplicates > 0
duplicated_bugs = duplicates[has_duplicates]
duplicated_bugs.count()

### **4. Feature Engineering**

### **4.1. Missing Values**

In [None]:
# Rows whose type is the placeholder '--' get the explicit label 'no type'.
df.loc[df['type'] == '--', 'type'] = 'no type'

### **4.2. Duplicates**

In [None]:
# Duplicate-id lists of only those bugs that have at least one duplicate,
# indexed by the id of the bug the duplicates point to.
duplicated = df.loc[duplicated_bugs.index, 'duplicates']
duplicated

In [None]:
data_path = os.path.join('data', 'cache', 'data.parquet')

if os.path.exists(data_path):
  df = pd.read_parquet(data_path)
else:
  # Invert "original id -> [duplicate ids]" into "duplicate id -> original id".
  # When an id is listed under several originals the last one seen wins.
  duplicate_to_original = {
    duplicate_id: original_id
    for original_id, duplicate_ids in duplicated.items()
    for duplicate_id in duplicate_ids
  }

  # One dictionary lookup per row instead of a boolean scan of the whole index
  # for every single duplicate id. Ids that are not in the frame are ignored,
  # and bugs that are nobody's duplicate keep the sentinel -1.
  df['duplicates_to'] = (
    df.index.to_series()
    .map(duplicate_to_original)
    .fillna(-1)
    .astype('int64')
  )

  df.to_parquet(data_path)

In [None]:
# The raw duplicate lists are no longer needed once 'duplicates_to' exists.
df = df.drop(columns=['duplicates'])

### **4.3. Clean Sentences**

In [None]:
def remove_special_chars(text: str):
  """Normalize raw text to lowercase letters/digits separated by single spaces.

  Steps: lowercase the text; replace every character other than ``a-z``,
  ``0-9`` and whitespace with a space; drop stand-alone single letters
  ``b``-``z`` (``a`` is kept); collapse whitespace runs and trim both ends.

  Args:
    text: Raw text to clean.

  Returns:
    The cleaned string, which may be empty.
  """
  text = text.lower()
  # Control characters such as \0 are not whitespace, so they are replaced
  # here too; \n, \t and \r become plain spaces in the final collapse below.
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
  # \b does not consume the neighbouring space and also matches at the string
  # edges, so one pass removes every single-letter token, including runs such
  # as "x y z" and letters at the very start or end of the text.
  text = re.sub(r'\b[b-z]\b', ' ', text)
  text = re.sub(r'\s+', ' ', text).strip()

  return text

# Filled on first use by remove_stopwords(); None means "not loaded yet".
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text: str):
  """Tokenize ``text`` and drop English stop words.

  The stop-word set is loaded from the NLTK corpus once and reused, because
  this function is applied to every row of several columns and rebuilding the
  set on each call would repeat the same work each time.

  Args:
    text: Text to filter (expected to be lowercased already, since the
      comparison against the stop-word list is case-sensitive).

  Returns:
    The remaining tokens joined by single spaces.
  """
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

  kept_words = [word for word in word_tokenize(text) if word not in _ENGLISH_STOP_WORDS]

  return ' '.join(kept_words)

def clean_text(text: str):
  """Clean ``text``: character normalization first, stop-word removal second."""
  return remove_stopwords(remove_special_chars(text))

# Run the cleaning pipeline over every text column, one column at a time.
for column in ('platform', 'summary', 'type', 'description', 'product'):
  df[column] = df[column].apply(clean_text)

### **4.4. Combined Text**

In [None]:
# Join the cleaned columns into one space-separated string per bug.
text_columns = ['type', 'platform', 'product', 'summary', 'description']

combined = df[text_columns[0]]
for column in text_columns[1:]:
  combined = combined + ' ' + df[column]

df['text'] = combined

In [None]:
# Keep only the combined text and the duplicate link; the individual text
# columns have been merged into 'text'.
df = df[['text', 'duplicates_to']]
# Ask the garbage collector to release the dropped columns right away.
gc.collect()

df.head()

### **4.5. Sentence Embedding**

In [None]:
# Pretrained sentence encoder used to embed each bug's combined text.
# NOTE(review): the Input(shape=(384, )) layer built later presumably matches
# this model's embedding size — confirm.
sent_embd_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
embd_data_path = os.path.join('data', 'cache', 'embd_data.parquet')

if os.path.exists(embd_data_path):
  df = pd.read_parquet(embd_data_path)
else:
  # Encode all texts in a single call so the model can process them in
  # batches; calling encode() once per row runs the network on one sentence
  # at a time. Values may differ from per-row encoding by float rounding only.
  embeddings = sent_embd_model.encode(df['text'].tolist(), show_progress_bar=True)
  # Store one 1-D vector per row, aligned with the frame's index.
  df['text_embedded'] = pd.Series(list(embeddings), index=df.index)
  df.to_parquet(embd_data_path)

# Guarantee NumPy arrays in the column whichever branch ran above
# (NOTE(review): the parquet round trip may return another container — confirm).
df['text_embedded'] = df['text_embedded'].apply(np.array)
gc.collect()

In [None]:
# Keep the cleaned text in a side frame that shares the same index, then drop
# it from the working frame so the remaining steps carry less data around.
df_id = df[['text']]
df = df.drop(columns=['text'])

gc.collect()

### **4.6. Sentence Pairs**

In [None]:
# Turn the bug id back into a regular column and keep a second copy as '_id',
# which is the column the pair tables below carry as '_id__left'/'_id__right'.
df = df.reset_index().assign(_id=lambda frame: frame['id'])

In [None]:
# -1 is the sentinel for "not a duplicate of any other bug".
is_unique = df['duplicates_to'] == -1

df_uniques = df[is_unique].copy()
df_duplicates = df[~is_unique].copy()

In [None]:
def custom_cosine_similarity(text_1: pd.Series, text_2: pd.Series):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        text_1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        text_2: Second vector of the same length.

    Returns:
        ``dot(text_1, text_2) / (|text_1| * |text_2|)``, or ``0.0`` when either
        vector has zero norm. The guard avoids the division by zero that would
        otherwise produce NaN and a RuntimeWarning.
    """
    norm_product = np.linalg.norm(text_1) * np.linalg.norm(text_2)
    if norm_product == 0:
        return 0.0

    return np.dot(text_1, text_2) / norm_product

In [None]:
# Positive pairs: join every duplicate to the unique bug it points to. The
# default inner join drops duplicates whose target is not in df_uniques.
pair_columns = ['text_embedded__left', 'text_embedded__right', '_id__left', '_id__right']

df_duplicates = (
  df_duplicates
  .merge(df_uniques, left_on='duplicates_to', right_on='id', suffixes=('__left', '__right'))
  [pair_columns]
  .assign(label=1)
  .reset_index(drop=True)
)

In [None]:
# Score every duplicate pair by the cosine similarity of its two embeddings
# (left = the duplicate, right = the bug it was merged with).
df_duplicates['cosine_similarity'] = df_duplicates.apply(lambda x: custom_cosine_similarity(x['text_embedded__left'], x['text_embedded__right']), axis=1)

df_duplicates.head()

In [None]:
half_rows = df_uniques.shape[0] // 2

# Negative pairs: match every unique bug with the unique bug 100 rows above it.
# The first 100 rows have no partner (NaN after the shift) and are dropped.
# NOTE(review): the shift introduces NaN, so '_id__right' presumably ends up
# with a float dtype — confirm whether downstream code cares.
df_uniques_temp = pd.DataFrame({
  'text_embedded__left': df_uniques['text_embedded'],
  '_id__left': df_uniques['_id'],
  'text_embedded__right': df_uniques['text_embedded'].shift(100),
  '_id__right': df_uniques['_id'].shift(100),
})
df_uniques_temp = df_uniques_temp.dropna().assign(label=0)

df_uniques = df_uniques_temp.reset_index(drop=True)
gc.collect()

In [None]:
# Score every non-duplicate pair by the cosine similarity of its two embeddings.
df_uniques['cosine_similarity'] = df_uniques.apply(lambda x: custom_cosine_similarity(x['text_embedded__left'], x['text_embedded__right']), axis=1)

df_uniques.head()

In [None]:
# Share of pairs in each group whose similarity exceeds the threshold,
# printed as percentages (non-duplicate pairs first, duplicate pairs second).
threshold = 0.7

unique_scores = df_uniques['cosine_similarity']
duplicate_scores = df_duplicates['cosine_similarity']

unique = unique_scores[unique_scores > threshold].count() / unique_scores.count()
duplicate = duplicate_scores[duplicate_scores > threshold].count() / duplicate_scores.count()

print(unique * 100, duplicate * 100)

In [None]:
# Seed the shuffle: the resulting row order feeds train_test_split, so an
# unseeded shuffle gives a different train/test partition on every run even
# though the split itself uses a fixed random_state.
df = (
  pd.concat([df_duplicates, df_uniques], axis=0)
  .sample(frac=1, random_state=42)
  .reset_index(drop=True)
)
gc.collect()

### **5. Data Split**

### **5.1. Train Test Split**

In [None]:
# Stratified 70/30 split so both parts keep the same share of duplicate pairs.
features = df.drop(columns=['label'])
labels = df['label']

X_train, X_test, y_train, y_test = train_test_split(
  features,
  labels,
  test_size=0.3,
  stratify=labels,
  shuffle=True,
  random_state=42,
)

for split_name, split in (('X_train', X_train), ('X_test', X_test), ('y_train', y_train), ('y_test', y_test)):
  print(f'Shape of {split_name}: {split.shape}')

### **6. Preprocessing**

In [None]:
inputs = Input(shape=(384, ))

x1 = Dense(128, activation="relu")(inputs)
x1 = Dense(64, activation="relu")(x1)
x1 = Dense(32, activation="relu")(x1)

x2 = Dense(128, activation="relu")(inputs)
x2 = Dense(64, activation="relu")(x2)
x2 = Dense(32, activation="relu")(x2)

distance = Lambda(lambda x: tf.abs(x[0] - x[1]))([x1, x2])