In [3]:
pip install transformers torch scikit-learn pandas matplotlib



In [4]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import re
from nltk.probability import FreqDist
import matplotlib.cm as cm

from sklearn.cluster import MiniBatchKMeans

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Load bug data

In [5]:
pd.set_option=("display.max_colwidth",'100')

df1=pd.read_csv("bugzilla_3 (1).csv")
df1.tail()

Unnamed: 0,type,summary,product,comp,assignee,status,resolution,updated
9995,,Firefox wants to install helper application wi...,Toolkit,Password Manager,nobody,UNCO,---,Wed 12:26
9996,,[macOS] Opening the library produces a pile of...,Firefox,Bookmarks & History,gtatum,NEW,---,Wed 13:00
9997,,Audio and video desynchronization on Twitch us...,Core,Audio/Video: Playbac,nobody,UNCO,---,Wed 15:44
9998,,Access native messaging host with file write p...,Firefox,Messaging System,nobody,UNCO,---,Wed 20:42
9999,,Bookmark submenus have inconsistent look on ma...,Firefox,Theme,nobody,NEW,---,Wed 21:33


## Drop unnecessary columns

In [6]:
df2=df1.drop(['type','resolution','status','assignee',],axis=1)
#df2=df1.drop(['type','resolution','status','assignee'],axis=1)
df2['summary']=df2['summary'].str.lower()
df2.head()
df3=df2
df3

Unnamed: 0,summary,product,comp,updated
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12
...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42


## Removing punctuations from the summary

In [7]:
#removing punctuations from the summary

import string

string.punctuation

#creating a function
def remove_punctuation(txt):
    txt_nopunt="".join([c for c in txt if c not in string.punctuation])
    return txt_nopunt

df3['msg_clean']=df3['summary'].apply(lambda x:remove_punctuation(x))
df3

Unnamed: 0,summary,product,comp,updated,msg_clean
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11,extremely laggy zoom and scroll in firefox mob...
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45,firefox 10605 on macbook intel os ventura page...
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10,pdf doesnt prompt for location with always ask...
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29,in the newest ubuntu 2210 release firefox snap...
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12,issues with download negative download speed ...
...,...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26,firefox wants to install helper application wi...
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00,macos opening the library produces a pile of a...
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44,audio and video desynchronization on twitch us...
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42,access native messaging host with file write p...


## Tokenize the summary

In [8]:
import re

def tokenize(text):
    tokens=re.split('\W+',text)
    return tokens

df3['summary_tokenized']=df3['msg_clean'].apply(lambda x: tokenize(x.lower()))
df3

Unnamed: 0,summary,product,comp,updated,msg_clean,summary_tokenized
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11,extremely laggy zoom and scroll in firefox mob...,"[extremely, laggy, zoom, and, scroll, in, fire..."
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45,firefox 10605 on macbook intel os ventura page...,"[firefox, 10605, on, macbook, intel, os, ventu..."
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10,pdf doesnt prompt for location with always ask...,"[pdf, doesnt, prompt, for, location, with, alw..."
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29,in the newest ubuntu 2210 release firefox snap...,"[in, the, newest, ubuntu, 2210, release, firef..."
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12,issues with download negative download speed ...,"[issues, with, download, negative, download, s..."
...,...,...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26,firefox wants to install helper application wi...,"[firefox, wants, to, install, helper, applicat..."
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00,macos opening the library produces a pile of a...,"[macos, opening, the, library, produces, a, pi..."
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44,audio and video desynchronization on twitch us...,"[audio, and, video, desynchronization, on, twi..."
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42,access native messaging host with file write p...,"[access, native, messaging, host, with, file, ..."


## Remove stop-words

In [9]:
#importing nltk to remove stopwords from the summary_tokenized

import nltk
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')

def remove_stopwords(txt_tokenized):
    txt_clean=[word for word in txt_tokenized if word not in stopwords]
    return txt_clean


df3['no_swds']=df3['summary_tokenized'].apply(lambda x: remove_stopwords(x))
df3

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,summary,product,comp,updated,msg_clean,summary_tokenized,no_swds
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11,extremely laggy zoom and scroll in firefox mob...,"[extremely, laggy, zoom, and, scroll, in, fire...","[extremely, laggy, zoom, scroll, firefox, mobi..."
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45,firefox 10605 on macbook intel os ventura page...,"[firefox, 10605, on, macbook, intel, os, ventu...","[firefox, 10605, macbook, intel, os, ventura, ..."
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10,pdf doesnt prompt for location with always ask...,"[pdf, doesnt, prompt, for, location, with, alw...","[pdf, doesnt, prompt, location, always, ask, s..."
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29,in the newest ubuntu 2210 release firefox snap...,"[in, the, newest, ubuntu, 2210, release, firef...","[newest, ubuntu, 2210, release, firefox, snap,..."
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12,issues with download negative download speed ...,"[issues, with, download, negative, download, s...","[issues, download, negative, download, speed, ]"
...,...,...,...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26,firefox wants to install helper application wi...,"[firefox, wants, to, install, helper, applicat...","[firefox, wants, install, helper, application,..."
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00,macos opening the library produces a pile of a...,"[macos, opening, the, library, produces, a, pi...","[macos, opening, library, produces, pile, atte..."
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44,audio and video desynchronization on twitch us...,"[audio, and, video, desynchronization, on, twi...","[audio, video, desynchronization, twitch, usin..."
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42,access native messaging host with file write p...,"[access, native, messaging, host, with, file, ...","[access, native, messaging, host, file, write,..."


## Lemmatize words

In [10]:
from nltk.corpus import wordnet
import nltk
import re

nltk.download('wordnet')
nltk.download('omw-1.4')
wn = nltk.WordNetLemmatizer()

# Define a function to check if a word has a dictionary meaning. Input: word
def word_has_dictionary_meaning(word):
    synsets = wordnet.synsets(word)
    return len(synsets) > 0

# Input: array of words as token
def clean_numbers(token_txt):
    number_pattern = r'^-?\d+$'
    text = [word for word in token_txt if not re.match(number_pattern, word)]
    return text

# Define a function to lemmatize words. Input: array of words as token
def lemmatization(token_txt):
    text = [wn.lemmatize(word) for word in token_txt]
    return text

# Apply the lemmatization function to the 'no_swds' column
df3['filtered_no_swds'] = df3['no_swds'].apply(lambda words: [word for word in words if word_has_dictionary_meaning(word)])

df3['lemmatized_no_swds'] = df3['filtered_no_swds'].apply(lambda x: lemmatization(x))

df3['clean_lemma']=df3['lemmatized_no_swds'].apply(lambda words: clean_numbers(words))

df3

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,summary,product,comp,updated,msg_clean,summary_tokenized,no_swds,filtered_no_swds,lemmatized_no_swds,clean_lemma
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11,extremely laggy zoom and scroll in firefox mob...,"[extremely, laggy, zoom, and, scroll, in, fire...","[extremely, laggy, zoom, scroll, firefox, mobi...","[extremely, zoom, scroll, mobile, google, pixe...","[extremely, zoom, scroll, mobile, google, pixe...","[extremely, zoom, scroll, mobile, google, pixe..."
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45,firefox 10605 on macbook intel os ventura page...,"[firefox, 10605, on, macbook, intel, os, ventu...","[firefox, 10605, macbook, intel, os, ventura, ...","[os, pages, crash, consumption, memory]","[o, page, crash, consumption, memory]","[o, page, crash, consumption, memory]"
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10,pdf doesnt prompt for location with always ask...,"[pdf, doesnt, prompt, for, location, with, alw...","[pdf, doesnt, prompt, location, always, ask, s...","[prompt, location, always, ask, save, files, c...","[prompt, location, always, ask, save, file, ch...","[prompt, location, always, ask, save, file, ch..."
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29,in the newest ubuntu 2210 release firefox snap...,"[in, the, newest, ubuntu, 2210, release, firef...","[newest, ubuntu, 2210, release, firefox, snap,...","[newest, release, snap, occasionally, open, st...","[newest, release, snap, occasionally, open, st...","[newest, release, snap, occasionally, open, st..."
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12,issues with download negative download speed ...,"[issues, with, download, negative, download, s...","[issues, download, negative, download, speed, ]","[issues, download, negative, download, speed]","[issue, download, negative, download, speed]","[issue, download, negative, download, speed]"
...,...,...,...,...,...,...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26,firefox wants to install helper application wi...,"[firefox, wants, to, install, helper, applicat...","[firefox, wants, install, helper, application,...","[wants, install, helper, application, descript...","[want, install, helper, application, description]","[want, install, helper, application, description]"
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00,macos opening the library produces a pile of a...,"[macos, opening, the, library, produces, a, pi...","[macos, opening, library, produces, pile, atte...","[opening, library, produces, pile, attempt, ov...","[opening, library, produce, pile, attempt, ove...","[opening, library, produce, pile, attempt, ove..."
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44,audio and video desynchronization on twitch us...,"[audio, and, video, desynchronization, on, twi...","[audio, video, desynchronization, twitch, usin...","[audio, video, desynchronization, twitch, using]","[audio, video, desynchronization, twitch, using]","[audio, video, desynchronization, twitch, using]"
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42,access native messaging host with file write p...,"[access, native, messaging, host, with, file, ...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,..."


## Extract meaningful lemma

In [11]:
# Check if words were lemmatized
df3['lemmatized_check'] = df3.apply(lambda row: any(original != lemmatized for original, lemmatized in zip(row['no_swds'], row['lemmatized_no_swds'])), axis=1)

# Filter rows where lemmatization was applied
lemmatized_rows = df3[df3['lemmatized_check']]

# Display the rows where lemmatization was applied
print(lemmatized_rows[['no_swds', 'clean_lemma']])

                                                no_swds  \
0     [extremely, laggy, zoom, scroll, firefox, mobi...   
1     [firefox, 10605, macbook, intel, os, ventura, ...   
2     [pdf, doesnt, prompt, location, always, ask, s...   
3     [newest, ubuntu, 2210, release, firefox, snap,...   
4       [issues, download, negative, download, speed, ]   
...                                                 ...   
9993                  [meta, associate, pdf, firefox, ]   
9995  [firefox, wants, install, helper, application,...   
9996  [macos, opening, library, produces, pile, atte...   
9998  [access, native, messaging, host, file, write,...   
9999  [bookmark, submenus, inconsistent, look, macos, ]   

                                            clean_lemma  
0     [extremely, zoom, scroll, mobile, google, pixe...  
1                 [o, page, crash, consumption, memory]  
2     [prompt, location, always, ask, save, file, ch...  
3     [newest, release, snap, occasionally, open, st...  
4

In [12]:
df3['meaningful_lemm_str'] = df3['clean_lemma'].apply(lambda x: ' '.join(x))
df3

Unnamed: 0,summary,product,comp,updated,msg_clean,summary_tokenized,no_swds,filtered_no_swds,lemmatized_no_swds,clean_lemma,lemmatized_check,meaningful_lemm_str
0,extremely laggy zoom and scroll in firefox mob...,Fenix,Performance,1:46:11,extremely laggy zoom and scroll in firefox mob...,"[extremely, laggy, zoom, and, scroll, in, fire...","[extremely, laggy, zoom, scroll, firefox, mobi...","[extremely, zoom, scroll, mobile, google, pixe...","[extremely, zoom, scroll, mobile, google, pixe...","[extremely, zoom, scroll, mobile, google, pixe...",True,extremely zoom scroll mobile google pixel pro
1,firefox 106.0.5 on macbook intel os ventura pa...,Core,Graphics,2:05:45,firefox 10605 on macbook intel os ventura page...,"[firefox, 10605, on, macbook, intel, os, ventu...","[firefox, 10605, macbook, intel, os, ventura, ...","[os, pages, crash, consumption, memory]","[o, page, crash, consumption, memory]","[o, page, crash, consumption, memory]",True,o page crash consumption memory
2,"pdf doesn't prompt for location with ""always a...",Firefox,Downloads Panel,2:32:10,pdf doesnt prompt for location with always ask...,"[pdf, doesnt, prompt, for, location, with, alw...","[pdf, doesnt, prompt, location, always, ask, s...","[prompt, location, always, ask, save, files, c...","[prompt, location, always, ask, save, file, ch...","[prompt, location, always, ask, save, file, ch...",True,prompt location always ask save file checked
3,"in the newest ubuntu 22.10 release, firefox sn...",Core,Widget: Gtk,6:45:29,in the newest ubuntu 2210 release firefox snap...,"[in, the, newest, ubuntu, 2210, release, firef...","[newest, ubuntu, 2210, release, firefox, snap,...","[newest, release, snap, occasionally, open, st...","[newest, release, snap, occasionally, open, st...","[newest, release, snap, occasionally, open, st...",True,newest release snap occasionally open stuck bl...
4,issues with download negative download speed ...,Firefox,Downloads Panel,7:01:12,issues with download negative download speed ...,"[issues, with, download, negative, download, s...","[issues, download, negative, download, speed, ]","[issues, download, negative, download, speed]","[issue, download, negative, download, speed]","[issue, download, negative, download, speed]",True,issue download negative download speed
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,firefox wants to install helper application wi...,Toolkit,Password Manager,Wed 12:26,firefox wants to install helper application wi...,"[firefox, wants, to, install, helper, applicat...","[firefox, wants, install, helper, application,...","[wants, install, helper, application, descript...","[want, install, helper, application, description]","[want, install, helper, application, description]",True,want install helper application description
9996,[macos] opening the library produces a pile of...,Firefox,Bookmarks & History,Wed 13:00,macos opening the library produces a pile of a...,"[macos, opening, the, library, produces, a, pi...","[macos, opening, library, produces, pile, atte...","[opening, library, produces, pile, attempt, ov...","[opening, library, produce, pile, attempt, ove...","[opening, library, produce, pile, attempt, ove...",True,opening library produce pile attempt override ...
9997,audio and video desynchronization on twitch us...,Core,Audio/Video: Playbac,Wed 15:44,audio and video desynchronization on twitch us...,"[audio, and, video, desynchronization, on, twi...","[audio, video, desynchronization, twitch, usin...","[audio, video, desynchronization, twitch, using]","[audio, video, desynchronization, twitch, using]","[audio, video, desynchronization, twitch, using]",False,audio video desynchronization twitch using
9998,access native messaging host with file write p...,Firefox,Messaging System,Wed 20:42,access native messaging host with file write p...,"[access, native, messaging, host, with, file, ...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,...","[access, native, messaging, host, file, write,...",True,access native messaging host file write permis...


In [13]:
from transformers import BertTokenizer, BertModel
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
def generate_batches(texts, tokenizer, max_length, batch_size):
    num_batches = (len(texts) + batch_size - 1) // batch_size  # Adjust for the last incomplete batch
    for i in range(num_batches):
        batch = texts[i * batch_size: (i + 1) * batch_size]
        tokenized_texts = [tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        ) for text in batch]
        yield tokenized_texts

In [15]:
# Extract the text data from df3['meaningful_lemm_str']
text_data = df3['meaningful_lemm_str'].tolist()

# Process tokenized texts in batches using the generator
batch_generator = generate_batches(text_data, tokenizer, max_length=128, batch_size=8)

embeddings = []
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
for tokenized_texts in batch_generator:
    input_ids_list = [item['input_ids'] for item in tokenized_texts if item['input_ids'] is not None]
    attention_masks_list = [item['attention_mask'] for item in tokenized_texts if item['attention_mask'] is not None]

    if input_ids_list and attention_masks_list:
        input_ids = torch.cat(input_ids_list, dim=0)
        attention_masks = torch.cat(attention_masks_list, dim=0)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
            batch_embeddings = outputs[0][:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)

In [None]:
embeddings = np.array(embeddings)

min_threshold = 0.3
max_threshold = 1.0

# Perform the comparison after converting embeddings to a NumPy array
threshold_mask = (embeddings > min_threshold) & (embeddings < max_threshold)
max_num_tokens = threshold_mask.shape[1]

# Get the maximum number of tokens from the embeddings
max_num_tokens = threshold_mask.shape[1]

# Tokenize the text to get BERT tokens
tokenized_texts = df3['meaningful_lemm_str'].apply(lambda x: tokenizer.tokenize(x))

# Flatten the list of tokens
flat_tokens = [token for sublist in tokenized_texts for token in sublist]

# Get the unique tokens
unique_tokens = list(set(flat_tokens))

# Iterate over the threshold mask and print words with their corresponding embeddings
for i, doc_mask in enumerate(threshold_mask):
    print(f"Document {i + 1}:")
    for j, masked in enumerate(doc_mask[:max_num_tokens]):
        if masked.any():  # Check if any element in 'masked' meets the condition
            if j < len(embeddings[i]):
                token = unique_tokens[j]
                # Convert the array to a string for printing
                embedding_values = ', '.join(f'{val:.4f}' for val in embeddings[i][j])
                print(f"{token}: {embedding_values}")

In [None]:
model.save_pretrained('bert_model_directory')
tokenizer.save_pretrained('bert_tokenizer_directory')

In [None]:
model = BertModel.from_pretrained('bert_model_directory')
tokenizer = BertTokenizer.from_pretrained('bert_tokenizer_directory')

In [None]:
df3

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reshape the embeddings array to have (num_documents * sentence_length, embedding_dim)
reshaped_embeddings = embeddings.reshape(-1, embeddings.shape[-1])

# Apply KMeans clustering to the reshaped_embeddings
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(reshaped_embeddings)

# Get cluster labels for each document
cluster_labels = kmeans.labels_

# Reduce dimensionality using PCA for visualization
pca = PCA(n_components=2)

# Apply PCA on reshaped embeddings
reduced_data = pca.fit_transform(reshaped_embeddings)

# Visualize clusters using scatter plot
plt.figure(figsize=(10, 8))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis')
plt.title('K-Means Clustering with BERT Embeddings', fontsize=20)
plt.xlabel('Principal Component 1', fontsize=20)
plt.ylabel('Principal Component 2', fontsize=20)
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Step 1: Aggregate tokens for each cluster
cluster_tokens = {}
for i, label in enumerate(cluster_labels):
    if label not in cluster_tokens:
        cluster_tokens[label] = []
    cluster_tokens[label].extend(tokenized_texts[i])

# Step 2: Create word clouds for each cluster
for label, tokens in cluster_tokens.items():
    cluster_words = " ".join(tokens)

    # Create word cloud using the cluster_words data
    if cluster_words:
        wordcloud = WordCloud(width=800, height=400, max_words=150, background_color='white').generate(cluster_words)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'Cluster {label} Word Cloud')
        plt.axis('off')
        plt.show()
    else:
        print(f"No documents with words in Cluster {label}")


In [None]:
from sklearn.cluster import KMeans

# Reshape the embeddings array to have (num_documents * sentence_length, embedding_dim)
reshaped_embeddings = embeddings.reshape(-1, embeddings.shape[-1])

# Define a range of cluster numbers to try
num_clusters_range = range(1, 11)  # You can adjust the range as needed

# Initialize a list to store the inertia values
inertia_values = []

# Fit KMeans for each value of k and calculate inertia
for k in num_clusters_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(reshaped_embeddings)
    inertia_values.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(num_clusters_range, inertia_values, marker='o')
plt.title('Elbow Curve for KMeans', fontsize=20)
plt.xlabel('Number of Clusters (k)', fontsize=20)
plt.ylabel('Inertia', fontsize=20)
plt.xticks(num_clusters_range)
plt.show()

In [None]:
# If DataFrame length and cluster labels length are different
if len(df3) != len(kmeans.labels_):
    # Ensure DataFrame length and cluster labels length match
    df3 = df3[:len(kmeans.labels_)]  # Truncate DataFrame to match cluster labels length
    # Now assign cluster labels to the DataFrame
    df3['cluster_labels'] = kmeans.labels_


In [None]:
df3['cluster_labels'] = kmeans.labels_
unique, counts = np.unique(cluster_labels, return_counts=True)
print(dict(zip(unique, counts)))


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification

# Assuming 'df' is your DataFrame containing the bug reports
X = df3['meaningful_lemm_str']  # Feature: Bug reports
y = df3['cluster_labels']  # Labels: Cluster labels

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df3['cluster_labels'].unique()))

# Tokenize input sequences
def tokenize_sentences(sentences, tokenizer):
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sent,
                            add_special_tokens=True,
                            max_length=128,
                            padding='max_length',
                            return_attention_mask=True,
                            return_tensors='pt',
                            truncation=True
                      )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


In [None]:

X_train_ids, X_train_attention = tokenize_sentences(X_train, tokenizer)
X_test_ids, X_test_attention = tokenize_sentences(X_test, tokenizer)

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_ids, X_train_attention, torch.tensor(y_train.values))
test_dataset = TensorDataset(X_test_ids, X_test_attention, torch.tensor(y_test.values))

train_dataloader = DataLoader(train_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Fine-tune BERT model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
model.train()
for epoch in range(3):  # Adjust epochs as needed
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        b_labels = b_labels.to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
true_labels = []



In [None]:
for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits

    preds = torch.argmax(logits, dim=1)
    predictions.extend(preds.tolist())
    true_labels.extend(b_labels.tolist())

# Evaluate the model
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")

# Classification report
print(classification_report(true_labels, predictions))

In [None]:
def tokenize_sentence(sentence, tokenizer):
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']


In [None]:
def create_dataloader(sentences, tokenizer):
    input_ids = []
    attention_masks = []

    for sent in sentences:
        ids, mask = tokenize_sentence(sent, tokenizer)
        input_ids.append(ids)
        attention_masks.append(mask)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, batch_size=8)

    return dataloader

In [None]:
def predict_labels(df3, model, tokenizer):
    text_data = df['meaningful_lemm_str'].tolist()
    dataloader = create_dataloader(text_data, tokenizer)

    predictions = []

    for batch in dataloader:
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids.to(device), attention_mask=b_input_mask.to(device))
            logits = outputs.logits

        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.tolist())

    df3['text_label'] = predictions
    return df3

# Use the function to predict labels and add them to the DataFrame
df3 = predict_labels(df3, model, tokenizer)

In [None]:
# The above is the end of the model creation and model tuning

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, classification_report

# df3 = df3[:len(embeddings)]

# # Assuming 'df3' is your DataFrame containing 'cluster_labels'
# X = embeddings  # Features: BERT embeddings
# y = df3['cluster_labels']  # Labels: Cluster labels

# # Splitting the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Reshape the embeddings to 2D arrays
# X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
# X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

# # Initialize and train RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train_reshaped, y_train)

# # Predict on test set
# predictions = clf.predict(X_test_reshaped)

# # Evaluate the model
# accuracy = accuracy_score(y_test, predictions)
# print(f"Accuracy: {accuracy}")

# # Classification report
# print(classification_report(y_test, predictions))

In [40]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np  # Import necessary libraries

# Set random state for reproducibility
random_state = 42  # You can choose any integer value as per your preference

# Apply KMeans clustering to BERT embeddings
num_clusters = 3  # Choose the number of clusters

# Create a KMeans instance
kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)

# Fit KMeans to the BERT embeddings
kmeans.fit(embeddings)

# Get cluster labels for each document
cluster_labels = kmeans.labels_

# Reduce dimensionality using PCA for visualization
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(embeddings)

# Visualize clusters using scatter plot
plt.figure(figsize=(10, 8))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis')
plt.title('K-Means Clustering with BERT Embeddings', fontsize=20)
plt.xlabel('Principal Component 1', fontsize=20)
plt.ylabel('Principal Component 2', fontsize=20)
plt.show()

ValueError: Found array with dim 3. KMeans expected <= 2.

In [None]:
# Assuming 'threshold_mask', 'embeddings', 'cluster_labels', and 'tokenized_texts' are defined

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Step 1: Aggregate tokens for each cluster
cluster_tokens = {}
for i, label in enumerate(cluster_labels):
    if label not in cluster_tokens:
        cluster_tokens[label] = []
    cluster_tokens[label].extend(tokenized_texts[i])

# Step 2: Create word clouds for each cluster
for label, tokens in cluster_tokens.items():
    cluster_words = " ".join(tokens)

    # Create word cloud using the cluster_words data
    if cluster_words:
        wordcloud = WordCloud(width=800, height=400, max_words=150, background_color='white').generate(cluster_words)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'Cluster {label} Word Cloud')
        plt.axis('off')
        plt.show()
    else:
        print(f"No documents with words in Cluster {label}")


## Generate elbow curve to determine numbe of clusters

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Define a range of cluster numbers to try
num_clusters_range = range(1, 11)  # You can adjust the range as needed

# Initialize a list to store the inertia values
inertia_values = []

# Fit KMeans for each value of k and calculate inertia
for k in num_clusters_range:
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(embeddings)
    inertia_values.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(num_clusters_range, inertia_values, marker='o')
plt.title('Elbow Curve for KMeans', fontsize=20)
plt.xlabel('Number of Clusters (k)', fontsize=20)
plt.ylabel('Inertia', fontsize=20)
plt.xticks(num_clusters_range)
plt.show()


## Reduce dimensionality and plot clusters

In [None]:
# get cluster labels for each row in the tfidf_matrix
df3['cluster_labels'] = kmeans.labels_

# print number of rows in each cluster
unique, counts = np.unique(cluster_labels, return_counts=True)
print(dict(zip(unique, counts)))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'embeddings' contains the BERT embeddings you've generated

# Assuming 'df3' is your DataFrame containing 'cluster_labels'
X = embeddings  # Features: BERT embeddings
y = df3['cluster_labels']  # Labels: Cluster labels

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on test set
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Classification report
print(classification_report(y_test, predictions))


## Generate word-cloud

In [None]:
# Assuming 'threshold_mask' and 'embeddings' variables are already defined
threshold_mask = (embeddings > min_threshold) & (embeddings < max_threshold)
max_num_tokens = threshold_mask.shape[1]

# Get the maximum number of tokens from the embeddings
max_num_tokens = threshold_mask.shape[1]

# Tokenize the text to get BERT tokens
tokenized_texts = df3['meaningful_lemm_str'].apply(lambda x: tokenizer.tokenize(x))

# Flatten the list of tokens
flat_tokens = [token for sublist in tokenized_texts for token in sublist]

# Get the unique tokens
unique_tokens = list(set(flat_tokens))

# Store words and their corresponding scores in a dictionary
word_scores = {}

# Iterate over the threshold mask and store words with their corresponding embeddings' scores
for i, doc_mask in enumerate(threshold_mask):
    for j, masked in enumerate(doc_mask[:max_num_tokens]):
        if masked:
            if j < len(embeddings[i]):
                token = unique_tokens[j]
                score = embeddings[i][j]
                word_scores[token] = score

# Sort the word_scores dictionary by values (scores) in descending order
sorted_word_scores = {k: v for k, v in sorted(word_scores.items(), key=lambda item: item[1], reverse=True)}

# Get the top 20 words and their scores
top_20_words = list(sorted_word_scores.items())[:20]

# Print the top 20 words and their scores
for word, score in top_20_words:
    print(f"{word}: {score:.4f}")



In [None]:
df4=df3.drop(['msg_clean','summary_tokenized','no_swds','lemmatized_check'],axis=1)
df4

In [None]:
from collections import defaultdict

# Initialize a dictionary to store word scores for each cluster
cluster_word_scores = defaultdict(dict)

# Assuming you have cluster_labels (list/array) containing cluster labels for each document
# Assuming tokens_for_documents is a list containing tokenized words for each document
for cluster_id in range(num_clusters):
    # Get documents belonging to the current cluster
    cluster_indices = [i for i, label in enumerate(cluster_labels) if label == cluster_id]

    # Collect word scores for the current cluster
    word_scores = {}
    for i in cluster_indices:
        doc_mask = threshold_mask[i]
        for j, masked in enumerate(doc_mask[:max_num_tokens]):
            if masked:
                if j < len(embeddings[i]):
                    token = unique_tokens[j]
                    score = embeddings[i][j]
                    if token in word_scores:
                        word_scores[token].append(score)
                    else:
                        word_scores[token] = [score]

    # Calculate the mean score for each word in the cluster
    for word, scores in word_scores.items():
        cluster_word_scores[cluster_id][word] = sum(scores) / len(scores)

# Get the top 20 words for each cluster based on scores
top_20_words_per_cluster = {}
for cluster_id, word_scores in cluster_word_scores.items():
    sorted_words = sorted(word_scores, key=word_scores.get, reverse=True)
    top_20_words_per_cluster[cluster_id] = sorted_words[:20]

# Print the top 20 words for each cluster
for cluster_id, top_words in top_20_words_per_cluster.items():
    print(f"Cluster {cluster_id + 1} Top 20 Words:")
    print(top_words)


In [None]:
# Define a function to map cluster labels to cluster tags
def map_cluster_labels_to_tags(cluster_label):
    cluster_tags = {
        0: "User Interface",
        1: "Functional Performance Bugs",
        2: "Workflow Bugs"
        # Add more mappings if you have additional clusters
    }
    return cluster_tags.get(cluster_label, "Other")

# Assuming 'cluster_labels' is a column in your DataFrame 'df4' containing the cluster labels
df4['cluster_tag'] = df4['cluster_labels'].apply(map_cluster_labels_to_tags)


In [None]:
print(embeddings.shape)  # This will print the shape of the array


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

if not isinstance(embeddings, np.ndarray):
    embeddings_dense = embeddings.toarray()
else:
    embeddings_dense = embeddings

# Create a dictionary to store BERT embeddings for each term
o_embeddings = {word: [] for word in feature_names}

# Loop through the BERT embeddings for each document
for i in range(len(embeddings)):
    if i < len(embeddings):  # Check if i is within the range of embeddings
        for j, word in enumerate(feature_names):
            if j < len(embeddings[i]):  # Check if j is within the range of embeddings[i]
                embedding_score = embeddings[i][j]  # Accessing element in embeddings array
                o_embeddings[word].append(embedding_score)

# Filter terms that have consistent embedding lengths across documents
consistent_terms = {word: embeddings for word, embeddings in o_embeddings.items() if isinstance(embeddings, list) and len(embeddings) > 0 and len(embeddings[0]) > 0}

# Get the maximum length of embeddings among consistent terms
max_len = max(len(embeddings) for embeddings in consistent_terms.values())

# Filter consistent terms based on max length
consistent_terms = {word: embeddings for word, embeddings in consistent_terms.items() if len(embeddings) == max_len}

# Convert the dictionary to a DataFrame
df_embeddings = pd.DataFrame(consistent_terms)

# Select the top 20 terms based on average embedding scores
avg_embeddings = df_embeddings.mean().sort_values(ascending=False)
top_20_terms = avg_embeddings.head(20).index

# Filter the DataFrame to include only the top 20 terms
df_embeddings_top_20 = df_embeddings[top_20_terms]

# Create box plot for the top 20 terms
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_embeddings_top_20, orient="v")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.xlabel("Top 20 Terms")
plt.ylabel("BERT Embedding Value")
plt.title("Box plot of BERT Embeddings for Top 20 Terms")
plt.show()



In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(true_labels, predictions)) # Value between 0 and 1

print("Accuracy Percentage {} %:".format(100*accuracy_score(true_labels, predictions)))