In [1]:
!export CUDA_VISIBLE_DEVICES=1,5

In [2]:
#Imports
import tensorflow as tf
import torch
from transformers import BertTokenizerFast
import pandas as pd 
import keras
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords #Word Stop
from nltk.tokenize import word_tokenize #Tokenization & Word Stop
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation + '``'+ '`'+ ''+ ',' + '/')
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

2024-07-19 21:39:52.273721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 21:39:52.304247: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 21:39:52.304281: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-19 21:39:52.323454: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
[nltk_data] D

In [3]:
if torch.cuda.is_available():
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")
torch.cuda.set_device(torch.device("cuda:0"))
device = torch.device('cpu')

CUDA is available. Number of GPUs: 8
CUDA device name: NVIDIA GeForce GTX 1080 Ti


In [4]:
#LDA Pipelines
class LDATopicModelPipeline:
    def __init__(self, lda_model_path, vectorizer_path):
        self.lda_model_path = lda_model_path
        self.vectorizer_path = vectorizer_path
        self.lda = None
        self.vectorizer = None
        
    def load_model(self):
        self.lda = joblib.load(self.lda_model_path)
        self.vectorizer = joblib.load(self.vectorizer_path)
        
    def topic_distributions(self, new_documents):
        if self.lda is None or self.vectorizer is None:
            self.load_model()
            
        if isinstance(new_documents["Combined_Text"], list):
            # If new_documents is a list of strings
            texts = new_documents
        elif isinstance(new_documents["Combined_Text"], pd.Series):
            # If new_documents is a Pandas Series (assuming it's a single column from a DataFrame)
            texts = new_documents["Combined_Text"].tolist()
            
        else:
            raise TypeError("Input data should be a list, Pandas Series, or DataFrame of strings.")
            
        x = self.vectorizer.transform(texts)
        topic_distributions = self.lda.transform(x)
        return topic_distributions
    
    def get_vectorizer(self):
        return self.vectorizer
    
    def append_topics(self, data, topic_distributions):
        topics = []
        for topic_dist in topic_distributions:
            dominant_topic = topic_dist.argmax()
            topics.append(dominant_topic)
            
        data['Topic'] = topics
        return data
            
#gives necessary files to process information
#use through this syntax: topic_distributions = topic_model.topic_distributions(new_documents)

In [5]:
#Training BERT model for text classification
class BertTextClassification:
    def __init__(self):
        self.documents = None
        self.tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
        self.model = None
        self.device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
        
    def tokenize_function(self, text):
        encoding = self.tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
        #print("Tokenized input IDs:", encoding['input_ids'])
        #print("Tokenized attention mask:", encoding['attention_mask'])
        return encoding

    def load_model(self, topic_num):
        #Use Trained Text Classifier Based on Topic Number
         for x in [topic_num]:
            self.model = joblib.load(f'/home//Files/BERTTrained/topic_pred_{x}.pkl')
            self.model.to('cpu')
            
    def priority(self, documents):
        self.documents = documents.copy()  # Ensure documents is a DataFrame
        self.num_topics = documents['Topic'].drop_duplicates().values

        for topic_num in self.num_topics:
            self.load_model(topic_num)
            print(f"Processing Topic Number: {topic_num}")  # Debug print
            for i, text in enumerate(documents['Combined_Text']):  # Ensure you're iterating through the correct column
                if self.documents.loc[i, 'Topic'] == topic_num:
                    #print(f"Processing Row: {i}")  # Debug print
                    encoding = self.tokenize_function(text)
                    inputs = {
                        'input_ids': encoding['input_ids'],
                        'attention_mask': encoding['attention_mask']
                    }

                    with torch.no_grad():
                        outputs = self.model(**inputs)
                        logits = outputs.logits
                        if torch.any(torch.isnan(logits)):
                            print("NaN detected in logits")
                        predicted_priority = logits.argmax(dim=1).cpu().numpy()[0]
                        self.documents.at[i, "Predicted_Priority"] = predicted_priority
        return self.documents
        

In [6]:
#Accuracy Assessment
class AccuracyAssessment:
    def __init__(self, priority_levels):
        self.actual_priority = None
        self.predicted_priority = None
        self.true_pos = 0
        self.false_pos = 0
        self.false_neg = 0
        self.num_priority_levels = priority_levels
        self.confusion_matrix = np.zeros((self.num_priority_levels, self.num_priority_levels), dtype=int)
    
    def update_vals(self, actual, predicted):
        self.actual_priority = actual.astype(int)
        self.predicted_priority = predicted.astype(int)
        
        # Update confusion matrix
        for i in range(len(self.actual_priority)):
            true_idx = self.actual_priority[i] - 1
            pred_idx = self.predicted_priority[i] - 1
            self.confusion_matrix[true_idx, pred_idx] += 1
    
    def calc_metrics(self, class_index):
        self.true_pos = self.confusion_matrix[class_index, class_index]
        self.false_pos = np.sum(self.confusion_matrix[:, class_index]) - self.true_pos
        self.false_neg = np.sum(self.confusion_matrix[class_index, :]) - self.true_pos
    
    def precision(self, class_index):
        self.calc_metrics(class_index)
        
        if self.true_pos + self.false_pos == 0:
            return 0
        precision = self.true_pos / (self.true_pos + self.false_pos)
        
        return precision
    
    def recall(self, class_index):
        self.calc_metrics(class_index)
        
        if self.true_pos + self.false_neg == 0:
            return 0
        
        recall = self.true_pos / (self.true_pos + self.false_neg)
        
        return recall
    
    def fmeasure(self, class_index):
        precision = self.precision(class_index)
        recall = self.recall(class_index)
        
        if precision + recall == 0:
            return 0
        
        fmeasure = (2 * precision * recall) / (precision + recall)
        
        return fmeasure
    
    def accuracyOverall(self):
        accuratePriority = 0
        for i in range(self.num_priority_levels):
            self.calc_metrics(i)
            accuratePriority += self.true_pos
        accuratePriorities = accuratePriority / len(self.actual_priority)
        return accuratePriorities
    
    def microAnalysis(self):
        precisions = []
        recalls = []
        f_measures = []
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
            
            precisions.append(precision_score)
            recalls.append(recall_score)
            f_measures.append(f_measure_score)
        
        micro_precision = sum(precisions) / sum(precisions + recalls)
        micro_recall = sum(precisions) / sum(precisions + f_measures)
        micro_fmeasure = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)
        print(f"Micro-Analysis for Priority Levels: Precision = {micro_precision:.4f}, Recall={micro_recall:.4f}, F-measure={micro_fmeasure:.4f}")
   
    def macroAnalysis(self):
        precisions = []
        recalls = []
        f_measures = []
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
            
            precisions.append(precision_score)
            recalls.append(recall_score)
            f_measures.append(f_measure_score)

        macro_precision = sum(precisions) / len(precisions)
        macro_recall = sum(recalls) / len(recalls)
        macro_fmeasure = sum(f_measures) / len(f_measures)
        print(f"Macro-Analysis for Priority Levels: Precision = {macro_precision:.4f}, Recall={macro_recall:.4f}, F-measure={macro_fmeasure:.4f}")
    def printAssessment(self):
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
    
            print(f"Priority P{i+1}: Precision={precision_score:.4f}, Recall={recall_score:.4f}, F-measure={f_measure_score:.4f}")

In [7]:
#Classes together in pipeline:
def priority_pipeline_without_assessment(data):
    #PreProcessing
    #preprocess = PreprocessingPineline(stop_words, punctuation)
    #data = preprocess.data_to_tokens(data)
    #Topic Modeling (Insert Trained Model Here, Save as a .pth)
    lda_model_path = '/home//extracted/lda.pkl'
    vectorizer_path = '/home//extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame(columns = ['Combined_Text', 'Topic'])
    df['Combined_Text'] = data
    df['Combined_Text'] = df['Combined_Text'].fillna(' ')
    topic_distributions = topic_model.topic_distributions(df)
    data = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()
    
    #Text Classification Per Topic (Insert Trained Model Here, Save as a .pth)
    textClass = BertTextClassification()
    data = textClass.priority(df)
    return data

In [8]:
import pandas as pd

to_see_results = pd.read_csv('/home///extracted/test_dataset_notpreprocessed.csv')
to_see_results.head()

to_see_results['Combined_Text'] = to_see_results['Title'] + " " + to_see_results['Component'] + " " + to_see_results['Description']

actual = [to_see_results["Priority"]]
actual_df = pd.DataFrame(actual)
actual_df = actual_df.transpose()
actual_df

Combined_Text = [to_see_results["Combined_Text"]]
df = pd.DataFrame(Combined_Text)
df = df.transpose()
df
label_map = {'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4, 'P5': 5}
to_see_results['Priority'] = to_see_results['Priority'].map(label_map)

In [9]:
new_df = priority_pipeline_without_assessment(data = to_see_results['Combined_Text'])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


                                           Combined_Text  Topic
0      Cant disable a feature Update  (deprecated - u...      9
1      build id wrong in the about dialog UI In win32...      9
2      [JFace] ConfigureColumnsDialog does not work c...      9
3      Widget is disposed in ControlExample SWT - run...      4
4      An internal error occurred during: Initializin...      4
...                                                  ...    ...
17027                                                         0
17028  [GTK/Linux] Blank Windows with GTK3 UI I start...      9
17029                                                         0
17030  Crash (MacOS) - getIvar SWT Process:         e...      3
17031                                                         0

[17032 rows x 2 columns]


  _torch_pytree._register_pytree_node(


Processing Topic Number: 9
Processing Topic Number: 4
Processing Topic Number: 0
Processing Topic Number: 8
Processing Topic Number: 7
Processing Topic Number: 5
Processing Topic Number: 6
Processing Topic Number: 3
Processing Topic Number: 2
Processing Topic Number: 1


In [10]:
label_map = {0.0: 1, 1.0: 2, 2.0: 3, 3.0: 4, 4.0: 5}
new_df['Predicted_Priority'] = new_df['Predicted_Priority'].map(label_map)

In [11]:
assessment = AccuracyAssessment(5)
assessment.update_vals(to_see_results['Priority'], new_df["Predicted_Priority"])

In [12]:
assessment.printAssessment()

Priority P1: Precision=0.0000, Recall=0.0000, F-measure=0.0000
Priority P2: Precision=0.0093, Recall=0.0033, F-measure=0.0048
Priority P3: Precision=0.9747, Recall=0.9926, F-measure=0.9835
Priority P4: Precision=0.8000, Recall=0.1509, F-measure=0.2540
Priority P5: Precision=0.0000, Recall=0.0000, F-measure=0.0000


In [13]:
assessment.macroAnalysis()

Macro-Analysis for Priority Levels: Precision = 0.3568, Recall=0.2294, F-measure=0.2485


In [14]:
assessment.microAnalysis()

Micro-Analysis for Priority Levels: Precision = 0.6087, Recall=0.5895, F-measure=0.5989


In [15]:
assessment.accuracyOverall()

0.9676491310474401

In [16]:
import os
extract_dir = '/home//Files'
file = os.path.join(extract_dir, 'BERTResults.csv')
new_df.to_csv(file, index = False)