In [1]:
!pip install nltk
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords #Word Stop
from nltk.tokenize import word_tokenize #Tokenization & Word Stop
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation + '``'+ '`'+ ''+ ',' + '/')
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



[nltk_data] Downloading package stopwords to
[nltk_data]     /home//nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Did not end up using preprocessing

In [2]:
#Define a Preprocessing subpipeline
class PreprocessingPineline:
    def __init__(self, stop_words, punctuation):
        self.punctuation = punctuation
        self.stop_words = stop_words
        self.data = None
        self.tokens = None        
        
    def data_to_tokens(self, data):
        self.data = data
        self.tokens = self.data.astype(str).apply(word_tokenize)
        self.tokens = [[word for word in tokens if word.lower() not in stop_words and word not in punctuation] for tokens in self.tokens]
        return self.tokens

In [None]:
#Pretrained usage of LDA

In [3]:
#Define a Topic Modeling subPipeline
class LDATopicModelPipeline:
    def __init__(self, lda_model_path, vectorizer_path):
        self.lda_model_path = lda_model_path
        self.vectorizer_path = vectorizer_path
        self.lda = None
        self.vectorizer = None
        
    def load_model(self):
        self.lda = joblib.load(self.lda_model_path)
        self.vectorizer = joblib.load(self.vectorizer_path)
        
    def topic_distributions(self, new_documents):
        if self.lda is None or self.vectorizer is None:
            self.load_model()
            
        if isinstance(new_documents["Combined_Text"], list):
            # If new_documents is a list of strings
            texts = new_documents
        elif isinstance(new_documents["Combined_Text"], pd.Series):
            # If new_documents is a Pandas Series (assuming it's a single column from a DataFrame)
            texts = new_documents["Combined_Text"].tolist()
            
        else:
            raise TypeError("Input data should be a list, Pandas Series, or DataFrame of strings.")
            
        x = self.vectorizer.transform(texts)
        topic_distributions = self.lda.transform(x)
        return topic_distributions
    
    def get_vectorizer(self):
        return self.vectorizer
    
    def append_topics(self, data, topic_distributions):
        topics = []
        for topic_dist in topic_distributions:
            dominant_topic = topic_dist.argmax()
            topics.append(dominant_topic)
            
        data['Topic'] = topics
        return data
            
#gives necessary files to process information
#use through this syntax: topic_distributions = topic_model.topic_distributions(new_documents)

In [None]:
#Pretrained Text Classifiers used

In [16]:
#Define an Text Classification subpipeline
class TextClassificationNaiveBayes:
    def __init__(self):
        self.documents = None
        self.vectorizer = None
        self.bayes_trained = None
        
        
    def load_model(self, topic_num):
        #Use Trained Text Classifier Based on Topic Number
         for x in [topic_num]:
            self.bayes_trained = joblib.load(f'/home/TrainingNaiveBayesClassifiers/bayes/topic_pred_{x}.pkl')
            self.vectorizer = joblib.load(f'/home/TrainingNaiveBayesClassifiers/bayes/vec_{x}.pkl')
    def priority(self, documents):
        self.num_topics = documents['Topic'].drop_duplicates().values
        documents["Predicted_Priority"] = "" 
        self.documents = documents
        
        for topic_num in self.num_topics:
            self.load_model(topic_num)
            for i in self.documents.index: 
                if self.documents.loc[i, 'Topic'] == topic_num:
                    text = self.documents.loc[i, "Combined_Text"]
                    vector = self.vectorizer.transform([text]).toarray().reshape(1, -1)
                    prediction = self.bayes_trained.predict(vector)
                    self.documents.at[i, "Predicted_Priority"] = prediction[0]
        return self.documents
        
    
        #topic_prediction_function.predict(topic_validation_data.toarray())
        


In [None]:
#Accuracy assessments for our pipeline

In [36]:
class AccuracyAssessment:
    def __init__(self, priority_levels):
        self.actual_priority = None
        self.predicted_priority = None
        self.true_pos = 0
        self.false_pos = 0
        self.false_neg = 0
        self.num_priority_levels = priority_levels
        self.confusion_matrix = np.zeros((self.num_priority_levels, self.num_priority_levels), dtype=int)
    
    def update_vals(self, actual, predicted):
        self.actual_priority = actual.astype(int)
        self.predicted_priority = predicted.astype(int)
        
        # Update confusion matrix
        for i in range(len(self.actual_priority)):
            true_idx = self.actual_priority[i] - 1
            pred_idx = self.predicted_priority[i] - 1
            self.confusion_matrix[true_idx, pred_idx] += 1
    
    def calc_metrics(self, class_index):
        self.true_pos = self.confusion_matrix[class_index, class_index]
        self.false_pos = np.sum(self.confusion_matrix[:, class_index]) - self.true_pos
        self.false_neg = np.sum(self.confusion_matrix[class_index, :]) - self.true_pos
    
    def precision(self, class_index):
        self.calc_metrics(class_index)
        
        if self.true_pos + self.false_pos == 0:
            return 0
        precision = self.true_pos / (self.true_pos + self.false_pos)
        
        return precision
    
    def recall(self, class_index):
        self.calc_metrics(class_index)
        
        if self.true_pos + self.false_neg == 0:
            return 0
        
        recall = self.true_pos / (self.true_pos + self.false_neg)
        
        return recall
    
    def fmeasure(self, class_index):
        precision = self.precision(class_index)
        recall = self.recall(class_index)
        
        if precision + recall == 0:
            return 0
        
        fmeasure = (2 * precision * recall) / (precision + recall)
        
        return fmeasure
    
    def accuracyOverall(self):
        accuratePriority = 0
        for i in range(self.num_priority_levels):
            self.calc_metrics(i)
            accuratePriority += self.true_pos
        accuratePriorities = accuratePriority / len(self.actual_priority)
        return accuratePriorities
    
    def microAnalysis(self):
        precisions = []
        recalls = []
        f_measures = []
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
            
            precisions.append(precision_score)
            recalls.append(recall_score)
            f_measures.append(f_measure_score)
        
        micro_precision = sum(precisions) / sum(precisions + recalls)
        micro_recall = sum(precisions) / sum(precisions + f_measures)
        micro_fmeasure = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)
        print(f"Micro-Analysis for Priority Levels: Precision = {micro_precision:.4f}, Recall={micro_recall:.4f}, F-measure={micro_fmeasure:.4f}")
   
    def macroAnalysis(self):
        precisions = []
        recalls = []
        f_measures = []
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
            
            precisions.append(precision_score)
            recalls.append(recall_score)
            f_measures.append(f_measure_score)

        macro_precision = sum(precisions) / len(precisions)
        macro_recall = sum(recalls) / len(recalls)
        macro_fmeasure = sum(f_measures) / len(f_measures)
        print(f"Macro-Analysis for Priority Levels: Precision = {macro_precision:.4f}, Recall={macro_recall:.4f}, F-measure={macro_fmeasure:.4f}")
    def printAssessment(self):
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)
    
            print(f"Priority P{i+1}: Precision={precision_score:.4f}, Recall={recall_score:.4f}, F-measure={f_measure_score:.4f}")

In [None]:
#I prefer without assessment, this priority_pipeline may need adjustments

In [18]:
def priority_pipeline(data, actual):
    data = data
    #PreProcessing
   # preprocess = PreprocessingPineline(stop_words, punctuation)
   # data = preprocess.data_to_tokens(data)
    
    #Topic Modeling (Insert Trained Model Here, Save as a .pth)
    lda_model_path = '/home/extracted/lda.pkl'
    vectorizer_path = '/home/extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame(columns = ['Combined_Text', 'Topic'])
    df['Combined_Text'] = data
    topic_distributions = topic_model.topic_distributions(df)
    data = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()
    
    #Text Classification Per Topic (Insert Trained Model Here, Save as a .pth)
    nb = TextClassificationNaiveBayes()
    data = nb.priority(df)
    print(data["Predicted_Priority"])
    print(data.groupby('Predicted_Priority').sum())
    
    #Accuracy and Evaluation
    assessment = AccuracyAssessment(priority_levels = 5)
    assessment.update_vals(actual, data["Predicted_Priority"])
    assessment.printAssessment()

In [None]:
#Used to test our pipeline

In [19]:
def priority_pipeline_without_assessment(data):
    data = data
    #PreProcessing
    #preprocess = PreprocessingPineline(stop_words, punctuation)
    #data = preprocess.data_to_tokens(data)
    #Topic Modeling (Insert Trained Model Here, Save as a .pth)
    lda_model_path = '/home/extracted/lda.pkl'
    vectorizer_path = '/home/extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame(columns = ['Combined_Text', 'Topic'])
    df['Combined_Text'] = data
    df['Combined_Text'] = df['Combined_Text'].fillna(' ')
    topic_distributions = topic_model.topic_distributions(df)
    data = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()
    
    #Text Classification Per Topic (Insert Trained Model Here, Save as a .pth)
    nb = TextClassificationNaiveBayes()
    data = nb.priority(df)
    return data

In [20]:
import pandas as pd

to_see_results = pd.read_csv('/home/extracted/test_dataset_notpreprocessed.csv')
to_see_results.head()

to_see_results['Combined_Text'] = to_see_results['Title'] + " " + to_see_results['Component'] + " " + to_see_results['Description']

actual = [to_see_results["Priority"]]
actual_df = pd.DataFrame(actual)
actual_df = actual_df.transpose()
actual_df

Combined_Text = [to_see_results["Combined_Text"]]
df = pd.DataFrame(Combined_Text)
df = df.transpose()
df
label_map = {'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4, 'P5': 5}
to_see_results['Priority'] = to_see_results['Priority'].map(label_map)

In [21]:
level = list(range(1, 6))
level

[1, 2, 3, 4, 5]

In [22]:
new_df = priority_pipeline_without_assessment(data = to_see_results['Combined_Text'])

                                           Combined_Text  Topic
0      Cant disable a feature Update  (deprecated - u...      9
1      build id wrong in the about dialog UI In win32...      9
2      [JFace] ConfigureColumnsDialog does not work c...      9
3      Widget is disposed in ControlExample SWT - run...      4
4      An internal error occurred during: Initializin...      4
...                                                  ...    ...
17027                                                         0
17028  [GTK/Linux] Blank Windows with GTK3 UI I start...      9
17029                                                         0
17030  Crash (MacOS) - getIvar SWT Process:         e...      3
17031                                                         0

[17032 rows x 2 columns]


In [37]:
assessment = AccuracyAssessment(5)
assessment.update_vals(to_see_results['Priority'], new_df["Predicted_Priority"])

In [38]:
assessment.printAssessment()

Priority P1: Precision=0.0035, Recall=0.0755, F-measure=0.0067
Priority P2: Precision=0.0234, Recall=0.1013, F-measure=0.0380
Priority P3: Precision=0.9752, Recall=0.7691, F-measure=0.8600
Priority P4: Precision=0.0051, Recall=0.0755, F-measure=0.0095
Priority P5: Precision=0.0029, Recall=0.0800, F-measure=0.0056


In [39]:
actual = pd.DataFrame(to_see_results['Priority'])
predicted = pd.DataFrame(new_df["Predicted_Priority"])

In [40]:
assessment.macroAnalysis()
#It treats each class equally, so classes with fewer instances have the same weight as those with more instances, which may not reflect the practical importance or impact of each class.

Macro-Analysis for Priority Levels: Precision = 0.2020, Recall=0.2203, F-measure=0.1839


In [41]:
assessment.microAnalysis()
#different aggregation strategies: Micro-averaging gives equal weight to each instance
# Macro-averaging gives equal weight to each class
#It can be skewed by classes with more instances, giving them more influence over the final metric.

Micro-Analysis for Priority Levels: Precision = 0.4784, Recall=0.5234, F-measure=0.4999


In [42]:
#Use micro-averaging if you want to emphasize overall performance across all instances equally.
#Use macro-averaging if you want to evaluate and compare performance across different classes equally.

In [43]:
assessment.accuracyOverall()

0.7517613903240958