In [1]:
!pip install nltk
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords #Word Stop
from nltk.tokenize import word_tokenize #Tokenization & Word Stop
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation + '``'+ '`'+ ''+ ',' + '/')
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rpierson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Define a Preprocessing subpipeline
class PreprocessingPineline:
    def __init__(self, stop_words, punctuation):
        self.punctuation = punctuation
        self.stop_words = stop_words
        self.data = None
        self.tokens = None        
        
    def data_to_tokens(self, data):
        self.data = data
        self.tokens = self.data.astype(str).apply(word_tokenize)
        self.tokens = [[word for word in tokens if word.lower() not in stop_words and word not in punctuation] for tokens in self.tokens]
        return self.tokens

In [3]:
#Define a Topic Modeling subPipeline
class LDATopicModelPipeline:
    def __init__(self, lda_model_path, vectorizer_path):
        self.lda_model_path = lda_model_path
        self.vectorizer_path = vectorizer_path
        self.lda = None
        self.vectorizer = None
        
    def load_model(self):
        self.lda = joblib.load(self.lda_model_path)
        self.vectorizer = joblib.load(self.vectorizer_path)
        
    def topic_distributions(self, new_documents):
        if self.lda is None or self.vectorizer is None:
            self.load_model()
            
        if isinstance(new_documents["Combined_Text"], list):
            # If new_documents is a list of strings
            texts = new_documents
        elif isinstance(new_documents["Combined_Text"], pd.Series):
            # If new_documents is a Pandas Series (assuming it's a single column from a DataFrame)
            texts = new_documents["Combined_Text"].tolist()
            
        else:
            raise TypeError("Input data should be a list, Pandas Series, or DataFrame of strings.")
            
        x = self.vectorizer.transform(texts)
        topic_distributions = self.lda.transform(x)
        return topic_distributions
    
    def get_vectorizer(self):
        return self.vectorizer
    
    def append_topics(self, data, topic_distributions):
        topics = []
        for topic_dist in topic_distributions:
            dominant_topic = topic_dist.argmax()
            topics.append(dominant_topic)
            
        data['Topic'] = topics
        return data
            
#gives necessary files to process information
#use through this syntax: topic_distributions = topic_model.topic_distributions(new_documents)

In [4]:
#Define an Text Classification subpipeline
class TextClassificationNaiveBayes:
    def __init__(self):
        self.documents = None
        self.vectorizer = None
        self.bayes_trained = None
        
        
    def load_model(self, topic_num):
        #Use Trained Text Classifier Based on Topic Number
         for x in [topic_num]:
            self.bayes_trained = joblib.load(f'/home/rpierson/githubPierson/bayes/topic_pred_{x}.pkl')
            self.vectorizer = joblib.load(f'/home/rpierson/githubPierson/bayes/vec_{x}.pkl')
    def priority(self, documents):
        self.num_topics = documents['Topic'].drop_duplicates().values
        documents["Predicted_Priority"] = "" 
        self.documents = documents
        
        for topic_num in self.num_topics:
            self.load_model(topic_num)
            for i in self.documents.index: 
                if self.documents.loc[i, 'Topic'] == topic_num:
                    text = self.documents.loc[i, "Combined_Text"]
                    vector = self.vectorizer.transform([text]).toarray().reshape(1, -1)
                    prediction = self.bayes_trained.predict(vector)
                    self.documents.at[i, "Predicted_Priority"] = prediction[0]
        return self.documents
        
    
        #topic_prediction_function.predict(topic_validation_data.toarray())
        


In [39]:
#Using the equations for accuracy information
class AccuracyAssessment:
    def __init__(self, priority_levels): 
        self.priority_levels = list(range(1, priority_levels + 1))
        self.actual = None
        self.predicted = None
        self.true_level = []
        self.pred_level = []
        
    def update_vals(self, actual, predicted):
        self.actual = actual
        self.predicted = predicted
        
    def tru_pred_levs(self, i):
        
        priority_level = self.priority_levels[i]
        self.true_level = [self.actual[idx] for idx in range(len(self.actual)) if self.actual[idx] == priority_level]
        self.pred_level = [self.predicted[idx] for idx in range(len(self.predicted)) if self.actual[idx] == priority_level]
    
    def accuracy(self, i):
        self.tru_pred_levs(i)
        accuracy = accuracy_score(self.true_level, self.pred_level)
        return accuracy
        
    def precision(self, i):
        self.tru_pred_levs(i)
        precision = precision_score(self.true_level, self.pred_level, zero_division = 0, average = None)
        return precision
    
    def recall(self, i):
        self.tru_pred_levs(i)
        recall = recall_score(self.true_level, self.pred_level, zero_division = 0, average = None)
        return recall
    
    def fmeasure(self, i):
        self.tru_pred_levs(i)
        fmeasure = f1_score(self.true_level, self.pred_level, zero_division = 0, average = None)
        return fmeasure
    
    def macroeval(self):
        macro_precision = 0.0
        macro_recall = 0.0
        macro_fmeasure = 0.0
        macro_accuracy = 0.0
        
        for i in range(self.num_priority_levels):
            self.tru_pred_levs(i)
            accuracy_score = accuracy_score(self.true_level, self.pred_level)
            precision_score = precision_score(self.true_level, self.pred_level, zero_division = 0)
            recall_score = recall_score(self.true_level, self.pred_level, zero_division = 0)
            f_measure_score = f1_score(self.true_level, self.pred_level, zero_division = 0)
            
            macro_accuracy += accuracy_score
            macro_precision += precision_score
            macro_recall += recall_score
            macro_fmeasure += f_measure_score
        
        num_levels = len(self.priority_levels)
        macro_accuracy /= num_levels
        macro_precision /= num_levels
        macro_recall /= num_levels
        macro_fmeasure /= num_levels
        
        return macro_accuracy, macro_precision, macro_recall, macro_fmeasure
    
    def printAssessment(self):
        for i in range(len(self.priority_levels)):
            if i == 0:
                continue
            accuracy_score = self.accuracy(i)
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)    
            
            for class_idx in range(len(precision_score)):  # Assuming all scores have the same length
                precision = precision_score[class_idx]
                recall = recall_score[class_idx]
                f_measure = f_measure_score[class_idx]
                print(f"  Priority {class_idx}: Accuracy={accuracy_score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F-measure={f_measure:.4f}")
            
        macro_accuracy, macro_precision, macro_recall, macro_fmeasure = self.macroeval()
        print(f"Priority Overall: Accuracy={macro_accuracy:.4f}, Precision={macro_precision:.4f}, Recall={macro_recall:.4f}, F-measure={macro_fmeasure:.4f}")

#make sure the rows are formatted appropriately for the following:
#assessment.update_vals(actual, predicted)

In [6]:
def priority_pipeline(data, actual):
    data = data
    #PreProcessing
   # preprocess = PreprocessingPineline(stop_words, punctuation)
   # data = preprocess.data_to_tokens(data)
    
    #Topic Modeling (Insert Trained Model Here, Save as a .pth)
    lda_model_path = '/home/rpierson/PiersonREU/extracted/lda.pkl'
    vectorizer_path = '/home/rpierson/PiersonREU/extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame(columns = ['Combined_Text', 'Topic'])
    df['Combined_Text'] = data
    topic_distributions = topic_model.topic_distributions(df)
    data = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()
    
    #Text Classification Per Topic (Insert Trained Model Here, Save as a .pth)
    nb = TextClassificationNaiveBayes()
    data = nb.priority(df)
    print(data["Predicted_Priority"])
    print(data.groupby('Predicted_Priority').sum())
    
    #Accuracy and Evaluation
    assessment = AccuracyAssessment(priority_levels = 5)
    assessment.update_vals(actual, data["Predicted_Priority"])
    assessment.printAssessment()

In [36]:
def priority_pipeline_without_assessment(data):
    data = data
    #PreProcessing
   # preprocess = PreprocessingPineline(stop_words, punctuation)
   # data = preprocess.data_to_tokens(data)
    
    #Topic Modeling (Insert Trained Model Here, Save as a .pth)
    lda_model_path = '/home/rpierson/PiersonREU/extracted/lda.pkl'
    vectorizer_path = '/home/rpierson/PiersonREU/extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame(columns = ['Combined_Text', 'Topic'])
    df['Combined_Text'] = data
    topic_distributions = topic_model.topic_distributions(df)
    data = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()
    
    #Text Classification Per Topic (Insert Trained Model Here, Save as a .pth)
    nb = TextClassificationNaiveBayes()
    data = nb.priority(df)
    return data

In [8]:
import pandas as pd

to_see_results = pd.read_csv('/home/rpierson/PiersonREU/extracted/train_dataset_0.csv')
to_see_results.head()

actual = [to_see_results["Priority"]]
actual_df = pd.DataFrame(actual)
actual_df = actual_df.transpose()
actual_df

Combined_Text = [to_see_results["Combined_Text"]]
df = pd.DataFrame(Combined_Text)
df = df.transpose()
df
label_map = {'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4, 'P5': 5}
to_see_results['Priority'] = to_see_results['Priority'].map(label_map)

In [33]:
priority_pipeline(data = to_see_results['Combined_Text'], actual = to_see_results['Priority'])

                                           Combined_Text  Topic
0      Usability issue with external editors (1GE6IRL...      9
1      CC Discussion: local versioning (1GAT3PL) Team...      9
2      Manage/unmanage support and policies (1GALAEG)...      9
3      API: ISharingManager::load mapping vcm project...      9
4      API - VCM event notification (1G8G6RR) Team Th...      9
...                                                  ...    ...
40869  wrong size computation for Link widget SWT I20...      6
40870  Visibility Property for TableColumns SWT Prior...      9
40871  Invalid required space shown on feature instal...      9
40872  Widget disposed exception after updating incom...      4
40873  Widget is disposed in WorkbenchContextSupport....      9

[40874 rows x 2 columns]
0        3
1        3
2        3
3        3
4        3
        ..
40869    3
40870    3
40871    3
40872    3
40873    3
Name: Predicted_Priority, Length: 40874, dtype: object
                              

NameError: name 'precision_score_val' is not defined

In [30]:
level = list(range(1, 6))
level

[1, 2, 3, 4, 5]

In [38]:
new_df = priority_pipeline_without_assessment(data = to_see_results['Combined_Text'])

                                           Combined_Text  Topic
0      Usability issue with external editors (1GE6IRL...      9
1      CC Discussion: local versioning (1GAT3PL) Team...      9
2      Manage/unmanage support and policies (1GALAEG)...      9
3      API: ISharingManager::load mapping vcm project...      9
4      API - VCM event notification (1G8G6RR) Team Th...      9
...                                                  ...    ...
40869  wrong size computation for Link widget SWT I20...      6
40870  Visibility Property for TableColumns SWT Prior...      9
40871  Invalid required space shown on feature instal...      9
40872  Widget disposed exception after updating incom...      4
40873  Widget is disposed in WorkbenchContextSupport....      9

[40874 rows x 2 columns]


In [41]:
assessment = AccuracyAssessment(priority_levels = 5)
assessment.update_vals(to_see_results['Priority'], new_df["Predicted_Priority"])
assessment.printAssessment()

  Priority 0: Accuracy=0.8588, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 1: Accuracy=0.8588, Precision=1.0000, Recall=0.8588, F-measure=0.9241
  Priority 2: Accuracy=0.8588, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 3: Accuracy=0.8588, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 0: Accuracy=0.8132, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 1: Accuracy=0.8132, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 2: Accuracy=0.8132, Precision=1.0000, Recall=0.8132, F-measure=0.8970
  Priority 3: Accuracy=0.8132, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 4: Accuracy=0.8132, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 0: Accuracy=0.8977, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 1: Accuracy=0.8977, Precision=1.0000, Recall=0.8977, F-measure=0.9461
  Priority 2: Accuracy=0.8977, Precision=0.0000, Recall=0.0000, F-measure=0.0000
  Priority 0: Accuracy=0.998

AttributeError: 'AccuracyAssessment' object has no attribute 'num_priority_levels'