In [1]:
!pip install nltk
import nltk
import string
# Fetch the NLTK English stop-word corpus (reported as already up to date when present).
nltk.download('stopwords')
from nltk.corpus import stopwords # stop-word lists
from nltk.tokenize import word_tokenize # tokenizer used by PreprocessingPineline.data_to_tokens
# NOTE(review): word_tokenize presumably also needs NLTK's 'punkt' tokenizer data,
# which this cell does not download -- confirm it is installed on the target machine.
# Stored as a set so the per-token membership test in preprocessing is a hash lookup.
stop_words = set(stopwords.words('english'))
# Tokens discarded during preprocessing: every ASCII punctuation character plus the
# two-character quote tokens.  The quote tokens are added as whole set members;
# concatenating them into one string before calling set() splits them into single
# characters, so a token such as '``' would never be matched.
punctuation = set(string.punctuation) | {'``', "''"}
import joblib
import numpy as np



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rpierson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Define a Preprocessing subpipeline
class PreprocessingPineline:
    """Tokenise text and drop stop words and punctuation tokens.

    Parameters
    ----------
    stop_words : collection of str
        Words to remove. Each token is lower-cased before it is looked up here.
    punctuation : collection of str
        Tokens to remove. Each token is looked up here verbatim.
    """

    def __init__(self, stop_words, punctuation):
        self.punctuation = punctuation
        self.stop_words = stop_words
        self.data = None    # last input handed to data_to_tokens
        self.tokens = None  # last filtered result of data_to_tokens

    def data_to_tokens(self, data):
        """Tokenise every entry of ``data`` and filter the tokens.

        Parameters
        ----------
        data : pandas.Series
            One document per entry; entries are converted to ``str`` first.

        Returns
        -------
        list of list of str
            For each document, the tokens that are neither stop words nor
            punctuation. The same list is stored on ``self.tokens``.
        """
        self.data = data
        tokenized = data.astype(str).apply(word_tokenize)
        # Filter with the sets given to this instance rather than whatever
        # notebook globals happen to share their names.
        self.tokens = [
            [
                word
                for word in words
                if word.lower() not in self.stop_words and word not in self.punctuation
            ]
            for words in tokenized
        ]
        return self.tokens

In [3]:
#Define a Topic Modeling subPipeline
class LDATopicModelPipeline:
    """Assign topics to documents using a previously saved LDA model.

    Parameters
    ----------
    lda_model_path : str
        Path of the joblib file holding the fitted LDA model.
    vectorizer_path : str
        Path of the joblib file holding the fitted vectorizer.
    """

    def __init__(self, lda_model_path, vectorizer_path):
        self.lda_model_path = lda_model_path
        self.vectorizer_path = vectorizer_path
        self.lda = None         # populated by load_model()
        self.vectorizer = None  # populated by load_model()

    def load_model(self):
        """Load the LDA model and the vectorizer from their joblib files.

        joblib files are pickles: only load artefacts from a trusted source.
        """
        self.lda = joblib.load(self.lda_model_path)
        self.vectorizer = joblib.load(self.vectorizer_path)

    def topic_distributions(self, new_documents):
        """Return the topic distribution of every document.

        Parameters
        ----------
        new_documents : list or tuple of str, pandas.Series, or pandas.DataFrame
            The documents. A DataFrame must have a ``"Combined_Text"`` column.

        Returns
        -------
        The result of ``lda.transform`` on the vectorised documents
        (one row per document).

        Raises
        ------
        TypeError
            If ``new_documents`` is none of the supported containers.
        """
        if self.lda is None or self.vectorizer is None:
            self.load_model()

        # Dispatch on the container itself. Indexing with "Combined_Text"
        # first would fail for plain lists and Series before any check runs.
        if isinstance(new_documents, pd.DataFrame):
            texts = new_documents["Combined_Text"].tolist()
        elif isinstance(new_documents, pd.Series):
            texts = new_documents.tolist()
        elif isinstance(new_documents, (list, tuple)):
            texts = list(new_documents)
        else:
            raise TypeError("Input data should be a list, Pandas Series, or DataFrame of strings.")

        features = self.vectorizer.transform(texts)
        return self.lda.transform(features)

    def get_vectorizer(self):
        """Return the loaded vectorizer, or ``None`` if load_model() has not run."""
        return self.vectorizer

    def append_topics(self, data, topic_distributions):
        """Store each document's dominant topic in ``data['Topic']``.

        ``data`` is modified in place and also returned. The dominant topic is
        the index of the largest entry in the document's distribution.
        """
        data['Topic'] = [topic_dist.argmax() for topic_dist in topic_distributions]
        return data
            
#Supply the saved LDA model and vectorizer file paths when constructing the pipeline.
#Usage: topic_distributions = topic_model.topic_distributions(new_documents)

In [20]:
#Define a Text Classification subpipeline
class TextClassificationNaiveBayes:
    """Predict a priority for each document with one saved classifier per topic.

    Parameters
    ----------
    vectorizer : object with ``transform``
        Turns a list of texts into the feature matrix given to the classifier.
        NOTE(review): it must produce the features the saved classifiers were
        trained on; the notebook's recorded run failed with a feature-count
        mismatch, so confirm which vectorizer the classifiers were fitted with.
    model_dir : str, optional
        Directory holding the ``topic_pred_<topic>.pkl`` classifier files.
    """

    def __init__(self, vectorizer, model_dir='/home/rpierson/githubPierson/bayes'):
        self.documents = None
        self.vectorizer = vectorizer
        self.model_dir = model_dir
        self.bayes_trained = None  # classifier for the topic currently processed
        self.num_topics = None     # distinct topic ids seen by the last priority() call

    def load_model(self, topic_num):
        """Load the classifier trained for ``topic_num`` into ``self.bayes_trained``.

        joblib files are pickles: only load artefacts from a trusted source.
        """
        self.bayes_trained = joblib.load(f'{self.model_dir}/topic_pred_{topic_num}.pkl')

    def priority(self, documents):
        """Fill ``documents['Predicted_Priority']`` and return ``documents``.

        Parameters
        ----------
        documents : pandas.DataFrame
            Needs the columns ``"Combined_Text"`` and ``"Topic"``. It is
            modified in place.

        Returns
        -------
        pandas.DataFrame
            The same frame, with a ``"Predicted_Priority"`` column.
        """
        self.num_topics = documents['Topic'].drop_duplicates().values
        # Object dtype so the column can hold whatever label type the
        # classifiers emit; rows without a prediction keep the empty string.
        documents["Predicted_Priority"] = pd.Series("", index=documents.index, dtype=object)
        self.documents = documents

        for topic_num in self.num_topics:
            # Visit only this topic's rows instead of scanning the whole frame per topic.
            row_labels = documents.index[documents['Topic'] == topic_num]
            self.load_model(topic_num)
            for label in row_labels:
                text = documents.at[label, "Combined_Text"]
                # One document at a time keeps the dense matrix to a single row.
                features = self.vectorizer.transform([text]).toarray().reshape(1, -1)
                documents.at[label, "Predicted_Priority"] = self.bayes_trained.predict(features)[0]

        return documents
        


In [6]:
#Precision, recall and F-measure per priority level, derived from a confusion matrix
class AccuracyAssessment:
    """Per-class precision, recall and F-measure from a confusion matrix.

    Priority levels are numbered ``1 .. priority_levels``; level ``k`` is stored
    at matrix index ``k - 1``. Rows are actual levels, columns predicted ones.

    Parameters
    ----------
    priority_levels : int
        Number of distinct priority levels.
    """

    def __init__(self, priority_levels):
        self.actual_priority = None
        self.predicted_priority = None
        self.true_pos = None
        self.false_pos = None
        self.false_neg = None
        self.num_priority_levels = priority_levels
        self.confusion_matrix = np.zeros((self.num_priority_levels, self.num_priority_levels), dtype=int)

    def _level_index(self, value):
        """Convert a priority label (``3``, ``"3"`` or ``"P3"``) to a matrix index."""
        try:
            level = int(value)
        except (TypeError, ValueError):
            try:
                level = int(str(value).strip().lstrip("Pp"))
            except ValueError:
                raise ValueError(f"Cannot interpret {value!r} as a priority level") from None
        # Reject out-of-range levels explicitly: level 0 would otherwise wrap
        # around to the last row/column through negative indexing.
        if not 1 <= level <= self.num_priority_levels:
            raise ValueError(
                f"Priority level {level} is outside 1..{self.num_priority_levels}"
            )
        return level - 1

    def update_vals(self, actual, predicted):
        """Add one batch of (actual, predicted) pairs to the confusion matrix.

        Values are paired by position, so pandas objects with any index work.
        Counts accumulate across calls.

        Raises
        ------
        ValueError
            If the inputs differ in length or hold an invalid priority level.
        """
        self.actual_priority = actual
        self.predicted_priority = predicted
        actual_values = np.asarray(actual).ravel()
        predicted_values = np.asarray(predicted).ravel()
        if len(actual_values) != len(predicted_values):
            raise ValueError("actual and predicted must have the same length")
        for actual_value, predicted_value in zip(actual_values, predicted_values):
            row = self._level_index(actual_value)
            col = self._level_index(predicted_value)
            self.confusion_matrix[row, col] += 1

    # Alias: the pipeline code refers to this method by the longer name.
    update_values = update_vals

    def calc_metrics(self, class_index):
        """Store TP / FP / FN for ``class_index`` on the instance."""
        # True positives of one class are its diagonal cell, not the count of
        # all correct predictions across every class.
        self.true_pos = int(self.confusion_matrix[class_index, class_index])
        self.false_pos = int(np.sum(self.confusion_matrix[:, class_index])) - self.true_pos
        self.false_neg = int(np.sum(self.confusion_matrix[class_index, :])) - self.true_pos

    def precision(self, class_index):
        """Return TP / (TP + FP) for the class, or 0 when it was never predicted."""
        self.calc_metrics(class_index)
        predicted_total = self.true_pos + self.false_pos
        if predicted_total == 0:
            return 0.0
        return self.true_pos / predicted_total

    def recall(self, class_index):
        """Return TP / (TP + FN) for the class, or 0 when it never occurred."""
        self.calc_metrics(class_index)
        actual_total = self.true_pos + self.false_neg
        if actual_total == 0:
            return 0.0
        return self.true_pos / actual_total

    def fmeasure(self, class_index):
        """Return the harmonic mean of precision and recall (0 when both are 0)."""
        precision = self.precision(class_index)
        recall = self.recall(class_index)
        if precision + recall == 0:
            return 0.0
        return (2 * precision * recall) / (precision + recall)

    def printAssessment(self):
        """Print precision, recall and F-measure for every priority level."""
        for i in range(self.num_priority_levels):
            precision_score = self.precision(i)
            recall_score = self.recall(i)
            f_measure_score = self.fmeasure(i)

            print(f"Priority P{i+1}: Precision={precision_score:.4f}, Recall={recall_score:.4f}, F-measure={f_measure_score:.4f}")

#make sure the rows are formatted appropriately for the following:
#assessment.update_vals(actual, predicted)

In [7]:
def priority_pipeline(data, actual,
                      lda_model_path='/home/rpierson/PiersonREU/extracted/lda.pkl',
                      vectorizer_path='/home/rpierson/PiersonREU/extracted/vec.pkl',
                      priority_levels=5):
    """Assign topics, predict priorities and print per-priority accuracy.

    Parameters
    ----------
    data : pandas.Series or list of str
        The documents' text.
    actual : sequence
        The true priority of each document, in the same order as ``data``.
    lda_model_path, vectorizer_path : str, optional
        joblib (.pkl) files of the trained LDA model and its vectorizer.
    priority_levels : int, optional
        Number of priority levels evaluated.

    Returns
    -------
    pandas.DataFrame
        Columns ``Combined_Text``, ``Topic`` and ``Predicted_Priority``.
    """
    # Tokenisation/stop-word removal is not applied here: the saved
    # vectorizer is fed the text exactly as passed in.

    # Topic modelling with the trained LDA model.
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    df = pd.DataFrame({'Combined_Text': data})
    topic_distributions = topic_model.topic_distributions(df)
    df = topic_model.append_topics(df, topic_distributions)
    print(df)
    vectorizer = topic_model.get_vectorizer()

    # Text classification per topic with the trained per-topic classifiers.
    nb = TextClassificationNaiveBayes(vectorizer)
    # priority() writes the predictions into df in place, so its return
    # value is not relied upon.
    nb.priority(df)

    # Accuracy and evaluation; plain lists pair the values by position.
    assessment = AccuracyAssessment(priority_levels = priority_levels)
    assessment.update_vals(list(actual), df["Predicted_Priority"].tolist())
    assessment.printAssessment()
    return df

In [8]:
def priority_pipeline_without_assessment(self, data):
    """Preprocess, assign topics and predict priorities without evaluation.

    Parameters
    ----------
    self : object or None
        Holder whose ``data`` attribute receives the result; pass ``None``
        to use this as a plain function.
    data : pandas.Series
        The documents' text.

    Returns
    -------
    pandas.DataFrame
        Columns ``Combined_Text`` (the preprocessed text), ``Topic`` and
        ``Predicted_Priority``.
    """
    # Preprocessing: tokenise, then drop stop words and punctuation.
    preprocess = PreprocessingPineline(stop_words, punctuation)
    tokens = preprocess.data_to_tokens(data)
    # The topic model and classifier work on a frame of strings, so the kept
    # tokens are joined back into one string per document.
    # NOTE(review): assumes the saved vectorizer expects plain strings -- confirm.
    df = pd.DataFrame(
        {'Combined_Text': [' '.join(words) for words in tokens]},
        index=getattr(data, 'index', None),
    )

    # Topic modelling with the trained LDA model (.pkl files).
    lda_model_path = '/home/rpierson/PiersonREU/extracted/lda.pkl'
    vectorizer_path = '/home/rpierson/PiersonREU/extracted/vec.pkl'
    topic_model = LDATopicModelPipeline(lda_model_path, vectorizer_path)
    topic_model.load_model()
    topic_distributions = topic_model.topic_distributions(df)
    df = topic_model.append_topics(df, topic_distributions)

    # Text classification per topic; the classifier needs the vectorizer.
    nb = TextClassificationNaiveBayes(topic_model.get_vectorizer())
    # priority() writes the predictions into df in place.
    nb.priority(df)

    if self is not None:
        self.data = df
    return df

In [9]:
import pandas as pd

# Labelled split used to inspect the pipeline's output.
to_see_results = pd.read_csv('/home/rpierson/PiersonREU/extracted/train_dataset_0.csv')

# One-column frames of the true priorities and of the text. to_frame() keeps
# the column name and row index without building a 1 x N frame and transposing it.
# `actual` and `Combined_Text` now hold the Series themselves, not one-element lists.
actual = to_see_results["Priority"]
actual_df = actual.to_frame()

Combined_Text = to_see_results["Combined_Text"]
df = Combined_Text.to_frame()
df

Unnamed: 0,Combined_Text
0,Usability issue with external editors (1GE6IRL...
1,CC Discussion: local versioning (1GAT3PL) Team...
2,Manage/unmanage support and policies (1GALAEG)...
3,API: ISharingManager::load mapping vcm project...
4,API - VCM event notification (1G8G6RR) Team Th...
...,...
40869,wrong size computation for Link widget SWT I20...
40870,Visibility Property for TableColumns SWT Prior...
40871,Invalid required space shown on feature instal...
40872,Widget disposed exception after updating incom...


In [21]:
# Run the full pipeline on the labelled split.
# NOTE: the recorded run printed the topic assignments and then stopped with a
# ValueError: X had 219686 features while the loaded GaussianNB expected 42405.
priority_pipeline(data = to_see_results['Combined_Text'], actual = to_see_results['Priority'])

                                           Combined_Text  Topic
0      Usability issue with external editors (1GE6IRL...      9
1      CC Discussion: local versioning (1GAT3PL) Team...      9
2      Manage/unmanage support and policies (1GALAEG)...      9
3      API: ISharingManager::load mapping vcm project...      9
4      API - VCM event notification (1G8G6RR) Team Th...      9
...                                                  ...    ...
40869  wrong size computation for Link widget SWT I20...      6
40870  Visibility Property for TableColumns SWT Prior...      9
40871  Invalid required space shown on feature instal...      9
40872  Widget disposed exception after updating incom...      4
40873  Widget is disposed in WorkbenchContextSupport....      9

[40874 rows x 2 columns]


ValueError: X has 219686 features, but GaussianNB is expecting 42405 features as input.