In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(str(text).lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words])






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Read the CSV file
data = pd.read_csv('priority_dataset.csv')  # Replace with your file path
#data = pd.read_csv('your_dataset.csv')  # Replace with your file path


In [4]:
xcol = "Short Description"
ycol = "Priority"
#xcol = "Message"
#ycol = "Category"


In [5]:
data.head()

Unnamed: 0,Priority,Short Description,Subdirectory,Report ID
0,P1,Assertions from inline spellchecker when closi...,Thunderbird,329672
1,P1,Mail composition : address autocompletion broken,Thunderbird,338658
2,P1,Thunderbird - Chinese/Japanese/Korean installe...,Thunderbird,351644
3,P1,Version/config bumps for Tb 2004,Thunderbird,382823
4,P1,Version/config bumps for Tb 2005,Thunderbird,388103


In [6]:
data.groupby("Priority").describe()
#data.groupby("Category").describe()

Unnamed: 0_level_0,Report ID,Report ID,Report ID,Report ID,Report ID,Report ID,Report ID,Report ID
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Priority,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
P1,3336.0,494416.564448,141066.603181,322045.0,389417.25,458852.5,575336.75,825526.0
P2,3717.0,476117.471079,117524.136994,322135.0,402345.0,445567.0,517124.0,824956.0
P3,1670.0,491640.060479,143325.281877,322270.0,389980.0,438663.0,597522.25,823830.0
P4,547.0,481864.683729,128345.307976,323229.0,392172.0,413903.0,594910.5,822579.0
P5,172.0,487772.575581,142658.17668,323958.0,379608.0,439232.0,592976.75,821938.0


In [7]:
# Preprocess descriptions
data[xcol] = data[xcol].apply(preprocess_text)
data.head()

Unnamed: 0,Priority,Short Description,Subdirectory,Report ID
0,P1,assertion inline spellchecker closing mail window,Thunderbird,329672
1,P1,mail composition address autocompletion broken,Thunderbird,338658
2,P1,thunderbird installer ui look ugly,Thunderbird,351644
3,P1,bump tb,Thunderbird,382823
4,P1,bump tb,Thunderbird,388103


In [8]:


# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data[xcol])
y = data[ycol]
print("Lengths before smote: ", X.shape[0], y.shape[0])
# Handle class imbalance with SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)
#X_res, y_res = X, y
print("Lengths after smote: ", X_res.shape[0], y_res.shape[0])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.15, random_state=42)


Lengths before smote:  9442 9442
Lengths after smote:  18585 18585


In [9]:
# Train a Random Forest Classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)


In [10]:
# Evaluate the model
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          P1       0.58      0.61      0.59       578
          P2       0.53      0.43      0.48       550
          P3       0.79      0.78      0.79       558
          P4       0.94      0.94      0.94       541
          P5       0.87      0.99      0.93       561

    accuracy                           0.75      2788
   macro avg       0.74      0.75      0.74      2788
weighted avg       0.74      0.75      0.74      2788

