In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(str(text).lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words])






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Read the CSV file
#data = pd.read_csv('priority_dataset.csv')  # Replace with your file path
data = pd.read_csv('your_dataset.csv')  # Replace with your file path


In [None]:
#xcol = "Short Description"
#ycol = "Priority"
xcol = "Message"
ycol = "Category"


In [None]:
data.head()

Unnamed: 0,Bug-ID,Project,Classification,Summary,Link,Enviroment,Unnamed: 6
0,XALANC-11,XalanC,Add issue,Documentation comment for getVariable/getParam...,https://issues.apache.org/jira/browse/XALANC-1...,APACHE,123.0
1,SVN-491,Subversion,Add issue,svn status' needs better error reporting,https://issues.apache.org/jira/browse/SVN-491?...,APACHE,
2,FOP-57,FOP,Add issue,Multolingual support in converting xml - pdf,https://issues.apache.org/jira/browse/FOP-57?j...,APACHE,
3,VELOCITY-31,Velocity,Add issue,Velocity should provide an interface for expli...,https://issues.apache.org/jira/browse/VELOCITY...,APACHE,
4,SVN-2196,Subversion,Add issue,Translate svn-ref.tex to French,https://issues.apache.org/jira/browse/SVN-2196...,APACHE,


In [None]:
df = data
df.rename(columns = {'Classification':'Category', 'Summary':'Message'}, inplace = True)
df.head()
df['Category'] = df['Category'].str.strip()

print(df['Category'].unique())
# droping some columns
df = df[df['Category'] != 'info release issue']
# Define the list of values to check
values_to_replace = ["Network issue", "Permission/Deprecation issue", "Security issue"]

# Replace values in 'Category' column
df.loc[df['Category'].isin(values_to_replace), 'Category'] = "Network/Security Issue"
data = df

['Add issue' 'Configuration issue' 'Database-related issue'
 'Functional issue' 'GUI-related issue' 'info release issue'
 'Network issue' 'Performance issue' 'Permission/Deprecation issue'
 'Security issue' 'Test Code-related issue']


In [None]:
#data.groupby("Priority").describe()
data.groupby("Category").describe()

Unnamed: 0_level_0,Unnamed: 6,Unnamed: 6,Unnamed: 6,Unnamed: 6,Unnamed: 6,Unnamed: 6,Unnamed: 6,Unnamed: 6
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Add issue,1.0,123.0,,123.0,123.0,123.0,123.0,123.0
Configuration issue,1.0,188.0,,188.0,188.0,188.0,188.0,188.0
Database-related issue,1.0,35.0,,35.0,35.0,35.0,35.0,35.0
Functional issue,1.0,471.0,,471.0,471.0,471.0,471.0,471.0
GUI-related issue,1.0,197.0,,197.0,197.0,197.0,197.0,197.0
Network/Security Issue,3.0,42.333333,2.886751,39.0,41.5,44.0,44.0,44.0
Performance issue,1.0,42.0,,42.0,42.0,42.0,42.0,42.0
Test Code-related issue,1.0,79.0,,79.0,79.0,79.0,79.0,79.0


In [None]:
# Preprocess descriptions
data[xcol] = data[xcol].apply(preprocess_text)
data.head()

Unnamed: 0,Bug-ID,Project,Category,Message,Link,Enviroment,Unnamed: 6
0,XALANC-11,XalanC,Add issue,documentation comment incorrect,https://issues.apache.org/jira/browse/XALANC-1...,APACHE,123.0
1,SVN-491,Subversion,Add issue,svn status need better error reporting,https://issues.apache.org/jira/browse/SVN-491?...,APACHE,
2,FOP-57,FOP,Add issue,multolingual support converting xml pdf,https://issues.apache.org/jira/browse/FOP-57?j...,APACHE,
3,VELOCITY-31,Velocity,Add issue,velocity provide interface explicit property r...,https://issues.apache.org/jira/browse/VELOCITY...,APACHE,
4,SVN-2196,Subversion,Add issue,translate french,https://issues.apache.org/jira/browse/SVN-2196...,APACHE,


In [None]:


# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data[xcol])
y = data[ycol]
print("Lengths before smote: ", X.shape[0], y.shape[0])
# Handle class imbalance with SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)
#X_res, y_res = X, y
print("Lengths after smote: ", X_res.shape[0], y_res.shape[0])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.15, random_state=42)


Lengths before smote:  1262 1262
Lengths after smote:  3768 3768


In [None]:
# Train a Random Forest Classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)


In [None]:
# Evaluate the model
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                         precision    recall  f1-score   support

              Add issue       0.90      0.90      0.90        70
    Configuration issue       0.88      0.70      0.78        84
 Database-related issue       1.00      1.00      1.00        69
       Functional issue       0.61      0.77      0.68        70
      GUI-related issue       0.87      0.94      0.90        50
 Network/Security Issue       0.97      0.91      0.94        86
      Performance issue       0.98      0.97      0.98        67
Test Code-related issue       0.97      1.00      0.99        70

               accuracy                           0.89       566
              macro avg       0.90      0.90      0.90       566
           weighted avg       0.90      0.89      0.89       566

