In [1]:
%%capture
!pip install pandas
!pip install scipy
!pip install matplotlib
!pip install seaborn
!pip install tensorflow
import zipfile
import os
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## imports for preprocessing
!pip install -U scikit-learn
!pip install nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('all')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#imports for sentiment analysis
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

#imports for model prediction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report
import warnings
from pandas.errors import SettingWithCopyWarning
import joblib

In [2]:
##Accessing a CSV
uploaded_zip = '/home/spope/EclipsePlatform/eclipse_platform.zip'
extract_dir = '/home/spope/EclipsePlatform/extracted'
if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)
with zipfile.ZipFile(uploaded_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

csv_file = os.path.join(extract_dir, 'eclipse_platform.csv')
fulldata = pd.read_csv(csv_file)

In [3]:
# Split the data into training and test sets
def SplitData(dataset):
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        trainset, testset = dataset[:train_size], dataset[train_size:]
        return trainset, testset

In [4]:
class TimeSet:
    def CreateDuration(dataset):
        dataset['Created_time'] = pd.to_datetime(dataset['Created_time'], utc=True)
        dataset['Resolved_time'] = pd.to_datetime(dataset['Resolved_time'], utc=True)

        # Convert to the desired format
        dataset['Created_time2'] = dataset['Created_time'].dt.strftime('%m/%d/%Y %H:%M')
        dataset['Resolved_time2'] = dataset['Resolved_time'].dt.strftime('%m/%d/%Y %H:%M')

        # Parse the formatted datetime strings back to datetime objects to ensure they are in the correct format
        dataset['Created_time2'] = pd.to_datetime(dataset['Created_time2'], format='%m/%d/%Y %H:%M')
        dataset['Resolved_time2'] = pd.to_datetime(dataset['Resolved_time2'], format='%m/%d/%Y %H:%M')

        # Calculate the duration in hours
        dataset['Duration'] = dataset['Resolved_time2'] - dataset['Created_time2']
        dataset['Duration_hours'] = dataset['Duration'].dt.total_seconds() / 3600

        # Drop the intermediate columns
        dataset.drop(['Created_time2', 'Resolved_time2', 'Duration'], axis=1, inplace=True)
    def CreateTimeLabel(dataset):
        mean_duration_hours = dataset['Duration_hours'].mean()

        # Apply labels based on the mean duration
        dataset.loc[:, 'TimeLabel'] = dataset['Duration_hours'].apply(lambda x: 'long' if x >= mean_duration_hours else 'short')
        return dataset

In [5]:
class Preprocessing:
    def RemoveStopWords(dataset):
        #making all the items in the descriptions columns lower
        dataset['Description'] = dataset['Description'].str.lower()
        # Define stop words
        stop_words = set(stopwords.words('english'))
         # Remove stop words
        for index, row in dataset.iterrows():
            # Check if the 'Description' is not NaN
            if isinstance(row['Description'], str):
                words = row['Description'].split()
                final_tokens = [word for word in words if word not in stop_words]
                dataset.at[index, 'Description'] = ' '.join(final_tokens)
    
        return dataset
    def Lemmitization(dataset):
        # Initialize the WordNet Lemmatizer
        lemmatizer = WordNetLemmatizer()
        # Function to lemmatize a sentence
        def lemmatize_sentence(sentence):
            words = nltk.word_tokenize(sentence)
            lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]  # Lemmatize verbs (default)
            return ' '.join(lemmatized_words)
        dataset['Description'] = dataset['Description'].astype(str)  # Convert all to strings

        # Apply lemmatization to each row in the 'Description' column
        dataset['Description'] = dataset['Description'].apply(lemmatize_sentence)


In [6]:
class SentimentAnalysis:
    def get_wordnet_pos(treebank_tag):
          if treebank_tag.startswith('J'):
            return wn.ADJ
          elif treebank_tag.startswith('V'):
            return wn.VERB
          elif treebank_tag.startswith('N'):
            return wn.NOUN
          elif treebank_tag.startswith('R'):
            return wn.ADV
          else:
            return None
    # Function to calculate sentiment scores
    def CalculateSentimentScores(description):
        tokens = word_tokenize(description)
        tagged_tokens = pos_tag(tokens)
        
        pos_score = 0
        neg_score = 0
        token_count = 0
    
        for word, tag in tagged_tokens:
            wn_tag = SentimentAnalysis.get_wordnet_pos(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
        
            synsets = list(swn.senti_synsets(word, wn_tag))
            if not synsets:
                continue
        
            # Use the first synset for simplicity
            synset = synsets[0]
            pos_score += synset.pos_score()
            neg_score += synset.neg_score()
            token_count += 1
    
        # Normalize scores by the number of tokens
        if token_count > 0:
            pos_score /= token_count
            neg_score /= token_count
            return pos_score, neg_score
    # Apply sentiment score calculation to each description
    def CreatePosNegColumns(dataset): 
        dataset[['Pos_Score', 'Neg_Score']] = dataset['Description'].apply(lambda x: pd.Series(SentimentAnalysis.CalculateSentimentScores(x)))
    #Creating a column for Emotion (Positive or Negative)
    def EmotionColumn(dataset):
        #Creating a column for binary emotion (positive or negative)
        dataset['Emotion'] = dataset['Pos_Score'] - dataset['Neg_Score']
    
        # Assign labels based on the difference
        dataset['Emotion'] = dataset['Emotion'].apply(lambda x: 'positive' if x > 0 else 'negative')
    def EmotionalityColumn(dataset):
        dataset['Emotionality'] = dataset['Pos_Score']+ dataset['Neg_Score']

In [12]:
def Destiny(df):
    # Create a new column 'Destiny' with default value 'Not Fixed'
    df['Destiny'] = 'Not Fixed'
    
    # Update 'Destiny' based on conditions in 'Resolution' column
    df.loc[df['Resolution'] == 'FIXED', 'Destiny'] = 'Fixed'
    df = df[~df['Resolution'].isin(['DUPLICATE', 'NDUPLICATE'])]
    
    return df

In [8]:
from sklearn.metrics import recall_score

def ModelTest(df, input_column, target_column, vectorizer_path, model_path):
    if isinstance(input_column, str):
        input_column = [input_column]
    # Concatenate input columns to form the feature set
    X = df[input_column].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    y = df[target_column]  # Target variable
    
    # Load the pre-trained model and vectorizer
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    
    # Vectorize the text data
    X_vec = vectorizer.transform(X)
    
    # Make predictions
    predictions = model.predict(X_vec)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y, predictions)
    f1 = f1_score(y, predictions, average='weighted')
    precision = precision_score(y, predictions, average='weighted')
    recall = recall_score(y, predictions, average='weighted')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall:.2f}")
    report = classification_report(y, predictions)
    print(report)


In [9]:
warnings.filterwarnings('ignore', category=SettingWithCopyWarning)

def pipelineTimeLabel(dataset):
    trainset, testset = SplitData(dataset)
    TimeSet.CreateDuration(testset)
    TimeSet.CreateTimeLabel(testset)
    Preprocessing.RemoveStopWords(testset)
    Preprocessing.Lemmitization(testset)
    SentimentAnalysis.CreatePosNegColumns(testset)
    SentimentAnalysis.EmotionColumn(testset)
    SentimentAnalysis.EmotionalityColumn(testset)
    Destiny(testset)
    return testset

In [10]:
MLPtestset = pipelineTimeLabel(fulldata)
Destiny(MLPtestset)

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,TimeLabel,Pos_Score,Neg_Score,Emotion,Emotionality,Destiny
68124,229802,P3,Update (deprecated - use RT>Equinox>p2),,Cant disable a feature,build id : 3.3.2 ; ; step reproduce : ; im try...,RESOLVED,WONTFIX,3.3.2,2008-05-01 15:57:00+00:00,2012-07-24 14:22:18+00:00,37078.416667,long,0.017045,0.053977,negative,0.071023,Not Fixed
68126,229807,P3,UI,,[JFace] ConfigureColumnsDialog does not work c...,logic deal column order backwards . be ; colum...,VERIFIED,FIXED,3.4,2008-05-01 16:21:00+00:00,2008-05-02 15:27:15+00:00,23.100000,short,0.000000,0.062500,negative,0.062500,Fixed
68127,229841,P3,SWT,,Widget is disposed in ControlExample,- run controlexample ; - press set/get api but...,RESOLVED,FIXED,3.4,2008-05-01 18:34:00+00:00,2008-05-15 15:02:19+00:00,332.466667,short,0.076923,0.057692,positive,0.134615,Fixed
68128,229847,P3,Resources,,An internal error occurred during: Initializin...,get one restart self-hosting workspace . seem ...,RESOLVED,WORKSFORME,3.4,2008-05-01 18:52:00+00:00,2010-10-27 10:47:15+00:00,21807.916667,long,0.050000,0.100000,negative,0.150000,Not Fixed
68129,229849,P3,SWT,,Up or down arrows in Navigator cause horizonta...,youre scroll right navigator project explorer ...,RESOLVED,WORKSFORME,3.3.2,2008-05-01 18:53:00+00:00,2008-05-05 15:47:23+00:00,92.900000,short,0.072368,0.000000,positive,0.072368,Not Fixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85150,424672,P3,UI,,Menu Bar does not display,click one button menu bar ( ie : file ; edit ;...,CLOSED,NDUPLICATE,4.3,2013-12-26 17:32:00+00:00,2014-01-27 14:20:08+00:00,764.800000,short,0.000000,0.000000,negative,0.000000,Not Fixed
85152,424714,P3,UI,,[GTK/Linux] Blank Windows with GTK3,start test 4.4 notice editor windows ; package...,CLOSED,NDUPLICATE,4.4,2013-12-28 13:57:00+00:00,2014-01-04 18:54:33+00:00,172.950000,short,0.015625,0.057292,negative,0.072917,Not Fixed
85153,424722,P3,Website,,software,,RESOLVED,INVALID,4.3.1,2013-12-29 08:57:00+00:00,2014-01-07 13:47:24+00:00,220.833333,short,0.000000,0.000000,negative,0.000000,Not Fixed
85154,424764,P3,SWT,,Crash (MacOS) - getIvar,process : eclipse [ 42412 ] ; path : /applicat...,CLOSED,NDUPLICATE,4.4,2013-12-30 21:43:00+00:00,2013-12-31 09:35:42+00:00,11.866667,short,0.066919,0.049747,positive,0.116667,Not Fixed


In [13]:
MLPtestset = Destiny(MLPtestset)

In [32]:
ModelTest(MLPtestset, ['Emotion'], 'TimeLabel', model_path= 'MLPmodelTimeLabelEmotion.joblib', vectorizer_path='vectorizer2.joblib')
ModelTest(MLPtestset, ['Emotion', 'Priority'], 'TimeLabel', model_path= 'MLPmodelTimeLabelPriorityEmotion.joblib', vectorizer_path='vectorizer3.joblib')
ModelTest(MLPtestset, ['Emotion', 'Priority', 'Emotionality'], 'TimeLabel', model_path= 'MLPmodelTimeLabelEmotionPriorityEmotionality.joblib', vectorizer_path='vectorizer4.joblib')
ModelTest(MLPtestset, 'Priority', 'TimeLabel', model_path= 'MLPmodelTimeLabelPriority.joblib', vectorizer_path='vectorizer5.joblib')
ModelTest(MLPtestset, ['Emotionality'], 'TimeLabel', model_path= 'MLPmodelTimeLabelEmotionality.joblib', vectorizer_path='vectorizer6.joblib')
ModelTest(MLPtestset, ['Emotionality', 'Priority'], 'TimeLabel', model_path= 'MLPmodelTimeLabelPriorityEmotionality.joblib', vectorizer_path='vectorizer13.joblib')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.76
F1 Score: 0.65
Precision: 0.5744522102290915
Recall: 0.76


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        long       0.00      0.00      0.00      4123
       short       0.76      1.00      0.86     12909

    accuracy                           0.76     17032
   macro avg       0.38      0.50      0.43     17032
weighted avg       0.57      0.76      0.65     17032

Accuracy: 0.76
F1 Score: 0.66
Precision: 0.7373152526811652
Recall: 0.76
              precision    recall  f1-score   support

        long       0.67      0.01      0.02      4123
       short       0.76      1.00      0.86     12909

    accuracy                           0.76     17032
   macro avg       0.71      0.51      0.44     17032
weighted avg       0.74      0.76      0.66     17032

Accuracy: 0.76
F1 Score: 0.66
Precision: 0.7174126505514131
Recall: 0.76
              precision    recall  f1-score   support

        long       0.59      0.01      0.02      4123
       short       0.76      1.00      0.86     12909

    accuracy                        

In [26]:
ModelTest(MLPtestset, ['Emotion'], 'Destiny', model_path= 'MLPmodelResolutionEmotion.joblib', vectorizer_path='vectorizer7.joblib')
ModelTest(MLPtestset, ['Emotion','Priority'], 'Destiny', model_path= 'MLPmodelResolutionEmotionPriority.joblib', vectorizer_path='vectorizer8.joblib')
ModelTest(MLPtestset, ['Priority'], 'Destiny', model_path= 'MLPmodelResolutionPriority.joblib', vectorizer_path='vectorizer9.joblib')
ModelTest(MLPtestset, ['Priority', 'Emotionality','Emotion'], 'Destiny', model_path= 'MLPmodelResolutionEmotionPriorityEmotionality.joblib', vectorizer_path='vectorizer10.joblib')
ModelTest(MLPtestset, ['Emotionality'], 'Destiny', model_path= 'MLPmodelResolutionEmotionality.joblib', vectorizer_path='vectorizer11.joblib')
ModelTest(MLPtestset, ['Priority','Emotionality','Emotion', 'Title'], 'Destiny', model_path= 'MLPmodelResolutionEmotionPriorityEmotionalityTitle.joblib', vectorizer_path='vectorizer12.joblib')


NameError: name 'Counter' is not defined

In [17]:
ModelTest(MLPtestset, ['Emotion'], 'TimeLabel', model_path= 'SVMmodelTimeLabelEmotion.joblib', vectorizer_path='SVMvectorizer1.joblib')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.76
F1 Score: 0.65
Precision: 0.5744522102290915
              precision    recall  f1-score   support

        long       0.00      0.00      0.00      4123
       short       0.76      1.00      0.86     12909

    accuracy                           0.76     17032
   macro avg       0.38      0.50      0.43     17032
weighted avg       0.57      0.76      0.65     17032



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
ModelTest(MLPtestset, ['Priority', 'Emotionality','Emotion'], 'Destiny', model_path= 'NODUPMLPmodelTimeLabelPriorityEmotionalityEmotion.joblib', vectorizer_path='NODUPvectorizer13.joblib')

Accuracy: 0.70
F1 Score: 0.61
Precision: 0.6078909262768835
Recall: 0.70
              precision    recall  f1-score   support

       Fixed       0.72      0.97      0.82      9786
   Not Fixed       0.33      0.04      0.08      3874

    accuracy                           0.70     13660
   macro avg       0.52      0.50      0.45     13660
weighted avg       0.61      0.70      0.61     13660



In [38]:
MLPtestset

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,TimeLabel,Pos_Score,Neg_Score,Emotion,Emotionality,Destiny
68124,229802,P3,Update (deprecated - use RT>Equinox>p2),,Cant disable a feature,build id : 3.3.2 ; ; step reproduce : ; im try...,RESOLVED,WONTFIX,3.3.2,2008-05-01 15:57:00+00:00,2012-07-24 14:22:18+00:00,37078.416667,long,0.017045,0.053977,negative,0.071023,Not Fixed
68126,229807,P3,UI,,[JFace] ConfigureColumnsDialog does not work c...,logic deal column order backwards . be ; colum...,VERIFIED,FIXED,3.4,2008-05-01 16:21:00+00:00,2008-05-02 15:27:15+00:00,23.100000,short,0.000000,0.062500,negative,0.062500,Fixed
68127,229841,P3,SWT,,Widget is disposed in ControlExample,- run controlexample ; - press set/get api but...,RESOLVED,FIXED,3.4,2008-05-01 18:34:00+00:00,2008-05-15 15:02:19+00:00,332.466667,short,0.076923,0.057692,positive,0.134615,Fixed
68128,229847,P3,Resources,,An internal error occurred during: Initializin...,get one restart self-hosting workspace . seem ...,RESOLVED,WORKSFORME,3.4,2008-05-01 18:52:00+00:00,2010-10-27 10:47:15+00:00,21807.916667,long,0.050000,0.100000,negative,0.150000,Not Fixed
68129,229849,P3,SWT,,Up or down arrows in Navigator cause horizonta...,youre scroll right navigator project explorer ...,RESOLVED,WORKSFORME,3.3.2,2008-05-01 18:53:00+00:00,2008-05-05 15:47:23+00:00,92.900000,short,0.072368,0.000000,positive,0.072368,Not Fixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85150,424672,P3,UI,,Menu Bar does not display,click one button menu bar ( ie : file ; edit ;...,CLOSED,NDUPLICATE,4.3,2013-12-26 17:32:00+00:00,2014-01-27 14:20:08+00:00,764.800000,short,0.000000,0.000000,negative,0.000000,Not Fixed
85152,424714,P3,UI,,[GTK/Linux] Blank Windows with GTK3,start test 4.4 notice editor windows ; package...,CLOSED,NDUPLICATE,4.4,2013-12-28 13:57:00+00:00,2014-01-04 18:54:33+00:00,172.950000,short,0.015625,0.057292,negative,0.072917,Not Fixed
85153,424722,P3,Website,,software,,RESOLVED,INVALID,4.3.1,2013-12-29 08:57:00+00:00,2014-01-07 13:47:24+00:00,220.833333,short,0.000000,0.000000,negative,0.000000,Not Fixed
85154,424764,P3,SWT,,Crash (MacOS) - getIvar,process : eclipse [ 42412 ] ; path : /applicat...,CLOSED,NDUPLICATE,4.4,2013-12-30 21:43:00+00:00,2013-12-31 09:35:42+00:00,11.866667,short,0.066919,0.049747,positive,0.116667,Not Fixed
