In [80]:
%%capture
!pip install pandas
!pip install scipy
!pip install matplotlib
!pip install seaborn
!pip install tensorflow
import zipfile
import os
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## imports for preprocessing
!pip install -U scikit-learn
!pip install nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('all')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#imports for sentiment analysis
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

#imports for model prediction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score
import warnings
from pandas.errors import SettingWithCopyWarning

In [70]:
## Load eclipse_platform.csv from its zip archive into `fulldata`
# NOTE(review): both paths are absolute and user-specific; point them at a
# configurable data directory before sharing this notebook.
uploaded_zip = '/home/spope/EclipsePlatform/eclipse_platform.zip'
extract_dir = '/home/spope/EclipsePlatform/extracted'

# exist_ok=True replaces the exists()/makedirs() pair: one call, no window
# between the check and the creation.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(uploaded_zip, 'r') as zip_ref:
    # The archive is re-extracted on every run, overwriting earlier output.
    zip_ref.extractall(extract_dir)

csv_file = os.path.join(extract_dir, 'eclipse_platform.csv')
fulldata = pd.read_csv(csv_file)

In [62]:
# Split the data into training and test sets
def SplitData(dataset):
    """Split ``dataset`` positionally into an 80% / 20% train / test pair.

    No shuffling is done: the first ``int(0.8 * len(dataset))`` items form the
    training set and the remainder forms the test set.

    Parameters
    ----------
    dataset : pandas.DataFrame, pandas.Series or any sliceable sequence
        Data to split. It is not modified.

    Returns
    -------
    tuple
        ``(trainset, testset)``. For pandas objects both parts are independent
        copies; for other sequences they are plain slices.
    """
    train_size = int(0.8 * len(dataset))
    trainset, testset = dataset[:train_size], dataset[train_size:]
    if isinstance(dataset, (pd.DataFrame, pd.Series)):
        # A bare row slice of a pandas object can still share data with its
        # parent, so the in-place column edits done later in the pipeline
        # could leak into the source frame or raise SettingWithCopyWarning.
        trainset, testset = trainset.copy(), testset.copy()
    return trainset, testset

In [63]:
class TimeSet:
    """Helpers that add resolution-time features to a DataFrame in place.

    The methods take no instance state; they are static so they can be called
    either as ``TimeSet.method(df)`` or on an instance.
    """

    @staticmethod
    def CreateDuration(dataset):
        """Add a ``Duration_hours`` column computed from the two timestamps.

        ``Created_time`` and ``Resolved_time`` are parsed to UTC datetimes and
        written back to ``dataset``. Both are then truncated to the minute and
        subtracted, and the difference is stored in hours.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain ``Created_time`` and ``Resolved_time``. Modified in
            place; nothing is returned.
        """
        dataset['Created_time'] = pd.to_datetime(dataset['Created_time'], utc=True)
        dataset['Resolved_time'] = pd.to_datetime(dataset['Resolved_time'], utc=True)

        # Flooring to the minute gives the same truncation as formatting with
        # '%m/%d/%Y %H:%M' and parsing back, without the string round trip or
        # temporary columns.
        created_minute = dataset['Created_time'].dt.floor('min')
        resolved_minute = dataset['Resolved_time'].dt.floor('min')

        elapsed = resolved_minute - created_minute
        dataset['Duration_hours'] = elapsed.dt.total_seconds() / 3600

    @staticmethod
    def CreateTimeLabel(dataset):
        """Add a ``TimeLabel`` column: 'long' if at/above the mean duration, else 'short'.

        Rows whose ``Duration_hours`` is missing compare as not-greater-or-equal
        and are therefore labelled 'short'.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain ``Duration_hours``. Modified in place; nothing is
            returned.
        """
        mean_duration_hours = dataset['Duration_hours'].mean()
        at_or_above_mean = dataset['Duration_hours'] >= mean_duration_hours
        dataset.loc[:, 'TimeLabel'] = at_or_above_mean.map({True: 'long', False: 'short'})

In [64]:
class Preprocessing:
    """Text clean-up steps for the ``Description`` column.

    Both steps modify the given DataFrame in place and also return it. The
    methods are static: call them on the class or on an instance.
    """

    @staticmethod
    def RemoveStopWords(dataset):
        """Lower-case ``Description`` and drop English stop words.

        Descriptions are split on whitespace, words found in NLTK's English
        stop-word list are removed, and the rest is re-joined with single
        spaces. Non-string entries (e.g. missing values) are left as they are.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain a ``Description`` column.

        Returns
        -------
        pandas.DataFrame
            The same ``dataset`` object.
        """
        stop_words = set(stopwords.words('english'))

        def strip_stop_words(text):
            # Only real strings are rewritten; NaN / None pass through.
            if not isinstance(text, str):
                return text
            return ' '.join(word for word in text.split() if word not in stop_words)

        # One vectorised pass instead of iterrows() + .at, which is slow and
        # addresses rows by label (ambiguous when the index has duplicates).
        dataset['Description'] = dataset['Description'].str.lower().apply(strip_stop_words)
        return dataset

    @staticmethod
    def Lemmitization(dataset):
        """Lemmatize every word of ``Description`` as a verb.

        Each description is tokenized with ``nltk.word_tokenize`` and every
        token is passed through ``WordNetLemmatizer.lemmatize(..., pos='v')``.
        Missing descriptions become the empty string rather than the literal
        text 'nan' / 'None'; other non-string values are converted with ``str``.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain a ``Description`` column.

        Returns
        -------
        pandas.DataFrame
            The same ``dataset`` object (returned for consistency with
            ``RemoveStopWords``).
        """
        lemmatizer = WordNetLemmatizer()

        def lemmatize_sentence(sentence):
            words = nltk.word_tokenize(sentence)
            # pos='v' is passed explicitly so tokens are treated as verbs.
            return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in words)

        def lemmatize_value(value):
            if pd.isna(value):
                return ''
            return lemmatize_sentence(str(value))

        dataset['Description'] = dataset['Description'].apply(lemmatize_value)
        return dataset


In [65]:
class SentimentAnalysis:
    """SentiWordNet-based sentiment features derived from ``Description``.

    The DataFrame-level methods add columns in place and return nothing. All
    methods are static: call them on the class or on an instance.
    """

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Map a Treebank POS tag to the matching WordNet constant.

        Tags starting with J, V, N, R map to ``wn.ADJ``, ``wn.VERB``,
        ``wn.NOUN``, ``wn.ADV`` respectively; anything else gives ``None``.
        """
        by_first_letter = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
        return by_first_letter.get(treebank_tag[:1])

    @staticmethod
    def CalculateSentimentScores(description):
        """Return the mean positive and negative SentiWordNet scores of a text.

        Only tokens tagged as noun, adjective or adverb are considered (verbs
        are skipped), and for each such token only its first SentiWordNet
        synset is used.

        Parameters
        ----------
        description : str
            Text to score.

        Returns
        -------
        tuple
            ``(pos_score, neg_score)``, each averaged over the scored tokens.
            ``(0.0, 0.0)`` when no token could be scored, so callers always
            get a pair instead of ``None``.
        """
        tokens = word_tokenize(description)
        tagged_tokens = pos_tag(tokens)

        pos_score = 0.0
        neg_score = 0.0
        token_count = 0

        for word, tag in tagged_tokens:
            wn_tag = SentimentAnalysis.get_wordnet_pos(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            # Use the first synset for simplicity.
            synset = next(iter(swn.senti_synsets(word, wn_tag)), None)
            if synset is None:
                continue

            pos_score += synset.pos_score()
            neg_score += synset.neg_score()
            token_count += 1

        # Normalize by the number of scored tokens; with none, the zero
        # totals are returned unchanged.
        if token_count > 0:
            pos_score /= token_count
            neg_score /= token_count
        return pos_score, neg_score

    @staticmethod
    def CreatePosNegColumns(dataset):
        """Add ``Pos_Score`` and ``Neg_Score`` columns scored from ``Description``."""
        scores = [
            SentimentAnalysis.CalculateSentimentScores(text)
            for text in dataset['Description']
        ]
        # Plain lists are assigned positionally, avoiding one pd.Series per row.
        dataset['Pos_Score'] = [float(pos) for pos, _ in scores]
        dataset['Neg_Score'] = [float(neg) for _, neg in scores]

    @staticmethod
    def EmotionColumn(dataset):
        """Add a binary ``Emotion`` column from the score difference.

        'positive' when ``Pos_Score - Neg_Score`` is strictly greater than 0,
        otherwise 'negative' (ties and missing scores included).
        """
        polarity = dataset['Pos_Score'] - dataset['Neg_Score']
        dataset['Emotion'] = polarity.apply(lambda x: 'positive' if x > 0 else 'negative')

    @staticmethod
    def EmotionalityColumn(dataset):
        """Add ``Emotionality`` as the sum ``Pos_Score + Neg_Score``."""
        dataset['Emotionality'] = dataset['Pos_Score'] + dataset['Neg_Score']

In [106]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

def ModelTest(df, input_column, target_column, vectorizer_path, model_path):
    """Evaluate a saved model on ``df`` and print accuracy, F1 and precision.

    The input columns are converted to strings and joined with single spaces
    into one text per row, transformed with the saved vectorizer, and passed
    to the saved model's ``predict``. F1 and precision use weighted averaging.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the input and target columns.
    input_column : str or list of str
        One column name, or several to be concatenated per row.
    target_column : str
        Column holding the true labels.
    vectorizer_path : str
        Path of the joblib-saved vectorizer (must provide ``transform``).
    model_path : str
        Path of the joblib-saved model (must provide ``predict``).

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    # A bare column name would select a Series, and Series.apply forwards
    # ``axis=1`` to the lambda (TypeError); always select a DataFrame instead.
    columns = [input_column] if isinstance(input_column, str) else list(input_column)
    X = df[columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    y = df[target_column]  # Target variable

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load artifacts from a trusted source.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    # Vectorize the text data and predict
    X_vec = vectorizer.transform(X)
    predictions = model.predict(X_vec)

    accuracy = accuracy_score(y, predictions)
    # zero_division=0 yields the same value as the default for labels that are
    # never predicted, but without the UndefinedMetricWarning.
    f1 = f1_score(y, predictions, average='weighted', zero_division=0)
    precision = precision_score(y, predictions, average='weighted', zero_division=0)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision}")

In [77]:
# The pipeline below adds columns to a slice of the input frame, which pandas
# reports as SettingWithCopyWarning; the warning is silenced here.
warnings.filterwarnings('ignore', category=SettingWithCopyWarning)

def pipelineTimeLabel(dataset, model, vectorizer_path='vectorizer2.joblib'):
    """Build the test split, derive its features and evaluate a TimeLabel model.

    Steps, all applied to the test part returned by ``SplitData``: duration
    and time label, stop-word removal, lemmatization, sentiment scores,
    ``Emotion`` and ``Emotionality`` columns, then ``ModelTest`` with
    ``Emotion`` as input and ``TimeLabel`` as target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Full data set; only its test split is processed.
    model : str
        Path of the joblib-saved model passed to ``ModelTest``.
    vectorizer_path : str, optional
        Path of the joblib-saved vectorizer. ``ModelTest`` requires it, so
        omitting it raised ``TypeError``.
        NOTE(review): the default is the vectorizer used with
        'MLPmodelTimeLabel.joblib' in the direct call elsewhere in this
        notebook; confirm it matches ``model`` or pass it explicitly.

    Returns
    -------
    pandas.DataFrame
        The processed test split, so later cells can reuse it instead of
        depending on leftover kernel state.
    """
    _, testset = SplitData(dataset)
    TimeSet.CreateDuration(testset)
    TimeSet.CreateTimeLabel(testset)
    Preprocessing.RemoveStopWords(testset)
    Preprocessing.Lemmitization(testset)
    SentimentAnalysis.CreatePosNegColumns(testset)
    SentimentAnalysis.EmotionColumn(testset)
    SentimentAnalysis.EmotionalityColumn(testset)
    ModelTest(testset, 'Emotion', 'TimeLabel',
              vectorizer_path=vectorizer_path, model_path=model)
    return testset

Accuracy: 0.75
F1 Score: 0.65


In [103]:
# NOTE(review): `testset` is not assigned at notebook scope in any cell shown
# here (pipelineTimeLabel only builds it as a local), so this cell appears to
# rely on kernel state from an earlier run; confirm it survives Restart & Run All.
# Keep only rows whose Resolution contains 'FIXED' (missing values excluded by
# na=False), then evaluate the model saved as MLPmodelFixedTime.joblib on them.
testset2 = testset[testset['Resolution'].str.contains('FIXED', na=False)]
ModelTest(testset2, 'Emotion', 'TimeLabel', model_path= 'MLPmodelFixedTime.joblib', vectorizer_path='vectorizer.joblib')

Accuracy: 0.75
F1 Score: 0.65
Precision: 0.567030853061567


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [102]:
# Evaluate MLPmodelTimeLabel.joblib on `testset`: input 'Emotion', target 'TimeLabel'.
# NOTE(review): ModelTest as defined in this notebook applies `axis=1` to the
# selected input; with a single column name that selection is a Series —
# confirm this call still runs against the current definition.
ModelTest(testset, 'Emotion', 'TimeLabel', model_path= 'MLPmodelTimeLabel.joblib', vectorizer_path='vectorizer2.joblib')

Accuracy: 0.76
F1 Score: 0.65
Precision: 0.5744522102290915


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [107]:
# Evaluate MLPmodelTimeLabelPriority.joblib on `testset`: 'Priority' and
# 'Emotion' are joined into one text per row; target is 'TimeLabel'.
ModelTest(testset, ['Priority', 'Emotion'], 'TimeLabel', model_path= 'MLPmodelTimeLabelPriority.joblib', vectorizer_path='vectorizer3.joblib')

Accuracy: 0.76
F1 Score: 0.66
Precision: 0.7373152526811652


In [108]:
# Evaluate MLPmodelTimeLabelPriorityEmotionality.joblib on `testset`:
# 'Priority', 'Emotion' and 'Emotionality' are joined into one text per row;
# target is 'TimeLabel'.
# NOTE(review): the recorded metrics below are identical to the
# Priority + Emotion run — verify that 'Emotionality' actually reaches the model.
ModelTest(testset, ['Priority', 'Emotion','Emotionality'], 'TimeLabel', model_path= 'MLPmodelTimeLabelPriorityEmotionality.joblib', vectorizer_path='vectorizer4.joblib')

Accuracy: 0.76
F1 Score: 0.66
Precision: 0.7373152526811652
