In [9]:
%%capture
import pandas as pd
import zipfile
!pip install torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

df = pd.read_csv(zipfile.ZipFile('sentiAnalysis.zip').open('sentiAnalysis.csv'))

In [10]:
def categorize_label(label):
    if 'FIXED' in label.upper():  # Convert to uppercase and check for 'FIXED'
        return 'FIXED'
    else:
        return 'NOTFIXED'

df['eventual'] = df['Resolution'].apply(categorize_label)

In [11]:
df

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Duration_hours,label,Pos_Score,Neg_Score,emotion,Emotionality,eventual
0,1,P3,Team,,Usability issue with external editors (1GE6IRL),- setup project contain * .gif resource ; - re...,CLOSED,FIXED,2.0,2001-10-11 01:34:00+00:00,2012-02-09 20:57:47+00:00,90571.383333,long,0.051768,0.065657,negative,0.117424,FIXED
1,2,P5,Team,,Opening repository resources doesnt honor type...,open repository resource always open default t...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-05-07 14:33:56+00:00,5004.983333,short,0.020833,0.056818,negative,0.077652,FIXED
2,3,P5,Team,,Sync does not indicate deletion (1GIEN83),km ( 10/2/2001 5:55:18 pm ) ; pr deletion indi...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2010-05-07 14:28:53+00:00,75132.900000,long,0.086364,0.036364,positive,0.122727,FIXED
3,4,P5,Team,,need better error message if catching up over ...,- become synchronize project repository ; - us...,RESOLVED,FIXED,2.0,2001-10-11 01:34:00+00:00,2002-03-01 21:27:31+00:00,3403.883333,short,0.049342,0.092105,negative,0.141447,FIXED
4,5,P3,Team,,ISharingManager sharing API inconsistent (1GAU...,getting/setting manage state resource ; method...,RESOLVED,WONTFIX,2.0,2001-10-11 01:34:00+00:00,2008-08-15 12:04:36+00:00,60010.500000,long,0.085227,0.005682,positive,0.090909,NOTFIXED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68119,229777,P3,UI,,[Viewers] Wrong argument in the first statemen...,build id : 3.3 ( i sure mean build id ; 3.3 ec...,RESOLVED,FIXED,3.3,2008-05-01 13:47:00+00:00,2008-05-10 14:06:07+00:00,216.316667,short,0.064024,0.027439,positive,0.091463,FIXED
68120,229779,P3,UI,,NPE in performance tests,several npes within ui session test prevent ru...,VERIFIED,FIXED,3.4,2008-05-01 13:52:00+00:00,2008-05-20 14:12:14+00:00,456.333333,short,0.033333,0.033333,negative,0.066667,FIXED
68121,229782,P3,UI,,Performance tests for ICU Collator,i20080501-0100 ; ; use collator ( see dependan...,VERIFIED,FIXED,3.4,2008-05-01 14:05:00+00:00,2009-06-01 18:25:12+00:00,9508.333333,long,0.025000,0.000000,positive,0.025000,FIXED
68122,229789,P3,UI,,[Examples] examples plugins create duplicate m...,create attachment 98318 ; screenshot ; ; i2008...,VERIFIED,FIXED,3.4,2008-05-01 15:02:00+00:00,2008-05-31 01:57:57+00:00,706.916667,short,0.092105,0.013158,positive,0.105263,FIXED


In [12]:
def SVMmodel(df, feature_cols, target_col, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the SVM model
    model = SVC()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    
    return accuracy, report


In [13]:
def MLPmodel(df, input_columns, target_column, test_size=0.25, max_features=1000, hidden_layer_sizes=(64, 32), activation='relu', max_iter=100, random_state=42):
    """
    Train and evaluate a neural network model on the provided DataFrame.
    
    Parameters:
    - df: DataFrame containing the data.
    - input_columns: List of column names to be used as input features.
    - target_column: Name of the column to be used as the target variable.
    - test_size: Proportion of the dataset to include in the test split.
    - max_features: Maximum number of features for TF-IDF vectorization.
    - hidden_layer_sizes: The ith element represents the number of neurons in the ith hidden layer.
    - activation: Activation function for the hidden layer.
    - max_iter: Maximum number of iterations for training the neural network.
    - random_state: Seed used by the random number generator.
    
    Returns:
    - accuracy: Test accuracy of the trained model.
    """
    # Fill NaN values with a space
    
    df.fillna(' ', inplace=True)
    
    # Concatenate input columns to form the feature set
    X = df[input_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    y = df[target_column]  # Target variable

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Vectorize the text data
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=random_state)
    model.fit(X_train_vec, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_vec)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)


In [20]:
##Creating a function to train a naive bayes
def NBmodel(df, feature_cols, target_col, test_size=0.2, random_state=42):


    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the Naive Bayes model
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    
    return accuracy, report

In [14]:
import torch
if torch.cuda.is_available():
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(6))
else:
    print("CUDA is not available.")
torch.cuda.set_device(torch.device("cuda:6"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NameError: name '_C' is not defined

In [24]:
SVMmodel(df, feature_cols=['Component', 'Title', 'Description','emotion','Emotionality', 'label','Duration_hours', 'Priority'], target_col='eventual')

Accuracy: 0.6548256880733945
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.63      0.69      0.66      6573
    NOTFIXED       0.68      0.62      0.65      7052

    accuracy                           0.65     13625
   macro avg       0.66      0.66      0.65     13625
weighted avg       0.66      0.65      0.65     13625



(0.6548256880733945,
 '              precision    recall  f1-score   support\n\n       FIXED       0.63      0.69      0.66      6573\n    NOTFIXED       0.68      0.62      0.65      7052\n\n    accuracy                           0.65     13625\n   macro avg       0.66      0.66      0.65     13625\nweighted avg       0.66      0.65      0.65     13625\n')

In [25]:
MLPmodel(df, input_columns=['Component', 'Title', 'Description','emotion','Emotionality', 'label','Duration_hours', 'Priority'], target_column='eventual')

Test Accuracy: 0.6202806646703071


In [28]:
NBmodel(df, feature_cols = ['Component', 'Title', 'Description','emotion','Emotionality', 'label','Duration_hours','Priority'], target_col = 'eventual')

Accuracy: 0.5556697247706422
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.52      0.93      0.67      6573
    NOTFIXED       0.76      0.20      0.32      7052

    accuracy                           0.56     13625
   macro avg       0.64      0.57      0.50     13625
weighted avg       0.65      0.56      0.49     13625



(0.5556697247706422,
 '              precision    recall  f1-score   support\n\n       FIXED       0.52      0.93      0.67      6573\n    NOTFIXED       0.76      0.20      0.32      7052\n\n    accuracy                           0.56     13625\n   macro avg       0.64      0.57      0.50     13625\nweighted avg       0.65      0.56      0.49     13625\n')

In [27]:
MLPmodel(df, input_columns=['Component', 'Title', 'Description','emotion','Emotionality','Priority'], target_column='label')

Test Accuracy: 0.6900945334977394


In [29]:
MLPmodel(df, input_columns=['emotion','Emotionality','Priority'], target_column='label')

Test Accuracy: 0.755328518583759




In [30]:
MLPmodel(df, input_columns=['Emotionality','Priority'], target_column='label')

Test Accuracy: 0.7579707592038049


In [31]:
MLPmodel(df, input_columns=['emotion','Priority'], target_column='label')

Test Accuracy: 0.7644882860665845


In [32]:
MLPmodel(df, input_columns=['emotion','Emotionality', 'label','Duration_hours', 'Priority'], target_column='eventual')

Test Accuracy: 0.5748341260055193




In [33]:
MLPmodel(df, input_columns=['emotion','Emotionality', 'label', 'Priority'], target_column='eventual')

Test Accuracy: 0.5777112324584581




In [44]:
MLPmodel(df, input_columns=['emotion','Emotionality','Duration_hours', 'Priority'], target_column='eventual')

Test Accuracy: 0.5666138218542658




In [36]:
MLPmodel(df, input_columns=['emotion','label','Duration_hours', 'Priority'], target_column='eventual')

Test Accuracy: 0.6007280840819682




In [37]:
MLPmodel(df, input_columns=['Emotionality', 'label', 'Priority'], target_column='eventual')

Test Accuracy: 0.5734836474663848




In [40]:
MLPmodel(df, input_columns=['emotion','label', 'Priority'], target_column='eventual')

Test Accuracy: 0.57230931830192


In [45]:
MLPmodel(df, input_columns=['Emotionality','Duration_hours', 'Priority'], target_column='eventual')

Test Accuracy: 0.5791791439140391




In [46]:
MLPmodel(df, input_columns=['emotion','label','Duration_hours',], target_column='eventual')

Test Accuracy: 0.5885150607715343




In [47]:
MLPmodel(df, input_columns=['emotion','label'], target_column='eventual')

Test Accuracy: 0.5620926545710763


In [48]:
MLPmodel(df, input_columns=['emotion','Duration_hours',], target_column='eventual')

Test Accuracy: 0.5834067289061123




In [52]:
MLPmodel(df, input_columns=['Component', 'Title', 'Description','emotion','Emotionality','Priority', 'label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.6219834419587811


In [54]:
MLPmodel(df, input_columns=['Title', 'Description','emotion','Emotionality','Priority', 'label','Duration_hours'], target_column= 'eventual')

Test Accuracy: 0.6100640009394633


In [55]:
MLPmodel(df, input_columns=['Component','Description','emotion','Emotionality','Priority', 'label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.6112383301039281




In [56]:
MLPmodel(df, input_columns=['Component', 'emotion','Emotionality','Priority', 'label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.5797663084962715




In [58]:
MLPmodel(df, input_columns=['Description','label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.5927426457636076




In [59]:
MLPmodel(df, input_columns=['Component','label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.6014913980388703




In [60]:
MLPmodel(df, input_columns=['Title','label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.5817626680758616




In [61]:
MLPmodel(df, input_columns=['Priority','label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.6085373730256591




In [62]:
MLPmodel(df, input_columns=['Priority','Title', 'label','Duration_hours'], target_column='eventual')

Test Accuracy: 0.5983207092948153


In [63]:
MLPmodel(df, input_columns=['Priority','Title', 'emotion'], target_column='eventual')

Test Accuracy: 0.5828195643238799




In [64]:
MLPmodel(df, input_columns=['Priority','Title', 'emotion', 'Emotionality'], target_column='eventual')

Test Accuracy: 0.5746579766308496


