In [28]:
##imports for the machine learning testing
%%capture
import pandas as pd
import zipfile
!pip install torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

df = pd.read_csv(zipfile.ZipFile('sentiAnalysis.zip').open('sentiAnalysis.csv'))

In [30]:
##creating a column for eventual label (FIXED or NOTFIXED)
def categorize_label(label):
    if 'FIXED' in label.upper():  # Convert to uppercase and check for 'FIXED'
        return 'FIXED'
    else:
        return 'NOTFIXED'

df['eventual'] = df['Resolution'].apply(categorize_label)

In [16]:
##Creating a function to train a naive bayes
def naive_bayes(df, feature_cols, target_col, test_size=0.2, random_state=42):

    #if looking for time only look at the columns that turned out to be fixed 
    if 'label' in target_col:
        df = df[df['Resolution'].str.contains('FIXED', na=False)]

    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the Naive Bayes model
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    
    return accuracy, report

Accuracy: 0.5696880733944955
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.76      0.16      0.26      6573
    NOTFIXED       0.55      0.95      0.70      7052

    accuracy                           0.57     13625
   macro avg       0.65      0.56      0.48     13625
weighted avg       0.65      0.57      0.49     13625



In [18]:
##creating a function to train a SVM model 
def train_svm(df, feature_cols, target_col, test_size=0.2, random_state=42):

    #if testing for eventual time to fix then 
    if 'label' in target_col:
        df = df[df['Resolution'].str.contains('FIXED', na=False)]

    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')

    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the SVM model
    model = SVC()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    
    return accuracy, report

trial1 = train_svm(df, feature_cols=['emotion', 'Priority'], target_col='eventual')


Accuracy: 0.5696880733944955
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.76      0.16      0.26      6573
    NOTFIXED       0.55      0.95      0.70      7052

    accuracy                           0.57     13625
   macro avg       0.65      0.56      0.48     13625
weighted avg       0.65      0.57      0.49     13625



In [28]:
trial1 = train_naive_bayes(df, feature_cols = ['emotion', 'Priority'], target_col = 'eventual')

Accuracy: 0.5696880733944955
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.76      0.16      0.26      6573
    NOTFIXED       0.55      0.95      0.70      7052

    accuracy                           0.57     13625
   macro avg       0.65      0.56      0.48     13625
weighted avg       0.65      0.57      0.49     13625



In [27]:
trial2 = train_naive_bayes(df, feature_cols = ['emotion', 'Priority', 'Emotionality'], target_col = 'eventual')

Accuracy: 0.5639633027522936
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.57      0.40      0.47      6573
    NOTFIXED       0.56      0.71      0.63      7052

    accuracy                           0.56     13625
   macro avg       0.56      0.56      0.55     13625
weighted avg       0.56      0.56      0.55     13625



In [29]:
trial2 = train_naive_bayes(df, feature_cols = ['Priority'], target_col = 'eventual')

Accuracy: 0.5696880733944955
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.76      0.16      0.26      6573
    NOTFIXED       0.55      0.95      0.70      7052

    accuracy                           0.57     13625
   macro avg       0.65      0.56      0.48     13625
weighted avg       0.65      0.57      0.49     13625



In [30]:
trial2 = train_naive_bayes(df, feature_cols = ['emotion', 'Emotionality'], target_col = 'eventual')

Accuracy: 0.5069357798165137
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.49      0.49      0.49      6573
    NOTFIXED       0.52      0.52      0.52      7052

    accuracy                           0.51     13625
   macro avg       0.51      0.51      0.51     13625
weighted avg       0.51      0.51      0.51     13625



In [33]:
trial2 = train_naive_bayes(df, feature_cols = ['Description','Priority', 'Emotionality', 'label'], target_col = 'eventual')

Accuracy: 0.5544954128440367
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.52      0.93      0.67      6573
    NOTFIXED       0.76      0.20      0.32      7052

    accuracy                           0.55     13625
   macro avg       0.64      0.57      0.50     13625
weighted avg       0.64      0.55      0.49     13625



In [34]:
trial2 = train_naive_bayes(df, feature_cols = ['Emotionality'], target_col = 'eventual')

Accuracy: 0.5067155963302752
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.49      0.48      0.48      6573
    NOTFIXED       0.52      0.53      0.53      7052

    accuracy                           0.51     13625
   macro avg       0.51      0.51      0.51     13625
weighted avg       0.51      0.51      0.51     13625



In [38]:
trial2 = train_naive_bayes(df, feature_cols = ['emotion', 'Priority', 'Emotionality'], target_col = 'label')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature_cols] = df[feature_cols].fillna(' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


Accuracy: 0.8303708243947288
Classification Report:
              precision    recall  f1-score   support

        long       0.39      0.03      0.06      1087
       short       0.84      0.99      0.91      5439

    accuracy                           0.83      6526
   macro avg       0.61      0.51      0.48      6526
weighted avg       0.76      0.83      0.77      6526



In [58]:
trial1 = train_svm(df, feature_cols=['Priority'], target_col='eventual')

Accuracy: 0.5696880733944955
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.76      0.16      0.26      6573
    NOTFIXED       0.55      0.95      0.70      7052

    accuracy                           0.57     13625
   macro avg       0.65      0.56      0.48     13625
weighted avg       0.65      0.57      0.49     13625



In [7]:
trial1 = train_svm(df, feature_cols=['emotion'], target_col='Priority')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8557064220183487
Classification Report:
              precision    recall  f1-score   support

          P1       0.00      0.00      0.00       427
          P2       0.00      0.00      0.00       935
          P3       0.86      1.00      0.92     11659
          P4       0.00      0.00      0.00       377
          P5       0.00      0.00      0.00       227

    accuracy                           0.86     13625
   macro avg       0.17      0.20      0.18     13625
weighted avg       0.73      0.86      0.79     13625



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
train_svm(df, feature_cols=['Component', 'Title', 'Description','emotion','Emotionality'], target_col='eventual')

Accuracy: 0.6289908256880734
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.60      0.70      0.64      6573
    NOTFIXED       0.67      0.56      0.61      7052

    accuracy                           0.63     13625
   macro avg       0.63      0.63      0.63     13625
weighted avg       0.63      0.63      0.63     13625



(0.6289908256880734,
 '              precision    recall  f1-score   support\n\n       FIXED       0.60      0.70      0.64      6573\n    NOTFIXED       0.67      0.56      0.61      7052\n\n    accuracy                           0.63     13625\n   macro avg       0.63      0.63      0.63     13625\nweighted avg       0.63      0.63      0.63     13625\n')

In [21]:
train_naive_bayes(df, feature_cols=['Component', 'Title', 'Description','emotion','Emotionality'], target_col='eventual')

Accuracy: 0.5547155963302752
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.52      0.93      0.67      6573
    NOTFIXED       0.76      0.21      0.32      7052

    accuracy                           0.55     13625
   macro avg       0.64      0.57      0.50     13625
weighted avg       0.64      0.55      0.49     13625



(0.5547155963302752,
 '              precision    recall  f1-score   support\n\n       FIXED       0.52      0.93      0.67      6573\n    NOTFIXED       0.76      0.21      0.32      7052\n\n    accuracy                           0.55     13625\n   macro avg       0.64      0.57      0.50     13625\nweighted avg       0.64      0.55      0.49     13625\n')

In [1]:
import torch
if torch.cuda.is_available():
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(6))
else:
    print("CUDA is not available.")
torch.cuda.set_device(torch.device("cuda:6"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2.to(device)

CUDA is available. Number of GPUs: 8
CUDA device name: NVIDIA GeForce GTX 1080 Ti


NameError: name 'model2' is not defined