In [1]:
%%capture
import pandas as pd
import zipfile
!pip install torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
import torch

df = pd.read_csv(zipfile.ZipFile('sentiAnalysis.zip').open('sentiAnalysis.csv'))

In [22]:

def MLPmodel(df, input_columns, target_column, test_size=0.25, max_features=1000, hidden_layer_sizes=(64, 32), activation='relu', max_iter=100, random_state=42):
    """
    Train and evaluate a neural network model on the provided DataFrame.
    
    Parameters:
    - df: DataFrame containing the data.
    - input_columns: List of column names to be used as input features.
    - target_column: Name of the column to be used as the target variable.
    - test_size: Proportion of the dataset to include in the test split.
    - max_features: Maximum number of features for TF-IDF vectorization.
    - hidden_layer_sizes: The ith element represents the number of neurons in the ith hidden layer.
    - activation: Activation function for the hidden layer.
    - max_iter: Maximum number of iterations for training the neural network.
    - random_state: Seed used by the random number generator.
    
    Returns:
    - accuracy: Test accuracy of the trained model.
    """
    # Filter rows if target column is 'Resolution'
    if target_column == 'Resolution':
        df = df[df[target_column].isin(['INVALID', 'WORKSFORME', 'WONTFIX', 'FIXED'])]

    # Fill NaN values with a space
    df.fillna(' ', inplace=True)
    
    # Concatenate input columns to form the feature set
    X = df[input_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    y = df[target_column]  # Target variable

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Vectorize the text data
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=random_state)
    model.fit(X_train_vec, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_vec)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)

    return accuracy

In [23]:
MLPmodel(df, input_columns=['Component', 'Title', 'Description','emotion','Emotionality','Priority'], target_column='Resolution')

  df.fillna(' ', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(' ', inplace=True)


Test Accuracy: 0.4957866197699128


0.4957866197699128

In [21]:
def SVMmodel(df, feature_cols, target_col, test_size=0.2, random_state=42):
    # Fill missing values in feature columns
    df[feature_cols] = df[feature_cols].fillna(' ')    
    if target_col == 'Resolution':
        df = df[df[target_col].isin(['INVALID', 'WORKSFORME', 'WONTFIX', 'FIXED'])]
    

    # Combine feature columns into a single column for vectorization
    df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Encode the combined text column
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['combined'])

    # Encode the target column
    y = df[target_col]

    # Ensure X and y have the same length
    assert X.shape[0] == len(y), "Features and labels have inconsistent lengths."

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize and train the SVM model
    model = SVC()
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    
    return accuracy, report, model



In [24]:
SVMmodel(df, feature_cols=['Component', 'Title', 'emotion', 'Priority'], target_col='Resolution')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined'] = df[feature_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


Accuracy: 0.6116504854368932
Classification Report:
              precision    recall  f1-score   support

       FIXED       0.62      0.98      0.75      6485
     INVALID       0.67      0.06      0.11      1195
     WONTFIX       0.52      0.14      0.23      1766
  WORKSFORME       0.61      0.01      0.02      1472

    accuracy                           0.61     10918
   macro avg       0.60      0.30      0.28     10918
weighted avg       0.61      0.61      0.50     10918



(0.6116504854368932,
 '              precision    recall  f1-score   support\n\n       FIXED       0.62      0.98      0.75      6485\n     INVALID       0.67      0.06      0.11      1195\n     WONTFIX       0.52      0.14      0.23      1766\n  WORKSFORME       0.61      0.01      0.02      1472\n\n    accuracy                           0.61     10918\n   macro avg       0.60      0.30      0.28     10918\nweighted avg       0.61      0.61      0.50     10918\n',
 SVC())

In [1]:
df

NameError: name 'df' is not defined