In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
import ast

def tune_hyperparameters(X_train, y_train):
    """Perform hyperparameter tuning using GridSearchCV."""
    # Define the pipeline for preprocessing and model training
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Vectorizer for text data
        ('nb', MultinomialNB())       # Naive Bayes classifier
    ])

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'tfidf__max_features': [2000, 3000, 5000, 10000],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__min_df': [1, 2, 3],
        'tfidf__stop_words': [None, stopwords.words('english')],
        'nb__alpha': [1.0, 0.5, 0.1, 0.01]
    }

    # Perform grid search with stratified cross-validation
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid, cv=stratified_cv, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Return the best parameters and the best model from grid search
    return grid_search.best_params_, grid_search.best_estimator_

def preprocess_data(train_df, test_df):
    """Preprocess training and testing datasets."""
    # Drop missing values
    train_data = train_df.dropna()
    test_data = test_df.dropna()

    # Convert string representations of lists to actual lists
    train_data['hyp_lemmas'] = train_data['hyp_lemmas'].apply(ast.literal_eval)
    train_data['tgt_src_lemmas'] = train_data['tgt_src_lemmas'].apply(ast.literal_eval)
    test_data['hyp_lemmas'] = test_data['hyp_lemmas'].apply(ast.literal_eval)
    test_data['tgt_src_lemmas'] = test_data['tgt_src_lemmas'].apply(ast.literal_eval)

    # Combine `hyp_lemmas` and `res_lemmas` into a single text feature
    train_data['combined_text'] = train_data['hyp_lemmas'].apply(lambda x: " ".join(x)) + " " + train_data['tgt_src_lemmas'].apply(lambda x: " ".join(x))
    test_data['combined_text'] = test_data['hyp_lemmas'].apply(lambda x: " ".join(x)) + " " + test_data['tgt_src_lemmas'].apply(lambda x: " ".join(x))

    # Extract features (text) and labels
    X_train_full = train_data['combined_text']
    y_train_full = train_data['label']

    X_test = test_data['combined_text']
    y_test = test_data['label']

    return X_train_full, y_train_full, X_test, y_test, test_data

def train_and_evaluate(X_train_full, y_train_full, X_test, y_test, test_data):
    """Train and evaluate the model with hyperparameter tuning."""
    # Split training data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
    )

    # Vectorize text data using TF-IDF
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words=None)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    X_test_vec = vectorizer.transform(X_test)

    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

    # Perform hyperparameter tuning to find the best model
    best_params, best_model = tune_hyperparameters(X_train, y_train)

    # Evaluate the best model on the validation set
    y_val_pred = best_model.predict(X_val)
    print("Validation Classification Report:")
    print(classification_report(y_val, y_val_pred))

    # Retrain the best model on the full training data (including validation)
    best_model.fit(X_train_full, y_train_full)

    # Evaluate the retrained model on the test set
    y_test_pred = best_model.predict(X_test)
    print(f"Best Parameters: {best_params}")
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # Add predicted labels to the test data for manual inspection
    test_data['predicted_label'] = y_test_pred
    print("Test data with actual and predicted labels:")
    print(test_data[['combined_text', 'label', 'predicted_label']])

    # Save the test data with predictions to a CSV file
    test_data.to_csv("test_data_with_predictions.csv", index=False)
    print("Predictions saved to 'test_data_with_predictions.csv'.")

# Main process
if __name__ == "__main__":
    train_df = pd.read_csv('data/labeled_data/preprocessed/train_preprocessed.csv')
    test_df = pd.read_csv('data/labeled_data/preprocessed/test_preprocessed.csv')

    # Preprocess data and extract features and labels
    X_train_full, y_train_full, X_test, y_test, test_data = preprocess_data(train_df, test_df)

    # Train the model and evaluate its performance
    train_and_evaluate(X_train_full, y_train_full, X_test, y_test, test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['hyp_lemmas'] = train_data['hyp_lemmas'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['tgt_src_lemmas'] = train_data['tgt_src_lemmas'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['combined_text'] = train_data['hyp

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Validation Classification Report:
                   precision    recall  f1-score   support

    Hallucination       0.74      0.63      0.68      2971
Not Hallucination       0.68      0.79      0.73      3022

         accuracy                           0.71      5993
        macro avg       0.71      0.71      0.70      5993
     weighted avg       0.71      0.71      0.70      5993

Best Parameters: {'nb__alpha': 0.5, 'tfidf__max_features': 10000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
Test Classification Report:
                   precision    recall  f1-score   support

    Hallucination       0.76      0.53      0.62       998
Not Hallucination       0.41      0.66      0.51       502

         accuracy                           0.57      1500
        macro avg       0.58      0.59      0.57      1500
     weighted avg       0.64      0.57      0.58      1500

Test data with act

In [28]:
# Reload the dataset
data = pd.read_csv("test_data_with_predictions.csv")

# Count hallucinations and non-hallucinations in the label and predicted columns
label_counts = data['label'].value_counts()
predicted_counts = data['predicted_label'].value_counts()

# Prepare the results as a DataFrame for clarity
results = pd.DataFrame({
    "Type": ["Hallucinations", "Non-Hallucinations"],
    "Count in Label": [label_counts.get("Hallucination", 0), label_counts.get("Not Hallucination", 0)],
    "Count in Predicted": [predicted_counts.get("Hallucination", 0), predicted_counts.get("Not Hallucination", 0)]
})

print(results)

                 Type  Count in Label  Count in Predicted
0      Hallucinations             998                 699
1  Non-Hallucinations             502                 801


In [29]:
# Count hallucinations and non-hallucinations per task type
task_grouped = data.groupby("task")

# Initialize results dictionary
task_results = {
    "Task Type": [],
    "Hallucinations in Label": [],
    "Non-Hallucinations in Label": [],
    "Hallucinations in Predicted": [],
    "Non-Hallucinations in Predicted": []
}

# Iterate through each task type
for task, group in task_grouped:
    task_results["Task Type"].append(task)
    task_results["Hallucinations in Label"].append((group['label'] == "Hallucination").sum())
    task_results["Non-Hallucinations in Label"].append((group['label'] == "Not Hallucination").sum())
    task_results["Hallucinations in Predicted"].append((group['predicted_label'] == "Hallucination").sum())
    task_results["Non-Hallucinations in Predicted"].append((group['predicted_label'] == "Not Hallucination").sum())

# Convert results to a DataFrame
task_results_df = pd.DataFrame(task_results)

print(task_results_df)

  Task Type  Hallucinations in Label  Non-Hallucinations in Label  \
0        DM                      434                          129   
1        MT                      352                          210   
2        PG                      212                          163   

   Hallucinations in Predicted  Non-Hallucinations in Predicted  
0                          551                               12  
1                           86                              476  
2                           62                              313  


In [43]:
# Recalculate comparison metrics
def recalculate_comparison(data, task_column="task", label_column="label", predicted_column="predicted_label"):
    results = {
        "Task Type": [],
        "Correct Hallucinations": [],
        "Incorrect Hallucinations": [],
        "Label Hallucinations": [],
        "Predicted Label Hallucinations": [],
        "Correct Non-Hallucinations": [],
        "Incorrect Non-Hallucinations": [],
        "Label Non-Hallucinations": [],
        "Predicted Label Non-Hallucinations": [],
    }
    
    grouped = data.groupby(task_column)
    for task, group in grouped:
        # Calculate correct and incorrect predictions
        label_hallucinations = (group['label'] == "Hallucination").sum()
        predicted_label_hallucinations = (group['predicted_label'] == "Hallucination").sum()
        
        label_non_hallucinations = (group['label'] == "Not Hallucination").sum()
        predicted_label_non_hallucinations = (group['predicted_label'] == "Not Hallucination").sum()
        
        correct_hallucinations = ((group[label_column] == "Hallucination") & (group[predicted_column] == "Hallucination")).sum()
        correct_non_hallucinations = ((group[label_column] == "Not Hallucination") & (group[predicted_column] == "Not Hallucination")).sum()
        
        incorrect_hallucinations = (label_hallucinations-correct_hallucinations)+(predicted_label_hallucinations-correct_hallucinations)
        incorrect_non_hallucinations = (label_hallucinations-correct_hallucinations)+(predicted_label_hallucinations-correct_hallucinations)
        
        # Append results
        results["Task Type"].append(task)
        
        results["Correct Hallucinations"].append(correct_hallucinations)
        results["Incorrect Hallucinations"].append(incorrect_hallucinations)
        results["Label Hallucinations"].append(label_hallucinations)
        results["Predicted Label Hallucinations"].append(predicted_label_hallucinations)
        
        results["Correct Non-Hallucinations"].append(correct_non_hallucinations)
        results["Incorrect Non-Hallucinations"].append(incorrect_non_hallucinations)
        results["Label Non-Hallucinations"].append(label_non_hallucinations)
        results["Predicted Label Non-Hallucinations"].append(predicted_label_non_hallucinations)
    
    return pd.DataFrame(results)

comparison_results = recalculate_comparison(data)

comparison_results

Unnamed: 0,Task Type,Correct Hallucinations,Incorrect Hallucinations,Label Hallucinations,Predicted Label Hallucinations,Correct Non-Hallucinations,Label Non-Hallucinations,Predicted Label Non-Hallucinations
0,DM,427,131,434,551,5,129,12
1,MT,62,314,352,86,186,210,476
2,PG,39,196,212,62,140,163,313
