In [1]:
# Mount Google Drive into the Colab runtime so the project files are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder on the mounted drive.
%cd /content/drive/MyDrive/NLP_Project

Mounted at /content/drive
/content/drive/MyDrive/NLP_Project


In [4]:
  # Absolute locations of the TRUE/FALSE-filtered LIAR dataset splits on the
  # mounted drive. NOTE(review): main() assigns its own local path variables,
  # so it does not read these module-level names.
  train_csv_path = '/content/drive/MyDrive/NLP_Project/liar_dataset/train_filtered_True_False.csv'
  valid_csv_path = '/content/drive/MyDrive/NLP_Project/liar_dataset/valid_filtered_True_False.csv'
  test_csv_path = '/content/drive/MyDrive/NLP_Project/liar_dataset/test_filtered_True_False.csv'

In [6]:
!pip install pybloom_live
import csv
from pybloom_live import BloomFilter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def load_dataset(file_path):
    """
    Load labels and text from a CSV file.

    The file needs a header row with 'Label' and 'Text' columns (a missing
    column raises KeyError). Labels are compared case-insensitively after
    trimming surrounding whitespace: 'TRUE' becomes 1 and 'FALSE' becomes 0.
    Rows whose label is anything else, or is missing because the row is
    short, are reported on stdout and skipped.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.

    Returns:
        A (texts, labels) tuple of two parallel lists: the 'Text' value of
        every kept row and its binary label.
    """
    labels, texts = [], []
    unique_labels = set()  # Distinct normalized labels seen, for the summary print
    # newline='' hands raw line endings to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being translated.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # DictReader fills fields missing from a short row with None;
            # treat that like any other unexpected label instead of crashing.
            raw_label = row['Label']
            label_str = (raw_label or '').strip().upper()
            if label_str not in ('TRUE', 'FALSE'):
                print(f"Warning: Unexpected label value '{label_str}'. Skipping this row.")
                continue

            unique_labels.add(label_str)
            labels.append(1 if label_str == 'TRUE' else 0)
            texts.append(row['Text'])

    # Print unique label values
    print("Unique Label values:", unique_labels)

    return texts, labels

def build_bloom_filter(training_texts, capacity, error_rate):
    """
    Create a BloomFilter and insert every training text into it.

    Args:
        training_texts: Iterable of items to add to the filter.
        capacity: First argument passed to the BloomFilter constructor.
        error_rate: Second argument passed to the BloomFilter constructor.

    Returns:
        The populated BloomFilter instance.
    """
    membership = BloomFilter(capacity, error_rate)
    insert = membership.add
    for sample in training_texts:
        insert(sample)
    return membership

def apply_bloom_filter(bloom_filter, dataset_texts):
    """
    Turn Bloom-filter membership into binary predictions.

    Args:
        bloom_filter: Any container supporting the ``in`` operator.
        dataset_texts: Iterable of texts to look up.

    Returns:
        A list with one integer per text: 1 when the text is reported as
        present in ``bloom_filter``, 0 otherwise.
    """
    predictions = []
    for sample in dataset_texts:
        # `in` always yields a bool, so int() gives exactly 1 or 0.
        predictions.append(int(sample in bloom_filter))
    return predictions

def main():
    """
    Train and evaluate an exact-match Bloom-filter baseline on the LIAR data.

    Loads the training CSV, holds out 20% of it for validation, inserts the
    TRUE-labelled training statements into a Bloom filter, and prints the
    accuracy obtained on the held-out split and on the test CSV when
    "text is in the filter" is read as a prediction of TRUE (1).
    """
    # Paths to the CSV files for train and test datasets
    train_csv_path = '/content/drive/MyDrive/NLP_Project/liar_dataset/train_filtered_True_False.csv'
    test_csv_path = '/content/drive/MyDrive/NLP_Project/liar_dataset/test_filtered_True_False.csv'

    # Load the entire training dataset
    train_texts, train_labels = load_dataset(train_csv_path)

    # Split the training dataset into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_texts, train_labels, test_size=0.2, random_state=42
    )

    # Only TRUE statements go into the filter. Adding every training text
    # regardless of label would make membership mean "seen during training",
    # which carries no information about the label being predicted.
    positive_texts = [text for text, label in zip(X_train, y_train) if label == 1]

    # Choose appropriate parameters for Bloom filter. The capacity must cover
    # the number of inserted items; pybloom_live is expected to raise
    # IndexError once a filter is over capacity (verify against the installed
    # version), so never go below the item count.
    capacity = max(1000, len(positive_texts))
    error_rate = 0.001

    # Build Bloom filter on the TRUE-labelled part of the training set
    bloom_filter = build_bloom_filter(positive_texts, capacity, error_rate)

    # Apply Bloom filter to validation set
    valid_predictions = apply_bloom_filter(bloom_filter, X_valid)

    # Evaluate accuracy on the validation set
    accuracy_on_valid = accuracy_score(y_valid, valid_predictions)
    print(f"Accuracy on validation set: {accuracy_on_valid * 100:.2f}%")

    # Apply Bloom filter to test set
    test_texts, test_labels = load_dataset(test_csv_path)
    test_predictions = apply_bloom_filter(bloom_filter, test_texts)

    # Evaluate accuracy on the test set
    accuracy_on_test = accuracy_score(test_labels, test_predictions)
    print(f"Accuracy on test set: {accuracy_on_test * 100:.2f}%")

# Entry point: run the experiment when this code executes with __name__ set
# to "__main__" (e.g. as a script).
if __name__ == "__main__":
    main()


Unique Label values: {'TRUE', 'FALSE'}
Accuracy on validation set: 45.71%
Unique Label values: {'TRUE', 'FALSE'}
Accuracy on test set: 60.94%
